]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/mptcp_subr.c
d9a35da3030b24fe44d8d320dee28b5de10eddf1
[apple/xnu.git] / bsd / netinet / mptcp_subr.c
1 /*
2 * Copyright (c) 2012-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/param.h>
30 #include <sys/proc.h>
31 #include <sys/systm.h>
32 #include <sys/kernel.h>
33 #include <sys/mbuf.h>
34 #include <sys/mcache.h>
35 #include <sys/resourcevar.h>
36 #include <sys/socket.h>
37 #include <sys/socketvar.h>
38 #include <sys/syslog.h>
39 #include <sys/domain.h>
40 #include <sys/protosw.h>
41 #include <sys/sysctl.h>
42
43 #include <kern/zalloc.h>
44 #include <kern/locks.h>
45
46 #include <mach/thread_act.h>
47 #include <mach/sdt.h>
48
49 #include <net/if.h>
50 #include <net/if_var.h>
51 #include <netinet/in.h>
52 #include <netinet/in_pcb.h>
53 #include <netinet/in_var.h>
54 #include <netinet/tcp.h>
55 #include <netinet/tcp_fsm.h>
56 #include <netinet/tcp_seq.h>
57 #include <netinet/tcp_var.h>
58 #include <netinet/mptcp_var.h>
59 #include <netinet/mptcp.h>
60 #include <netinet/mptcp_seq.h>
61 #include <netinet/mptcp_timer.h>
62 #include <libkern/crypto/sha1.h>
63 #if INET6
64 #include <netinet6/in6_pcb.h>
65 #include <netinet6/ip6protosw.h>
66 #endif /* INET6 */
67 #include <dev/random/randomdev.h>
68
69 /*
70 * Notes on MPTCP implementation.
71 *
72 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
73 * communication domain. The structure mtcbinfo describes the MPTCP instance
74 * of a Multipath protocol in that domain. It is used to keep track of all
75 * MPTCP PCB instances in the system, and is protected by the global lock
76 * mppi_lock.
77 *
78 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
79 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
80 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
81 * allocated from the same memory block, and each structure has a pointer
82 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
83 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
84 * PCB (mppcb) as well as the MPTCP Session (mptses).
85 *
86 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
87 * in particular, the list of subflows as well as the MPTCP thread.
88 *
89 * A functioning MPTCP Session consists of one or more subflow sockets. Each
90 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
91 * represented by the mptsub structure. Because each subflow requires access
92 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
93 * subflow. This gets decremented prior to the subflow's destruction. The
94 * subflow lock (mpts_lock) is used to protect accesses to the subflow.
95 *
96 * To handle events (read, write, control) from the subflows, an MPTCP thread
97 * is created; currently, there is one thread per MPTCP Session. In order to
98 * prevent the MPTCP socket from being destroyed while being accessed by the
99 * MPTCP thread, we bump up the MPTCP socket's so_usecount for the thread,
100 * which will be decremented prior to the thread's termination. The thread
101 * lock (mpte_thread_lock) is used to synchronize its signalling.
102 *
103 * Lock ordering is defined as follows:
104 *
105 * mtcbinfo (mppi_lock)
106 * mp_so (mpp_lock)
107 * mpts (mpts_lock)
108 * so (inpcb_mtx)
109 * mptcb (mpt_lock)
110 *
111 * It is not a requirement that all of the above locks need to be acquired
112 * in succession, but the correct lock ordering must be followed when more
113 * than one lock needs to be held. The MPTCP thread lock is not constrained
114 * by this arrangement, because none of the other locks is ever acquired
115 * while holding mpte_thread_lock; therefore it may be taken at any moment
116 * to signal the thread.
117 *
118 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
119 * work is done by the MPTCP garbage collector which is invoked on demand by
120 * the PF_MULTIPATH garbage collector. This process will take place once all
121 * of the subflows have been destroyed, and the MPTCP thread has been
122 * instructed to self-terminate.
123 */
124
125 static void mptcp_sesdestroy(struct mptses *);
126 static void mptcp_thread_signal_locked(struct mptses *);
127 static void mptcp_thread_terminate_signal(struct mptses *);
128 static void mptcp_thread_dowork(struct mptses *);
129 static void mptcp_thread_func(void *, wait_result_t);
130 static void mptcp_thread_destroy(struct mptses *);
131 static void mptcp_key_pool_init(void);
132 static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
133 static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);
134 static void mptcp_conn_properties(struct mptcb *);
135
136 static uint32_t mptcp_gc(struct mppcbinfo *);
137 static int mptcp_subflow_socreate(struct mptses *, struct mptsub *,
138 int, struct proc *, struct socket **);
139 static int mptcp_subflow_soclose(struct mptsub *, struct socket *);
140 static int mptcp_subflow_soconnectx(struct mptses *, struct mptsub *);
141 static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
142 struct uio *, struct mbuf **, struct mbuf **, int *);
143 static void mptcp_subflow_rupcall(struct socket *, void *, int);
144 static void mptcp_subflow_input(struct mptses *, struct mptsub *);
145 static void mptcp_subflow_wupcall(struct socket *, void *, int);
146 static void mptcp_subflow_eupcall(struct socket *, void *, uint32_t);
147 static void mptcp_update_last_owner(struct mptsub *, struct socket *);
148 static void mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts);
149 static void mptcp_get_rtt_measurement(struct mptsub *, struct mptses *);
150 static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);
151
/*
 * Possible return values for subflow event handlers.  Success values
 * must be greater than or equal to MPTS_EVRET_OK.  Anything smaller
 * indicates an error or an action which requires immediate attention;
 * such a value prevents the rest of the handlers from processing their
 * respective events until the next round of events processing.
 */
typedef enum {
	/* delete this subflow */
	MPTS_EVRET_DELETE = 1,
	/* OK */
	MPTS_EVRET_OK = 2,
	/* resume pended connects */
	MPTS_EVRET_CONNECT_PENDING = 3,
	/* abort all but preferred */
	MPTS_EVRET_DISCONNECT_FALLBACK = 4,
} ev_ret_t;
165
166 static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
167 static ev_ret_t mptcp_subflow_connreset_ev(struct mptses *, struct mptsub *, uint64_t *);
168 static ev_ret_t mptcp_subflow_cantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
169 static ev_ret_t mptcp_subflow_cantsendmore_ev(struct mptses *, struct mptsub *, uint64_t *);
170 static ev_ret_t mptcp_subflow_timeout_ev(struct mptses *, struct mptsub *, uint64_t *);
171 static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *);
172 static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *);
173 static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *);
174 static ev_ret_t mptcp_subflow_suspend_ev(struct mptses *, struct mptsub *, uint64_t *);
175 static ev_ret_t mptcp_subflow_resume_ev(struct mptses *, struct mptsub *, uint64_t *);
176 static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *);
177 static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *);
178 static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *);
179 static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *);
180 static ev_ret_t mptcp_fastjoin_ev(struct mptses *, struct mptsub *, uint64_t *);
181 static ev_ret_t mptcp_deleteok_ev(struct mptses *, struct mptsub *, uint64_t *);
182 static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *);
183
184 static const char *mptcp_evret2str(ev_ret_t);
185
186 static mptcp_key_t *mptcp_reserve_key(void);
187 static int mptcp_do_sha1(mptcp_key_t *, char *, int);
188 static void mptcp_init_local_parms(struct mptcb *);
189
190 static unsigned int mptsub_zone_size; /* size of mptsub */
191 static struct zone *mptsub_zone; /* zone for mptsub */
192
193 static unsigned int mptopt_zone_size; /* size of mptopt */
194 static struct zone *mptopt_zone; /* zone for mptopt */
195
196 static unsigned int mpt_subauth_entry_size; /* size of subf auth entry */
197 static struct zone *mpt_subauth_zone; /* zone of subf auth entry */
198
199 struct mppcbinfo mtcbinfo;
200
201 static struct mptcp_keys_pool_head mptcp_keys_pool;
202
203 #define MPTCP_SUBFLOW_WRITELEN (8 * 1024) /* bytes to write each time */
204 #define MPTCP_SUBFLOW_READLEN (8 * 1024) /* bytes to read each time */
205
206 SYSCTL_DECL(_net_inet);
207
208 SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");
209
210 uint32_t mptcp_dbg_area = 0; /* more noise if greater than 1 */
211 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
212 &mptcp_dbg_area, 0, "MPTCP debug area");
213
214 uint32_t mptcp_dbg_level = 0;
215 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
216 &mptcp_dbg_level, 0, "MPTCP debug level");
217
218
219 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
220 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
221
222 /*
223 * Since there is one kernel thread per mptcp socket, imposing an artificial
224 * limit on number of allowed mptcp sockets.
225 */
226 uint32_t mptcp_socket_limit = MPPCB_LIMIT;
227 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, sk_lim, CTLFLAG_RW|CTLFLAG_LOCKED,
228 &mptcp_socket_limit, 0, "MPTCP socket limit");
229
230 /*
231 * SYSCTL to turn on delayed cellular subflow start.
232 */
233 uint32_t mptcp_delayed_subf_start = 0;
234 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, delayed, CTLFLAG_RW|CTLFLAG_LOCKED,
235 &mptcp_delayed_subf_start, 0, "MPTCP Delayed Subflow start");
236
237 /*
238 * sysctl to use network status hints from symptomsd
239 */
240 uint32_t mptcp_use_symptomsd = 1;
241 SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, usesymptoms, CTLFLAG_RW|CTLFLAG_LOCKED,
242 &mptcp_use_symptomsd, 0, "MPTCP Use SymptomsD");
243
244 static struct protosw mptcp_subflow_protosw;
245 static struct pr_usrreqs mptcp_subflow_usrreqs;
246 #if INET6
247 static struct ip6protosw mptcp_subflow_protosw6;
248 static struct pr_usrreqs mptcp_subflow_usrreqs6;
249 #endif /* INET6 */
250
251 typedef struct mptcp_subflow_event_entry {
252 uint64_t sofilt_hint_mask;
253 ev_ret_t (*sofilt_hint_ev_hdlr)(
254 struct mptses *mpte,
255 struct mptsub *mpts,
256 uint64_t *p_mpsofilt_hint);
257 } mptsub_ev_entry_t;
258
259 /*
260 * XXX The order of the event handlers below is really
261 * really important.
262 * SO_FILT_HINT_DELETEOK event has to be handled first,
263 * else we may end up missing on this event.
264 * Please read radar://24043716 for more details.
265 */
266 static mptsub_ev_entry_t mpsub_ev_entry_tbl [] = {
267 {
268 .sofilt_hint_mask = SO_FILT_HINT_DELETEOK,
269 .sofilt_hint_ev_hdlr = mptcp_deleteok_ev,
270 },
271 {
272 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
273 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
274 },
275 {
276 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
277 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
278 },
279 {
280 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
281 .sofilt_hint_ev_hdlr = mptcp_subflow_connreset_ev,
282 },
283 {
284 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
285 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
286 },
287 {
288 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
289 .sofilt_hint_ev_hdlr = mptcp_subflow_cantrcvmore_ev,
290 },
291 { .sofilt_hint_mask = SO_FILT_HINT_CANTSENDMORE,
292 .sofilt_hint_ev_hdlr = mptcp_subflow_cantsendmore_ev,
293 },
294 {
295 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
296 .sofilt_hint_ev_hdlr = mptcp_subflow_timeout_ev,
297 },
298 {
299 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
300 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
301 },
302 {
303 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
304 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
305 },
306 {
307 .sofilt_hint_mask = SO_FILT_HINT_SUSPEND,
308 .sofilt_hint_ev_hdlr = mptcp_subflow_suspend_ev,
309 },
310 {
311 .sofilt_hint_mask = SO_FILT_HINT_RESUME,
312 .sofilt_hint_ev_hdlr = mptcp_subflow_resume_ev,
313 },
314 {
315 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
316 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
317 },
318 {
319 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
320 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
321 },
322 {
323 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
324 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
325 },
326 {
327 .sofilt_hint_mask = SO_FILT_HINT_MPFASTJ,
328 .sofilt_hint_ev_hdlr = mptcp_fastjoin_ev,
329 }
330 };
331
332 /*
333 * Protocol pr_init callback.
334 */
335 void
336 mptcp_init(struct protosw *pp, struct domain *dp)
337 {
338 #pragma unused(dp)
339 static int mptcp_initialized = 0;
340 struct protosw *prp;
341 #if INET6
342 struct ip6protosw *prp6;
343 #endif /* INET6 */
344
345 VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
346
347 /* do this only once */
348 if (mptcp_initialized)
349 return;
350 mptcp_initialized = 1;
351
352 /*
353 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
354 * we must be able to find IPPROTO_TCP entries for both.
355 */
356 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
357 VERIFY(prp != NULL);
358 bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
359 bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
360 sizeof (mptcp_subflow_usrreqs));
361 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
362 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
363 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
364 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
365 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
366 /*
367 * Socket filters shouldn't attach/detach to/from this protosw
368 * since pr_protosw is to be used instead, which points to the
369 * real protocol; if they do, it is a bug and we should panic.
370 */
371 mptcp_subflow_protosw.pr_filter_head.tqh_first =
372 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
373 mptcp_subflow_protosw.pr_filter_head.tqh_last =
374 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
375
376 #if INET6
377 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
378 IPPROTO_TCP, SOCK_STREAM);
379 VERIFY(prp6 != NULL);
380 bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
381 bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
382 sizeof (mptcp_subflow_usrreqs6));
383 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
384 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
385 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
386 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
387 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
388 /*
389 * Socket filters shouldn't attach/detach to/from this protosw
390 * since pr_protosw is to be used instead, which points to the
391 * real protocol; if they do, it is a bug and we should panic.
392 */
393 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
394 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
395 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
396 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
397 #endif /* INET6 */
398
399 bzero(&mtcbinfo, sizeof (mtcbinfo));
400 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
401 mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
402 if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
403 1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
404 panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
405 /* NOTREACHED */
406 }
407 zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
408 zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);
409
410 mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
411 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
412 mtcbinfo.mppi_lock_grp_attr);
413 mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
414 lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
415 mtcbinfo.mppi_lock_attr);
416
417 mtcbinfo.mppi_gc = mptcp_gc;
418 mtcbinfo.mppi_timer = mptcp_timer;
419 mtcbinfo.mppi_pcbe_create = mptcp_sescreate;
420
421 /* attach to MP domain for garbage collection to take place */
422 mp_pcbinfo_attach(&mtcbinfo);
423
424 mptsub_zone_size = sizeof (struct mptsub);
425 if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
426 8192, "mptsub")) == NULL) {
427 panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
428 /* NOTREACHED */
429 }
430 zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
431 zone_change(mptsub_zone, Z_EXPAND, TRUE);
432
433 mptopt_zone_size = sizeof (struct mptopt);
434 if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
435 1024, "mptopt")) == NULL) {
436 panic("%s: unable to allocate MPTCP option zone\n", __func__);
437 /* NOTREACHED */
438 }
439 zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
440 zone_change(mptopt_zone, Z_EXPAND, TRUE);
441
442 mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
443 if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
444 1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
445 panic("%s: unable to allocate MPTCP address auth zone \n",
446 __func__);
447 /* NOTREACHED */
448 }
449 zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
450 zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);
451
452 /* Set up a list of unique keys */
453 mptcp_key_pool_init();
454 }
455
456 /*
457 * Create an MPTCP session, called as a result of opening a MPTCP socket.
458 */
459 void *
460 mptcp_sescreate(struct socket *mp_so, struct mppcb *mpp)
461 {
462 struct mppcbinfo *mppi;
463 struct mptses *mpte;
464 struct mptcb *mp_tp;
465 int error = 0;
466
467 VERIFY(mpp != NULL);
468 mppi = mpp->mpp_pcbinfo;
469 VERIFY(mppi != NULL);
470
471 __IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
472 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
473
474 /* MPTCP Multipath PCB Extension */
475 bzero(mpte, sizeof (*mpte));
476 VERIFY(mpp->mpp_pcbe == NULL);
477 mpp->mpp_pcbe = mpte;
478 mpte->mpte_mppcb = mpp;
479 mpte->mpte_mptcb = mp_tp;
480
481 TAILQ_INIT(&mpte->mpte_sopts);
482 TAILQ_INIT(&mpte->mpte_subflows);
483 mpte->mpte_associd = SAE_ASSOCID_ANY;
484 mpte->mpte_connid_last = SAE_CONNID_ANY;
485
486 lck_mtx_init(&mpte->mpte_thread_lock, mppi->mppi_lock_grp,
487 mppi->mppi_lock_attr);
488
489 /*
490 * XXX: adi@apple.com
491 *
492 * This can be rather expensive if we have lots of MPTCP sockets,
493 * but we need a kernel thread for this model to work. Perhaps we
494 * could amortize the costs by having one worker thread per a group
495 * of MPTCP sockets.
496 */
497 if (kernel_thread_start(mptcp_thread_func, mpte,
498 &mpte->mpte_thread) != KERN_SUCCESS) {
499 error = ENOBUFS;
500 goto out;
501 }
502 mp_so->so_usecount++; /* for thread */
503
504 /* MPTCP Protocol Control Block */
505 bzero(mp_tp, sizeof (*mp_tp));
506 lck_mtx_init(&mp_tp->mpt_lock, mppi->mppi_lock_grp,
507 mppi->mppi_lock_attr);
508 mp_tp->mpt_mpte = mpte;
509 mp_tp->mpt_state = MPTCPS_CLOSED;
510 out:
511 if (error != 0)
512 lck_mtx_destroy(&mpte->mpte_thread_lock, mppi->mppi_lock_grp);
513 DTRACE_MPTCP5(session__create, struct socket *, mp_so,
514 struct sockbuf *, &mp_so->so_rcv,
515 struct sockbuf *, &mp_so->so_snd,
516 struct mppcb *, mpp, int, error);
517
518 return ((error != 0) ? NULL : mpte);
519 }
520
521 /*
522 * Destroy an MPTCP session.
523 */
524 static void
525 mptcp_sesdestroy(struct mptses *mpte)
526 {
527 struct mptcb *mp_tp;
528
529 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
530
531 mp_tp = mpte->mpte_mptcb;
532 VERIFY(mp_tp != NULL);
533
534 /*
535 * MPTCP Multipath PCB Extension section
536 */
537 mptcp_flush_sopts(mpte);
538 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
539
540 lck_mtx_destroy(&mpte->mpte_thread_lock,
541 mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
542
543 /*
544 * MPTCP Protocol Control Block section
545 */
546 lck_mtx_destroy(&mp_tp->mpt_lock,
547 mpte->mpte_mppcb->mpp_pcbinfo->mppi_lock_grp);
548
549 DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
550 struct mptcb *, mp_tp);
551 }
552
553 /*
554 * Allocate an MPTCP socket option structure.
555 */
556 struct mptopt *
557 mptcp_sopt_alloc(int how)
558 {
559 struct mptopt *mpo;
560
561 mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
562 zalloc_noblock(mptopt_zone);
563 if (mpo != NULL) {
564 bzero(mpo, mptopt_zone_size);
565 }
566
567 return (mpo);
568 }
569
570 /*
571 * Free an MPTCP socket option structure.
572 */
573 void
574 mptcp_sopt_free(struct mptopt *mpo)
575 {
576 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
577
578 zfree(mptopt_zone, mpo);
579 }
580
581 /*
582 * Add a socket option to the MPTCP socket option list.
583 */
584 void
585 mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
586 {
587 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
588 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
589 mpo->mpo_flags |= MPOF_ATTACHED;
590 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
591 }
592
593 /*
594 * Remove a socket option from the MPTCP socket option list.
595 */
596 void
597 mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
598 {
599 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
600 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
601 mpo->mpo_flags &= ~MPOF_ATTACHED;
602 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
603 }
604
605 /*
606 * Search for an existing <sopt_level,sopt_name> socket option.
607 */
608 struct mptopt *
609 mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
610 {
611 struct mptopt *mpo;
612
613 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
614
615 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
616 if (mpo->mpo_level == sopt->sopt_level &&
617 mpo->mpo_name == sopt->sopt_name)
618 break;
619 }
620 VERIFY(mpo == NULL || sopt->sopt_valsize == sizeof (int));
621
622 return (mpo);
623 }
624
625 /*
626 * Flushes all recorded socket options from an MP socket.
627 */
628 void
629 mptcp_flush_sopts(struct mptses *mpte)
630 {
631 struct mptopt *mpo, *tmpo;
632
633 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
634
635 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
636 mptcp_sopt_remove(mpte, mpo);
637 mptcp_sopt_free(mpo);
638 }
639 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
640 }
641
642 /*
643 * Allocate a MPTCP subflow structure.
644 */
645 struct mptsub *
646 mptcp_subflow_alloc(int how)
647 {
648 struct mptsub *mpts;
649
650 mpts = (how == M_WAITOK) ? zalloc(mptsub_zone) :
651 zalloc_noblock(mptsub_zone);
652 if (mpts != NULL) {
653 bzero(mpts, mptsub_zone_size);
654 lck_mtx_init(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp,
655 mtcbinfo.mppi_lock_attr);
656 }
657
658 return (mpts);
659 }
660
661 /*
662 * Deallocate a subflow structure, called when all of the references held
663 * on it have been released. This implies that the subflow has been deleted.
664 */
665 void
666 mptcp_subflow_free(struct mptsub *mpts)
667 {
668 MPTS_LOCK_ASSERT_HELD(mpts);
669
670 VERIFY(mpts->mpts_refcnt == 0);
671 VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
672 VERIFY(mpts->mpts_mpte == NULL);
673 VERIFY(mpts->mpts_socket == NULL);
674
675 if (mpts->mpts_src_sl != NULL) {
676 sockaddrlist_free(mpts->mpts_src_sl);
677 mpts->mpts_src_sl = NULL;
678 }
679 if (mpts->mpts_dst_sl != NULL) {
680 sockaddrlist_free(mpts->mpts_dst_sl);
681 mpts->mpts_dst_sl = NULL;
682 }
683 MPTS_UNLOCK(mpts);
684 lck_mtx_destroy(&mpts->mpts_lock, mtcbinfo.mppi_lock_grp);
685
686 zfree(mptsub_zone, mpts);
687 }
688
689 /*
690 * Create an MPTCP subflow socket.
691 */
692 static int
693 mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
694 struct proc *p, struct socket **so)
695 {
696 struct mptopt smpo, *mpo, *tmpo;
697 struct socket *mp_so;
698 int error;
699
700 *so = NULL;
701 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
702 mp_so = mpte->mpte_mppcb->mpp_socket;
703
704 /*
705 * Create the subflow socket (multipath subflow, non-blocking.)
706 *
707 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
708 * socket; it will be cleared when the socket is peeled off or closed.
709 * It also indicates to the underlying TCP to handle MPTCP options.
710 * A multipath subflow socket implies SS_NOFDREF state.
711 */
712 if ((error = socreate_internal(dom, so, SOCK_STREAM,
713 IPPROTO_TCP, p, SOCF_ASYNC | SOCF_MP_SUBFLOW, PROC_NULL)) != 0) {
714 mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate mp_so 0x%llx"
715 " unable to create subflow socket error %d\n",
716 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
717 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
718 return (error);
719 }
720
721 socket_lock(*so, 0);
722 VERIFY((*so)->so_flags & SOF_MP_SUBFLOW);
723 VERIFY(((*so)->so_state & (SS_NBIO|SS_NOFDREF)) ==
724 (SS_NBIO|SS_NOFDREF));
725
726 /* prevent the socket buffers from being compressed */
727 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
728 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
729
730 /* Inherit preconnect and TFO data flags */
731 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
732 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
733
734 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
735 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
736
737 bzero(&smpo, sizeof (smpo));
738 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
739 smpo.mpo_level = SOL_SOCKET;
740 smpo.mpo_intval = 1;
741
742 /* disable SIGPIPE */
743 smpo.mpo_name = SO_NOSIGPIPE;
744 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
745 goto out;
746
747 /* find out if the subflow's source address goes away */
748 smpo.mpo_name = SO_NOADDRERR;
749 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
750 goto out;
751
752 /* enable keepalive */
753 smpo.mpo_name = SO_KEEPALIVE;
754 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
755 goto out;
756
757 /*
758 * Limit the receive socket buffer size to 64k.
759 *
760 * We need to take into consideration the window scale option
761 * which could be negotiated in one subflow but disabled in
762 * another subflow.
763 * XXX This can be improved in the future.
764 */
765 smpo.mpo_name = SO_RCVBUF;
766 smpo.mpo_intval = MPTCP_RWIN_MAX;
767 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
768 goto out;
769
770 /* N.B.: set by sosetopt */
771 VERIFY(!((*so)->so_rcv.sb_flags & SB_AUTOSIZE));
772 /* Prevent automatic socket buffer sizing. */
773 (*so)->so_snd.sb_flags &= ~SB_AUTOSIZE;
774
775 smpo.mpo_level = IPPROTO_TCP;
776 smpo.mpo_intval = mptcp_subflow_keeptime;
777 smpo.mpo_name = TCP_KEEPALIVE;
778 if ((error = mptcp_subflow_sosetopt(mpte, *so, &smpo)) != 0)
779 goto out;
780
781 /* replay setsockopt(2) on the subflow sockets for eligible options */
782 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
783 int interim;
784
785 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
786 continue;
787
788 /*
789 * Skip those that are handled internally; these options
790 * should not have been recorded and marked with the
791 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
792 */
793 if (mpo->mpo_level == SOL_SOCKET &&
794 (mpo->mpo_name == SO_NOSIGPIPE ||
795 mpo->mpo_name == SO_NOADDRERR ||
796 mpo->mpo_name == SO_KEEPALIVE))
797 continue;
798
799 interim = (mpo->mpo_flags & MPOF_INTERIM);
800 if (mptcp_subflow_sosetopt(mpte, *so, mpo) != 0 && interim) {
801 char buf[32];
802 mptcplog((LOG_ERR, "MPTCP Socket: subflow socreate"
803 " mp_so 0x%llx"
804 " sopt %s val %d interim record removed\n",
805 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
806 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
807 buf, sizeof (buf)), mpo->mpo_intval),
808 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
809 mptcp_sopt_remove(mpte, mpo);
810 mptcp_sopt_free(mpo);
811 continue;
812 }
813 }
814
815 /*
816 * We need to receive everything that the subflow socket has,
817 * so use a customized socket receive function. We will undo
818 * this when the socket is peeled off or closed.
819 */
820 mpts->mpts_oprotosw = (*so)->so_proto;
821 switch (dom) {
822 case PF_INET:
823 (*so)->so_proto = &mptcp_subflow_protosw;
824 break;
825 #if INET6
826 case PF_INET6:
827 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
828 break;
829 #endif /* INET6 */
830 default:
831 VERIFY(0);
832 /* NOTREACHED */
833 }
834
835 out:
836 socket_unlock(*so, 0);
837
838 DTRACE_MPTCP4(subflow__create, struct mptses *, mpte,
839 struct mptsub *, mpts, int, dom, int, error);
840
841 return (error);
842 }
843
844 /*
845 * Close an MPTCP subflow socket.
846 *
847 * Note that this may be called on an embryonic subflow, and the only
848 * thing that is guaranteed valid is the protocol-user request.
849 */
850 static int
851 mptcp_subflow_soclose(struct mptsub *mpts, struct socket *so)
852 {
853 MPTS_LOCK_ASSERT_HELD(mpts);
854
855 socket_lock(so, 0);
856 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
857 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
858
859 /* restore protocol-user requests */
860 VERIFY(mpts->mpts_oprotosw != NULL);
861 so->so_proto = mpts->mpts_oprotosw;
862 socket_unlock(so, 0);
863
864 mpts->mpts_socket = NULL; /* may already be NULL */
865
866 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
867 struct socket *, so,
868 struct sockbuf *, &so->so_rcv,
869 struct sockbuf *, &so->so_snd,
870 struct mptses *, mpts->mpts_mpte);
871
872 return (soclose(so));
873 }
874
875 /*
876 * Connect an MPTCP subflow socket.
877 *
878 * This may be called inline as part of adding a subflow, or asynchronously
879 * by the thread (upon progressing to MPTCPF_JOIN_READY). Note that in the
880 * pending connect case, the subflow socket may have been bound to an interface
881 * and/or a source IP address which may no longer be around by the time this
882 * routine is called; in that case the connect attempt will most likely fail.
883 */
884 static int
885 mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
886 {
887 struct socket *so;
888 int af, error;
889
890 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
891 MPTS_LOCK_ASSERT_HELD(mpts);
892
893 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) ==
894 MPTSF_CONNECTING);
895 VERIFY(mpts->mpts_socket != NULL);
896 so = mpts->mpts_socket;
897 af = mpts->mpts_family;
898
899 if (af == AF_INET || af == AF_INET6) {
900 struct sockaddr_entry *dst_se;
901 char dbuf[MAX_IPv6_STR_LEN];
902
903 dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
904 VERIFY(dst_se != NULL);
905
906 mptcplog((LOG_DEBUG, "MPTCP Socket: connectx mp_so 0x%llx "
907 "dst %s[%d] cid %d [pended %s]\n",
908 (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
909 inet_ntop(af, ((af == AF_INET) ?
910 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
911 (void *)&SIN6(dst_se->se_addr)->sin6_addr),
912 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
913 ntohs(SIN(dst_se->se_addr)->sin_port) :
914 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
915 mpts->mpts_connid,
916 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
917 "YES" : "NO")),
918 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
919 }
920
921 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
922
923 socket_lock(so, 0);
924 mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);
925
926 /* connect the subflow socket */
927 error = soconnectxlocked(so, &mpts->mpts_src_sl, &mpts->mpts_dst_sl,
928 mpts->mpts_mpcr.mpcr_proc, mpts->mpts_mpcr.mpcr_ifscope,
929 mpte->mpte_associd, NULL, CONNREQF_MPTCP,
930 &mpts->mpts_mpcr, sizeof (mpts->mpts_mpcr), NULL, NULL);
931 socket_unlock(so, 0);
932
933 /* Allocate a unique address id per subflow */
934 mpte->mpte_addrid_last++;
935 if (mpte->mpte_addrid_last == 0)
936 mpte->mpte_addrid_last++;
937
938 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
939 struct mptsub *, mpts, int, error);
940
941 return (error);
942 }
943
944 /*
945 * MPTCP subflow socket receive routine, derived from soreceive().
946 */
947 static int
948 mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
949 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
950 {
951 #pragma unused(uio)
952 int flags, error = 0;
953 struct proc *p = current_proc();
954 struct mbuf *m, **mp = mp0;
955 struct mbuf *nextrecord;
956
957 socket_lock(so, 1);
958 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
959
960 #ifdef MORE_LOCKING_DEBUG
961 if (so->so_usecount == 1) {
962 panic("%s: so=%x no other reference on socket\n", __func__, so);
963 /* NOTREACHED */
964 }
965 #endif
966 /*
967 * We return all that is there in the subflow's socket receive buffer
968 * to the MPTCP layer, so we require that the caller passes in the
969 * expected parameters.
970 */
971 if (mp == NULL || controlp != NULL) {
972 socket_unlock(so, 1);
973 return (EINVAL);
974 }
975 *mp = NULL;
976 if (psa != NULL)
977 *psa = NULL;
978 if (flagsp != NULL)
979 flags = *flagsp &~ MSG_EOR;
980 else
981 flags = 0;
982
983 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM)) {
984 socket_unlock(so, 1);
985 return (EOPNOTSUPP);
986 }
987 flags |= (MSG_DONTWAIT|MSG_NBIO);
988
989 /*
990 * If a recv attempt is made on a previously-accepted socket
991 * that has been marked as inactive (disconnected), reject
992 * the request.
993 */
994 if (so->so_flags & SOF_DEFUNCT) {
995 struct sockbuf *sb = &so->so_rcv;
996
997 error = ENOTCONN;
998 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
999 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
1000 SOCK_DOM(so), SOCK_TYPE(so), error));
1001 /*
1002 * This socket should have been disconnected and flushed
1003 * prior to being returned from sodefunct(); there should
1004 * be no data on its receive list, so panic otherwise.
1005 */
1006 if (so->so_state & SS_DEFUNCT)
1007 sb_empty_assert(sb, __func__);
1008 socket_unlock(so, 1);
1009 return (error);
1010 }
1011
1012 /*
1013 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1014 * and if so just return to the caller. This could happen when
1015 * soreceive() is called by a socket upcall function during the
1016 * time the socket is freed. The socket buffer would have been
1017 * locked across the upcall, therefore we cannot put this thread
1018 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1019 * we may livelock), because the lock on the socket buffer will
1020 * only be released when the upcall routine returns to its caller.
1021 * Because the socket has been officially closed, there can be
1022 * no further read on it.
1023 *
1024 * A multipath subflow socket would have its SS_NOFDREF set by
1025 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1026 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1027 */
1028 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
1029 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
1030 socket_unlock(so, 1);
1031 return (0);
1032 }
1033
1034 /*
1035 * For consistency with soreceive() semantics, we need to obey
1036 * SB_LOCK in case some other code path has locked the buffer.
1037 */
1038 error = sblock(&so->so_rcv, 0);
1039 if (error != 0) {
1040 socket_unlock(so, 1);
1041 return (error);
1042 }
1043
1044 m = so->so_rcv.sb_mb;
1045 if (m == NULL) {
1046 /*
1047 * Panic if we notice inconsistencies in the socket's
1048 * receive list; both sb_mb and sb_cc should correctly
1049 * reflect the contents of the list, otherwise we may
1050 * end up with false positives during select() or poll()
1051 * which could put the application in a bad state.
1052 */
1053 SB_MB_CHECK(&so->so_rcv);
1054
1055 if (so->so_error != 0) {
1056 error = so->so_error;
1057 so->so_error = 0;
1058 goto release;
1059 }
1060
1061 if (so->so_state & SS_CANTRCVMORE) {
1062 goto release;
1063 }
1064
1065 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
1066 error = ENOTCONN;
1067 goto release;
1068 }
1069
1070 /*
1071 * MSG_DONTWAIT is implicitly defined and this routine will
1072 * never block, so return EWOULDBLOCK when there is nothing.
1073 */
1074 error = EWOULDBLOCK;
1075 goto release;
1076 }
1077
1078 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1079 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1080 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1081
1082 while (m != NULL) {
1083 nextrecord = m->m_nextpkt;
1084 sbfree(&so->so_rcv, m);
1085
1086 if (mp != NULL) {
1087 *mp = m;
1088 mp = &m->m_next;
1089 so->so_rcv.sb_mb = m = m->m_next;
1090 *mp = NULL;
1091 }
1092
1093 if (m != NULL) {
1094 m->m_nextpkt = nextrecord;
1095 if (nextrecord == NULL)
1096 so->so_rcv.sb_lastrecord = m;
1097 } else {
1098 m = so->so_rcv.sb_mb = nextrecord;
1099 SB_EMPTY_FIXUP(&so->so_rcv);
1100 }
1101 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1102 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1103 }
1104
1105 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1106 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1107 /* notify protocol that we drained all the data */
1108 if ((so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
1109 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, flags);
1110
1111 if (flagsp != NULL)
1112 *flagsp |= flags;
1113
1114 release:
1115 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
1116 return (error);
1117
1118 }
1119
1120
1121 /*
1122 * Prepare an MPTCP subflow socket for peeloff(2); basically undo
1123 * the work done earlier when the subflow socket was created.
1124 */
1125 void
1126 mptcp_subflow_sopeeloff(struct mptses *mpte, struct mptsub *mpts,
1127 struct socket *so)
1128 {
1129 struct mptopt smpo;
1130 struct socket *mp_so;
1131 int p, c;
1132
1133 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1134 mp_so = mpte->mpte_mppcb->mpp_socket;
1135 MPTS_LOCK_ASSERT_HELD(mpts);
1136
1137 socket_lock(so, 0);
1138 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1139 VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));
1140
1141 /* inherit MPTCP socket states */
1142 if (!(mp_so->so_state & SS_NBIO))
1143 so->so_state &= ~SS_NBIO;
1144
1145 /*
1146 * At this point, the socket is not yet closed, as there is at least
1147 * one outstanding usecount previously held by mpts_socket from
1148 * socreate(). Atomically clear SOF_MP_SUBFLOW and SS_NOFDREF here.
1149 */
1150 so->so_flags &= ~SOF_MP_SUBFLOW;
1151 so->so_state &= ~SS_NOFDREF;
1152 so->so_flags &= ~SOF_MPTCP_TRUE;
1153
1154 /* allow socket buffers to be compressed */
1155 so->so_rcv.sb_flags &= ~SB_NOCOMPRESS;
1156 so->so_snd.sb_flags &= ~SB_NOCOMPRESS;
1157
1158 /*
1159 * Allow socket buffer auto sizing.
1160 *
1161 * This will increase the current 64k buffer size to whatever is best.
1162 */
1163 if (!(so->so_rcv.sb_flags & SB_USRSIZE))
1164 so->so_rcv.sb_flags |= SB_AUTOSIZE;
1165 if (!(so->so_snd.sb_flags & SB_USRSIZE))
1166 so->so_snd.sb_flags |= SB_AUTOSIZE;
1167
1168 /* restore protocol-user requests */
1169 VERIFY(mpts->mpts_oprotosw != NULL);
1170 so->so_proto = mpts->mpts_oprotosw;
1171
1172 bzero(&smpo, sizeof (smpo));
1173 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1174 smpo.mpo_level = SOL_SOCKET;
1175
1176 /* inherit SOF_NOSIGPIPE from parent MP socket */
1177 p = (mp_so->so_flags & SOF_NOSIGPIPE);
1178 c = (so->so_flags & SOF_NOSIGPIPE);
1179 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1180 smpo.mpo_name = SO_NOSIGPIPE;
1181 if ((p - c) != 0)
1182 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1183
1184 /* inherit SOF_NOADDRAVAIL from parent MP socket */
1185 p = (mp_so->so_flags & SOF_NOADDRAVAIL);
1186 c = (so->so_flags & SOF_NOADDRAVAIL);
1187 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1188 smpo.mpo_name = SO_NOADDRERR;
1189 if ((p - c) != 0)
1190 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1191
1192 /* inherit SO_KEEPALIVE from parent MP socket */
1193 p = (mp_so->so_options & SO_KEEPALIVE);
1194 c = (so->so_options & SO_KEEPALIVE);
1195 smpo.mpo_intval = ((p - c) > 0) ? 1 : 0;
1196 smpo.mpo_name = SO_KEEPALIVE;
1197 if ((p - c) != 0)
1198 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1199
1200 /* unset TCP level default keepalive option */
1201 p = (intotcpcb(sotoinpcb(mp_so)))->t_keepidle;
1202 c = (intotcpcb(sotoinpcb(so)))->t_keepidle;
1203 smpo.mpo_level = IPPROTO_TCP;
1204 smpo.mpo_intval = 0;
1205 smpo.mpo_name = TCP_KEEPALIVE;
1206 if ((p - c) != 0)
1207 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1208 socket_unlock(so, 0);
1209
1210 DTRACE_MPTCP5(subflow__peeloff, struct mptses *, mpte,
1211 struct mptsub *, mpts, struct socket *, so,
1212 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1213 }
1214
1215 /*
1216 * Establish an initial MPTCP connection (if first subflow and not yet
1217 * connected), or add a subflow to an existing MPTCP connection.
1218 */
1219 int
1220 mptcp_subflow_add(struct mptses *mpte, struct mptsub *mpts,
1221 struct proc *p, uint32_t ifscope)
1222 {
1223 struct sockaddr_entry *se, *src_se = NULL, *dst_se = NULL;
1224 struct socket *mp_so, *so = NULL;
1225 struct mptsub_connreq mpcr;
1226 struct mptcb *mp_tp;
1227 int af, error = 0;
1228
1229 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1230 mp_so = mpte->mpte_mppcb->mpp_socket;
1231 mp_tp = mpte->mpte_mptcb;
1232
1233 MPT_LOCK(mp_tp);
1234 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
1235 /* If the remote end sends Data FIN, refuse subflow adds */
1236 error = ENOTCONN;
1237 MPT_UNLOCK(mp_tp);
1238 return (error);
1239 }
1240 MPT_UNLOCK(mp_tp);
1241
1242 MPTS_LOCK(mpts);
1243 VERIFY(!(mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)));
1244 VERIFY(mpts->mpts_mpte == NULL);
1245 VERIFY(mpts->mpts_socket == NULL);
1246 VERIFY(mpts->mpts_dst_sl != NULL);
1247 VERIFY(mpts->mpts_connid == SAE_CONNID_ANY);
1248
1249 /* select source (if specified) and destination addresses */
1250 if ((error = in_selectaddrs(AF_UNSPEC, &mpts->mpts_src_sl, &src_se,
1251 &mpts->mpts_dst_sl, &dst_se)) != 0)
1252 goto out;
1253
1254 VERIFY(mpts->mpts_dst_sl != NULL && dst_se != NULL);
1255 VERIFY(src_se == NULL || mpts->mpts_src_sl != NULL);
1256 af = mpts->mpts_family = dst_se->se_addr->sa_family;
1257 VERIFY(src_se == NULL || src_se->se_addr->sa_family == af);
1258 VERIFY(af == AF_INET || af == AF_INET6);
1259
1260 /*
1261 * If the source address is not specified, allocate a storage for
1262 * it, so that later on we can fill it in with the actual source
1263 * IP address chosen by the underlying layer for the subflow after
1264 * it is connected.
1265 */
1266 if (mpts->mpts_src_sl == NULL) {
1267 mpts->mpts_src_sl =
1268 sockaddrlist_dup(mpts->mpts_dst_sl, M_WAITOK);
1269 if (mpts->mpts_src_sl == NULL) {
1270 error = ENOBUFS;
1271 goto out;
1272 }
1273 se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
1274 VERIFY(se != NULL && se->se_addr != NULL &&
1275 se->se_addr->sa_len == dst_se->se_addr->sa_len);
1276 bzero(se->se_addr, se->se_addr->sa_len);
1277 se->se_addr->sa_len = dst_se->se_addr->sa_len;
1278 se->se_addr->sa_family = dst_se->se_addr->sa_family;
1279 }
1280
1281 /* create the subflow socket */
1282 if ((error = mptcp_subflow_socreate(mpte, mpts, af, p, &so)) != 0)
1283 goto out;
1284
1285 /*
1286 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
1287 * -1 (SAE_CONNID_ALL).
1288 */
1289 mpte->mpte_connid_last++;
1290 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
1291 mpte->mpte_connid_last == SAE_CONNID_ANY)
1292 mpte->mpte_connid_last++;
1293
1294 mpts->mpts_connid = mpte->mpte_connid_last;
1295 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1296 mpts->mpts_connid != SAE_CONNID_ALL);
1297
1298 mpts->mpts_rel_seq = 1;
1299
1300 /* Allocate a unique address id per subflow */
1301 mpte->mpte_addrid_last++;
1302 if (mpte->mpte_addrid_last == 0)
1303 mpte->mpte_addrid_last++;
1304
1305 /* bind subflow socket to the specified interface */
1306 if (ifscope != IFSCOPE_NONE) {
1307 socket_lock(so, 0);
1308 error = inp_bindif(sotoinpcb(so), ifscope, &mpts->mpts_outif);
1309 if (error != 0) {
1310 socket_unlock(so, 0);
1311 (void) mptcp_subflow_soclose(mpts, so);
1312 goto out;
1313 }
1314 VERIFY(mpts->mpts_outif != NULL);
1315 mpts->mpts_flags |= MPTSF_BOUND_IF;
1316
1317 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add mp_so 0x%llx "
1318 "bindif %s[%d] cid d\n",
1319 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1320 mpts->mpts_outif->if_xname,
1321 ifscope, mpts->mpts_connid),
1322 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
1323 socket_unlock(so, 0);
1324 }
1325
1326 /* if source address and/or port is specified, bind to it */
1327 if (src_se != NULL) {
1328 struct sockaddr *sa = src_se->se_addr;
1329 uint32_t mpts_flags = 0;
1330 in_port_t lport;
1331
1332 switch (af) {
1333 case AF_INET:
1334 if (SIN(sa)->sin_addr.s_addr != INADDR_ANY)
1335 mpts_flags |= MPTSF_BOUND_IP;
1336 if ((lport = SIN(sa)->sin_port) != 0)
1337 mpts_flags |= MPTSF_BOUND_PORT;
1338 break;
1339 #if INET6
1340 case AF_INET6:
1341 VERIFY(af == AF_INET6);
1342 if (!IN6_IS_ADDR_UNSPECIFIED(&SIN6(sa)->sin6_addr))
1343 mpts_flags |= MPTSF_BOUND_IP;
1344 if ((lport = SIN6(sa)->sin6_port) != 0)
1345 mpts_flags |= MPTSF_BOUND_PORT;
1346 break;
1347 #endif /* INET6 */
1348 }
1349
1350 error = sobindlock(so, sa, 1); /* will lock/unlock socket */
1351 if (error != 0) {
1352 (void) mptcp_subflow_soclose(mpts, so);
1353 goto out;
1354 }
1355 mpts->mpts_flags |= mpts_flags;
1356
1357 if (af == AF_INET || af == AF_INET6) {
1358 char sbuf[MAX_IPv6_STR_LEN];
1359
1360 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_add "
1361 "mp_so 0x%llx bindip %s[%d] cid %d\n",
1362 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1363 inet_ntop(af, ((af == AF_INET) ?
1364 (void *)&SIN(sa)->sin_addr.s_addr :
1365 (void *)&SIN6(sa)->sin6_addr), sbuf, sizeof (sbuf)),
1366 ntohs(lport), mpts->mpts_connid),
1367 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1368 }
1369 }
1370
1371 /*
1372 * Insert the subflow into the list, and associate the MPTCP PCB
1373 * as well as the the subflow socket. From this point on, removing
1374 * the subflow needs to be done via mptcp_subflow_del().
1375 */
1376 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1377 mpte->mpte_numflows++;
1378
1379 atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1380 mpts->mpts_mpte = mpte;
1381 mpts->mpts_socket = so;
1382 MPTS_ADDREF_LOCKED(mpts); /* for being in MPTCP subflow list */
1383 MPTS_ADDREF_LOCKED(mpts); /* for subflow socket */
1384 mp_so->so_usecount++; /* for subflow socket */
1385
1386 /* register for subflow socket read/write events */
1387 (void) sock_setupcalls(so, mptcp_subflow_rupcall, mpts,
1388 mptcp_subflow_wupcall, mpts);
1389
1390 /*
1391 * Register for subflow socket control events; ignore
1392 * SO_FILT_HINT_CONNINFO_UPDATED from below since we
1393 * will generate it here.
1394 */
1395 (void) sock_catchevents(so, mptcp_subflow_eupcall, mpts,
1396 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
1397 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
1398 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
1399 SO_FILT_HINT_SUSPEND | SO_FILT_HINT_RESUME |
1400 SO_FILT_HINT_CONNECTED | SO_FILT_HINT_DISCONNECTED |
1401 SO_FILT_HINT_MPFAILOVER | SO_FILT_HINT_MPSTATUS |
1402 SO_FILT_HINT_MUSTRST | SO_FILT_HINT_MPFASTJ |
1403 SO_FILT_HINT_DELETEOK | SO_FILT_HINT_MPCANTRCVMORE);
1404
1405 /* sanity check */
1406 VERIFY(!(mpts->mpts_flags &
1407 (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
1408
1409 bzero(&mpcr, sizeof (mpcr));
1410 mpcr.mpcr_proc = p;
1411 mpcr.mpcr_ifscope = ifscope;
1412 /*
1413 * Indicate to the TCP subflow whether or not it should establish
1414 * the initial MPTCP connection, or join an existing one. Fill
1415 * in the connection request structure with additional info needed
1416 * by the underlying TCP (to be used in the TCP options, etc.)
1417 */
1418 MPT_LOCK(mp_tp);
1419 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
1420 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1421 mptcp_init_local_parms(mp_tp);
1422 }
1423 MPT_UNLOCK(mp_tp);
1424 soisconnecting(mp_so);
1425 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ENABLE;
1426 } else {
1427 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
1428 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1429
1430 /* avoid starting up cellular subflow unless required */
1431 if ((mptcp_delayed_subf_start) &&
1432 (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
1433 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
1434 }
1435 MPT_UNLOCK(mp_tp);
1436 mpcr.mpcr_type = MPTSUB_CONNREQ_MP_ADD;
1437 }
1438
1439 /* If fastjoin or fastopen is requested, set state in mpts */
1440 if (mpte->mpte_nummpcapflows == 0) {
1441 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1442 MPT_LOCK(mp_tp);
1443 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
1444 mpts->mpts_flags |= MPTSF_TFO_REQD;
1445 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1446 }
1447 MPT_UNLOCK(mp_tp);
1448 }
1449
1450 if (so->so_flags & SOF_MPTCP_FASTJOIN) {
1451 MPT_LOCK(mp_tp);
1452 if (mp_tp->mpt_state == MPTCPS_ESTABLISHED) {
1453 mpts->mpts_flags |= MPTSF_FASTJ_REQD;
1454 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1455 }
1456 MPT_UNLOCK(mp_tp);
1457 }
1458 }
1459
1460 mpts->mpts_mpcr = mpcr;
1461 mpts->mpts_flags |= MPTSF_CONNECTING;
1462
1463 if (af == AF_INET || af == AF_INET6) {
1464 char dbuf[MAX_IPv6_STR_LEN];
1465
1466 mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
1467 "mp_so 0x%llx dst %s[%d] cid %d "
1468 "[pending %s]\n", __func__,
1469 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1470 inet_ntop(af, ((af == AF_INET) ?
1471 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
1472 (void *)&SIN6(dst_se->se_addr)->sin6_addr),
1473 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
1474 ntohs(SIN(dst_se->se_addr)->sin_port) :
1475 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
1476 mpts->mpts_connid,
1477 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
1478 "YES" : "NO")),
1479 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1480 }
1481
1482 /* connect right away if first attempt, or if join can be done now */
1483 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
1484 error = mptcp_subflow_soconnectx(mpte, mpts);
1485
1486 out:
1487 MPTS_UNLOCK(mpts);
1488 if (error == 0) {
1489 soevent(mp_so, SO_FILT_HINT_LOCKED |
1490 SO_FILT_HINT_CONNINFO_UPDATED);
1491 }
1492 return (error);
1493 }
1494
1495 /*
1496 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
1497 * will no longer be accessible after a subflow is deleted, thus this
1498 * should occur only after the subflow socket has been disconnected.
1499 * If peeloff(2) is called, leave the socket open.
1500 */
1501 void
1502 mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts, boolean_t close)
1503 {
1504 struct socket *mp_so, *so;
1505
1506 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1507 mp_so = mpte->mpte_mppcb->mpp_socket;
1508
1509 MPTS_LOCK(mpts);
1510 so = mpts->mpts_socket;
1511 VERIFY(so != NULL);
1512
1513 if (close && !((mpts->mpts_flags & MPTSF_DELETEOK) &&
1514 (mpts->mpts_flags & MPTSF_USER_DISCONNECT))) {
1515 MPTS_UNLOCK(mpts);
1516 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del returning"
1517 " mp_so 0x%llx flags %x\n",
1518 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_flags),
1519 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1520 return;
1521 }
1522
1523 mptcplog((LOG_DEBUG, "MPTCP Socket: subflow_del mp_so 0x%llx "
1524 "[u=%d,r=%d] cid %d [close %s] %d %x error %d\n",
1525 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1526 mp_so->so_usecount,
1527 mp_so->so_retaincnt, mpts->mpts_connid,
1528 (close ? "YES" : "NO"), mpts->mpts_soerror,
1529 mpts->mpts_flags,
1530 mp_so->so_error),
1531 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1532
1533 VERIFY(mpts->mpts_mpte == mpte);
1534 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1535 mpts->mpts_connid != SAE_CONNID_ALL);
1536
1537 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
1538 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
1539 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
1540 VERIFY(mpte->mpte_numflows != 0);
1541 mpte->mpte_numflows--;
1542 if (mpte->mpte_active_sub == mpts)
1543 mpte->mpte_active_sub = NULL;
1544
1545 /*
1546 * Drop references held by this subflow socket; there
1547 * will be no further upcalls made from this point.
1548 */
1549 (void) sock_setupcalls(so, NULL, NULL, NULL, NULL);
1550 (void) sock_catchevents(so, NULL, NULL, 0);
1551
1552 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
1553
1554 if (close)
1555 (void) mptcp_subflow_soclose(mpts, so);
1556
1557 VERIFY(mp_so->so_usecount != 0);
1558 mp_so->so_usecount--; /* for subflow socket */
1559 mpts->mpts_mpte = NULL;
1560 mpts->mpts_socket = NULL;
1561 MPTS_UNLOCK(mpts);
1562
1563 MPTS_REMREF(mpts); /* for MPTCP subflow list */
1564 MPTS_REMREF(mpts); /* for subflow socket */
1565
1566 soevent(mp_so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
1567 }
1568
1569 /*
1570 * Disconnect a subflow socket.
1571 */
1572 void
1573 mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts,
1574 boolean_t deleteok)
1575 {
1576 struct socket *so;
1577 struct mptcb *mp_tp;
1578 int send_dfin = 0;
1579
1580 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1581 MPTS_LOCK_ASSERT_HELD(mpts);
1582
1583 VERIFY(mpts->mpts_mpte == mpte);
1584 VERIFY(mpts->mpts_socket != NULL);
1585 VERIFY(mpts->mpts_connid != SAE_CONNID_ANY &&
1586 mpts->mpts_connid != SAE_CONNID_ALL);
1587
1588 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
1589 return;
1590
1591 mpts->mpts_flags |= MPTSF_DISCONNECTING;
1592
1593 /*
1594 * If this is coming from disconnectx(2) or issued as part of
1595 * closing the MPTCP socket, the subflow shouldn't stick around.
1596 * Otherwise let it linger around in case the upper layers need
1597 * to retrieve its conninfo.
1598 */
1599 if (deleteok)
1600 mpts->mpts_flags |= MPTSF_DELETEOK;
1601
1602 so = mpts->mpts_socket;
1603 mp_tp = mpte->mpte_mptcb;
1604 MPT_LOCK(mp_tp);
1605 if (mp_tp->mpt_state > MPTCPS_ESTABLISHED)
1606 send_dfin = 1;
1607 MPT_UNLOCK(mp_tp);
1608
1609 socket_lock(so, 0);
1610 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
1611 (so->so_state & SS_ISCONNECTED)) {
1612 mptcplog((LOG_DEBUG, "MPTCP Socket %s: cid %d fin %d "
1613 "[linger %s]\n", __func__, mpts->mpts_connid, send_dfin,
1614 (deleteok ? "NO" : "YES")),
1615 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1616
1617 if (send_dfin)
1618 mptcp_send_dfin(so);
1619 (void) soshutdownlock(so, SHUT_RD);
1620 (void) soshutdownlock(so, SHUT_WR);
1621 (void) sodisconnectlocked(so);
1622 }
1623 socket_unlock(so, 0);
1624 /*
1625 * Generate a disconnect event for this subflow socket, in case
1626 * the lower layer doesn't do it; this is needed because the
1627 * subflow socket deletion relies on it. This will also end up
1628 * generating SO_FILT_HINT_CONNINFO_UPDATED on the MPTCP socket;
1629 * we cannot do that here because subflow lock is currently held.
1630 */
1631 mptcp_subflow_eupcall(so, mpts, SO_FILT_HINT_DISCONNECTED);
1632 }
1633
1634 /*
1635 * Subflow socket read upcall.
1636 *
1637 * Called when the associated subflow socket posted a read event. The subflow
1638 * socket lock has been released prior to invoking the callback. Note that the
1639 * upcall may occur synchronously as a result of MPTCP performing an action on
1640 * it, or asynchronously as a result of an event happening at the subflow layer.
1641 * Therefore, to maintain lock ordering, the only lock that can be acquired
1642 * here is the thread lock, for signalling purposes.
1643 */
1644 static void
1645 mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
1646 {
1647 #pragma unused(so, waitf)
1648 struct mptsub *mpts = arg;
1649 struct mptses *mpte = mpts->mpts_mpte;
1650
1651 /*
1652 * mpte should never be NULL, except in a race with
1653 * mptcp_subflow_del
1654 */
1655 if (mpte == NULL)
1656 return;
1657
1658 lck_mtx_lock(&mpte->mpte_thread_lock);
1659 mptcp_thread_signal_locked(mpte);
1660 lck_mtx_unlock(&mpte->mpte_thread_lock);
1661 }
1662
1663 /*
1664 * Subflow socket input.
1665 *
1666 * Called in the context of the MPTCP thread, for reading data from the
1667 * underlying subflow socket and delivering it to MPTCP.
1668 */
1669 static void
1670 mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
1671 {
1672 struct mbuf *m = NULL;
1673 struct socket *so;
1674 int error;
1675 struct mptsub *mpts_alt = NULL;
1676
1677 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1678 MPTS_LOCK_ASSERT_HELD(mpts);
1679
1680 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
1681 struct mptsub *, mpts);
1682
1683 if (!(mpts->mpts_flags & MPTSF_CONNECTED))
1684 return;
1685
1686 so = mpts->mpts_socket;
1687
1688 error = sock_receive_internal(so, NULL, &m, 0, NULL);
1689 if (error != 0 && error != EWOULDBLOCK) {
1690 mptcplog((LOG_ERR, "MPTCP Receiver: %s cid %d error %d\n",
1691 __func__, mpts->mpts_connid, error),
1692 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
1693 MPTS_UNLOCK(mpts);
1694 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
1695 if (mpts_alt == NULL) {
1696 if (mptcp_delayed_subf_start) {
1697 mpts_alt = mptcp_get_pending_subflow(mpte,
1698 mpts);
1699 if (mpts_alt) {
1700 mptcplog((LOG_DEBUG,"MPTCP Receiver:"
1701 " %s: pending %d\n",
1702 __func__, mpts_alt->mpts_connid),
1703 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
1704 } else {
1705 mptcplog((LOG_ERR, "MPTCP Receiver:"
1706 " %s: no pending flow for cid %d",
1707 __func__, mpts->mpts_connid),
1708 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
1709 }
1710 } else {
1711 mptcplog((LOG_ERR, "MPTCP Receiver: %s: no alt"
1712 " path for cid %d\n", __func__,
1713 mpts->mpts_connid),
1714 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
1715 }
1716 if (error == ENODATA) {
1717 /*
1718 * Don't ignore ENODATA so as to discover
1719 * nasty middleboxes.
1720 */
1721 struct socket *mp_so =
1722 mpte->mpte_mppcb->mpp_socket;
1723 mp_so->so_error = ENODATA;
1724 sorwakeup(mp_so);
1725 }
1726 }
1727 MPTS_LOCK(mpts);
1728 } else if (error == 0) {
1729 mptcplog((LOG_DEBUG, "MPTCP Receiver: %s: cid %d \n",
1730 __func__, mpts->mpts_connid),
1731 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
1732 }
1733
1734 /* In fallback, make sure to accept data on all but one subflow */
1735 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1736 (!(mpts->mpts_flags & MPTSF_ACTIVE))) {
1737 m_freem(m);
1738 return;
1739 }
1740
1741 if (m != NULL) {
1742
1743 /* Did we receive data on the backup subflow? */
1744 if (!(mpts->mpts_flags & MPTSF_ACTIVE))
1745 mpts->mpts_peerswitch++;
1746 else
1747 mpts->mpts_peerswitch = 0;
1748
1749 /*
1750 * Release subflow lock since this may trigger MPTCP to send,
1751 * possibly on a different subflow. An extra reference has
1752 * been held on the subflow by the MPTCP thread before coming
1753 * here, so we can be sure that it won't go away, in the event
1754 * the MP socket lock gets released.
1755 */
1756 MPTS_UNLOCK(mpts);
1757 mptcp_input(mpte, m);
1758 MPTS_LOCK(mpts);
1759 }
1760 }
1761
1762 /*
1763 * Subflow socket write upcall.
1764 *
1765 * Called when the associated subflow socket posted a read event. The subflow
1766 * socket lock has been released prior to invoking the callback. Note that the
1767 * upcall may occur synchronously as a result of MPTCP performing an action on
1768 * it, or asynchronously as a result of an event happening at the subflow layer.
1769 * Therefore, to maintain lock ordering, the only lock that can be acquired
1770 * here is the thread lock, for signalling purposes.
1771 */
1772 static void
1773 mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
1774 {
1775 #pragma unused(so, waitf)
1776 struct mptsub *mpts = arg;
1777 struct mptses *mpte = mpts->mpts_mpte;
1778
1779 /*
1780 * mpte should never be NULL except in a race with
1781 * mptcp_subflow_del which doesn't hold socket lock across critical
1782 * section. This upcall is made after releasing the socket lock.
1783 * Interleaving of socket operations becomes possible therefore.
1784 */
1785 if (mpte == NULL)
1786 return;
1787
1788 lck_mtx_lock(&mpte->mpte_thread_lock);
1789 mptcp_thread_signal_locked(mpte);
1790 lck_mtx_unlock(&mpte->mpte_thread_lock);
1791 }
1792
1793 /*
1794 * Subflow socket output.
1795 *
1796 * Called for sending data from MPTCP to the underlying subflow socket.
1797 */
1798 int
1799 mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts)
1800 {
1801 struct socket *mp_so, *so;
1802 size_t sb_cc = 0, tot_sent = 0;
1803 struct mbuf *sb_mb;
1804 int error = 0;
1805 u_int64_t mpt_dsn = 0;
1806 struct mptcb *mp_tp = mpte->mpte_mptcb;
1807 struct mbuf *mpt_mbuf = NULL;
1808 u_int64_t off = 0;
1809 struct mbuf *head, *tail;
1810 int tcp_zero_len_write = 0;
1811
1812 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1813 MPTS_LOCK_ASSERT_HELD(mpts);
1814 mp_so = mpte->mpte_mppcb->mpp_socket;
1815 so = mpts->mpts_socket;
1816
1817 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
1818 struct mptsub *, mpts);
1819
1820 /* subflow socket is suspended? */
1821 if (mpts->mpts_flags & MPTSF_SUSPENDED) {
1822 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d is "
1823 "flow controlled\n", __func__,
1824 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
1825 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
1826 goto out;
1827 }
1828
1829 /* subflow socket is not MPTCP capable? */
1830 if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE) &&
1831 !(mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1832 !(mpts->mpts_flags & MPTSF_FASTJ_SEND) &&
1833 !(mpts->mpts_flags & MPTSF_TFO_REQD)) {
1834 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d not "
1835 "MPTCP capable\n", __func__,
1836 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid),
1837 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
1838 goto out;
1839 }
1840
1841 /* Remove Addr Option is not sent reliably as per I-D */
1842 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
1843 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
1844 tp->t_rem_aid = mpte->mpte_lost_aid;
1845 if (mptcp_remaddr_enable)
1846 tp->t_mpflags |= TMPF_SND_REM_ADDR;
1847 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
1848 }
1849
1850 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
1851 mptcp_drop_tfo_data(mpte, mpts);
1852 }
1853
1854 /*
1855 * The mbuf chains containing the metadata (as well as pointing to
1856 * the user data sitting at the MPTCP output queue) would then be
1857 * sent down to the subflow socket.
1858 *
1859 * Some notes on data sequencing:
1860 *
1861 * a. Each mbuf must be a M_PKTHDR.
1862 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
1863 * in the mbuf pkthdr structure.
1864 * c. Each mbuf containing the MPTCP metadata must have its
1865 * pkt_flags marked with the PKTF_MPTCP flag.
1866 */
1867
1868 /* First, drop acknowledged data */
1869 sb_mb = mp_so->so_snd.sb_mb;
1870 if (sb_mb == NULL) {
1871 goto out;
1872 }
1873
1874 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
1875
1876 mpt_mbuf = sb_mb;
1877 while (mpt_mbuf && mpt_mbuf->m_pkthdr.mp_rlen == 0) {
1878 if (((so->so_state & SS_ISCONNECTED) == 0) &&
1879 (mpt_mbuf->m_next == NULL) &&
1880 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1881 /*
1882 * If TFO, allow connection establishment with zero
1883 * length write.
1884 */
1885 tcp_zero_len_write = 1;
1886 goto zero_len_write;
1887 }
1888 mpt_mbuf = mpt_mbuf->m_next;
1889 }
1890 if (mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1891 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1892 } else {
1893 goto out;
1894 }
1895
1896 MPT_LOCK(mp_tp);
1897 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
1898 u_int64_t len = 0;
1899 len = mp_tp->mpt_snduna - mpt_dsn;
1900 MPT_UNLOCK(mp_tp);
1901 sbdrop(&mp_so->so_snd, (int)len);
1902 MPT_LOCK(mp_tp);
1903 }
1904
1905 /*
1906 * In degraded mode, we don't receive data acks, so force free
1907 * mbufs less than snd_nxt
1908 */
1909 if (mp_so->so_snd.sb_mb == NULL) {
1910 MPT_UNLOCK(mp_tp);
1911 goto out;
1912 }
1913
1914 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
1915 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1916 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
1917 MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_sndnxt)) {
1918 u_int64_t len = 0;
1919 len = mp_tp->mpt_sndnxt - mpt_dsn;
1920 sbdrop(&mp_so->so_snd, (int)len);
1921 mp_tp->mpt_snduna = mp_tp->mpt_sndnxt;
1922 }
1923
1924 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
1925 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
1926 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
1927 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
1928 if (mp_tp->mpt_flags & MPTCPF_RECVD_MPFAIL)
1929 mpts->mpts_sndnxt = mp_tp->mpt_dsn_at_csum_fail;
1930 }
1931
1932 /*
1933 * Adjust the subflow's notion of next byte to send based on
1934 * the last unacknowledged byte
1935 */
1936 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_snduna)) {
1937 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
1938 }
1939
1940 /*
1941 * Adjust the top level notion of next byte used for retransmissions
1942 * and sending FINs.
1943 */
1944 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
1945 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
1946 }
1947
1948
1949 /* Now determine the offset from which to start transmitting data */
1950 sb_mb = mp_so->so_snd.sb_mb;
1951 sb_cc = mp_so->so_snd.sb_cc;
1952 if (sb_mb == NULL) {
1953 MPT_UNLOCK(mp_tp);
1954 goto out;
1955 }
1956 if (MPTCP_SEQ_LT(mpts->mpts_sndnxt, mp_tp->mpt_sndmax)) {
1957 off = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
1958 sb_cc -= (size_t)off;
1959 } else {
1960 MPT_UNLOCK(mp_tp);
1961 goto out;
1962 }
1963 MPT_UNLOCK(mp_tp);
1964
1965 mpt_mbuf = sb_mb;
1966 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1967
1968 while (mpt_mbuf && ((mpt_mbuf->m_pkthdr.mp_rlen == 0) ||
1969 (mpt_mbuf->m_pkthdr.mp_rlen <= (u_int32_t)off))) {
1970 off -= mpt_mbuf->m_pkthdr.mp_rlen;
1971 mpt_mbuf = mpt_mbuf->m_next;
1972 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
1973 }
1974 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
1975 mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid = %d "
1976 "snduna = %llu sndnxt = %llu probe %d\n",
1977 __func__, mpts->mpts_connid,
1978 mp_tp->mpt_snduna, mpts->mpts_sndnxt,
1979 mpts->mpts_probecnt),
1980 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
1981
1982 VERIFY(mpt_mbuf && (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
1983
1984 head = tail = NULL;
1985
1986 while (tot_sent < sb_cc) {
1987 struct mbuf *m;
1988 size_t mlen;
1989
1990 mlen = mpt_mbuf->m_pkthdr.mp_rlen;
1991 mlen -= off;
1992 if (mlen == 0)
1993 goto out;
1994
1995 if (mlen > sb_cc) {
1996 panic("%s: unexpected %lu %lu \n", __func__,
1997 mlen, sb_cc);
1998 }
1999
2000 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
2001 M_COPYM_MUST_COPY_HDR);
2002 if (m == NULL) {
2003 error = ENOBUFS;
2004 break;
2005 }
2006
2007 /* Create a DSN mapping for the data (m_copym does it) */
2008 mpt_dsn = mpt_mbuf->m_pkthdr.mp_dsn;
2009 VERIFY(m->m_flags & M_PKTHDR);
2010 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2011 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
2012 m->m_pkthdr.mp_dsn = mpt_dsn + off;
2013 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
2014 m->m_pkthdr.mp_rlen = mlen;
2015 mpts->mpts_rel_seq += mlen;
2016 m->m_pkthdr.len = mlen;
2017
2018 if (head == NULL) {
2019 head = tail = m;
2020 } else {
2021 tail->m_next = m;
2022 tail = m;
2023 }
2024
2025 tot_sent += mlen;
2026 off = 0;
2027 mpt_mbuf = mpt_mbuf->m_next;
2028 }
2029
2030 if (head != NULL) {
2031 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
2032
2033 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
2034 (tp->t_tfo_stats == 0)) {
2035 tp->t_mpflags |= TMPF_TFO_REQUEST;
2036 } else if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
2037 tp->t_mpflags |= TMPF_FASTJOIN_SEND;
2038 }
2039
2040 error = sock_sendmbuf(so, NULL, head, 0, NULL);
2041
2042 DTRACE_MPTCP7(send, struct mbuf *, head, struct socket *, so,
2043 struct sockbuf *, &so->so_rcv,
2044 struct sockbuf *, &so->so_snd,
2045 struct mptses *, mpte, struct mptsub *, mpts,
2046 size_t, tot_sent);
2047 } else if (tcp_zero_len_write == 1) {
2048 zero_len_write:
2049 socket_lock(so, 1);
2050 /* Opting to call pru_send as no mbuf at subflow level */
2051 error = (*so->so_proto->pr_usrreqs->pru_send)
2052 (so, 0, NULL, NULL, NULL, current_proc());
2053 socket_unlock(so, 1);
2054 }
2055
2056 if ((error == 0) || (error == EWOULDBLOCK)) {
2057 mpts->mpts_sndnxt += tot_sent;
2058
2059 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
2060 tcpstat.tcps_mp_num_probes++;
2061 if (tot_sent < mpts->mpts_maxseg)
2062 mpts->mpts_probecnt += 1;
2063 else
2064 mpts->mpts_probecnt +=
2065 tot_sent/mpts->mpts_maxseg;
2066 }
2067
2068 MPT_LOCK(mp_tp);
2069
2070 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mpts->mpts_sndnxt)) {
2071 if (MPTCP_DATASEQ_HIGH32(mpts->mpts_sndnxt) >
2072 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
2073 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
2074 mp_tp->mpt_sndnxt = mpts->mpts_sndnxt;
2075 }
2076 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
2077 MPT_UNLOCK(mp_tp);
2078
2079 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2080 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2081
2082 /* Send once in SYN_SENT state to avoid sending SYN spam */
2083 if (mpts->mpts_flags & MPTSF_FASTJ_SEND) {
2084 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
2085 mpts->mpts_flags &= ~MPTSF_FASTJ_SEND;
2086 }
2087
2088 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2089 (mpts->mpts_probesoon != 0))
2090 mptcplog((LOG_DEBUG, "MPTCP Sender: %s cid %d "
2091 "wrote %d %d probe %d probedelta %d\n",
2092 __func__, mpts->mpts_connid, (int)tot_sent,
2093 (int) sb_cc, mpts->mpts_probecnt,
2094 (tcp_now - mpts->mpts_probesoon)),
2095 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
2096 } else {
2097 mptcplog((LOG_ERR, "MPTCP Sender: %s cid %d error %d len %zd\n",
2098 __func__, mpts->mpts_connid, error, tot_sent),
2099 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2100 }
2101 out:
2102 return (error);
2103 }
2104
2105 /*
2106 * Subflow socket control event upcall.
2107 *
2108 * Called when the associated subflow socket posted one or more control events.
2109 * The subflow socket lock has been released prior to invoking the callback.
2110 * Note that the upcall may occur synchronously as a result of MPTCP performing
2111 * an action on it, or asynchronously as a result of an event happening at the
2112 * subflow layer. Therefore, to maintain lock ordering, the only lock that can
2113 * be acquired here is the thread lock, for signalling purposes.
2114 */
2115 static void
2116 mptcp_subflow_eupcall(struct socket *so, void *arg, uint32_t events)
2117 {
2118 #pragma unused(so)
2119 struct mptsub *mpts = arg;
2120 struct mptses *mpte = mpts->mpts_mpte;
2121
2122 VERIFY(mpte != NULL);
2123
2124 lck_mtx_lock(&mpte->mpte_thread_lock);
2125 atomic_bitset_32(&mpts->mpts_evctl, events);
2126 mptcp_thread_signal_locked(mpte);
2127 lck_mtx_unlock(&mpte->mpte_thread_lock);
2128 }
2129
2130 /*
2131 * Subflow socket control events.
2132 *
2133 * Called for handling events related to the underlying subflow socket.
2134 */
2135 static ev_ret_t
2136 mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
2137 uint64_t *p_mpsofilt_hint)
2138 {
2139 uint32_t events, save_events;
2140 ev_ret_t ret = MPTS_EVRET_OK;
2141 int i = 0;
2142 int mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl)/
2143 sizeof(mpsub_ev_entry_tbl[0]);
2144 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2145 MPTS_LOCK_ASSERT_HELD(mpts);
2146
2147 /* bail if there's nothing to process */
2148 if ((events = mpts->mpts_evctl) == 0)
2149 return (ret);
2150
2151 if (events & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
2152 SO_FILT_HINT_CANTRCVMORE|SO_FILT_HINT_CANTSENDMORE|
2153 SO_FILT_HINT_TIMEOUT|SO_FILT_HINT_NOSRCADDR|
2154 SO_FILT_HINT_IFDENIED|SO_FILT_HINT_SUSPEND|
2155 SO_FILT_HINT_DISCONNECTED)) {
2156 events |= SO_FILT_HINT_MPFAILOVER;
2157 }
2158
2159 save_events = events;
2160
2161 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
2162 struct mptsub *, mpts, uint32_t, events);
2163
2164 mptcplog((LOG_DEBUG, "MPTCP Events: %s cid %d events=%b\n", __func__,
2165 mpts->mpts_connid, events, SO_FILT_HINT_BITS),
2166 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
2167
2168 /*
2169 * Process all the socket filter hints and reset the hint
2170 * once it is handled
2171 */
2172 for (i = 0; (i < mpsub_ev_entry_count) && events; i++) {
2173 /*
2174 * Always execute the DISCONNECTED event, because it will wakeup
2175 * the app.
2176 */
2177 if ((events & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
2178 (ret >= MPTS_EVRET_OK ||
2179 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
2180 ev_ret_t error =
2181 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint);
2182 events &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
2183 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
2184 }
2185 }
2186
2187 /*
2188 * We should be getting only events specified via sock_catchevents(),
2189 * so loudly complain if we have any unprocessed one(s).
2190 */
2191 if (events != 0 || ret < MPTS_EVRET_OK) {
2192 mptcplog((LOG_ERR, "MPTCP Events %s%s: cid %d evret %s (%d)"
2193 " unhandled events=%b\n",
2194 (events != 0) && (ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
2195 __func__, mpts->mpts_connid,
2196 mptcp_evret2str(ret), ret, events, SO_FILT_HINT_BITS),
2197 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2198 }
2199
2200 /* clear the ones we've processed */
2201 atomic_bitclear_32(&mpts->mpts_evctl, save_events);
2202 return (ret);
2203 }
2204
2205 /*
2206 * Handle SO_FILT_HINT_CONNRESET subflow socket event.
2207 */
2208 static ev_ret_t
2209 mptcp_subflow_connreset_ev(struct mptses *mpte, struct mptsub *mpts,
2210 uint64_t *p_mpsofilt_hint)
2211 {
2212 struct socket *mp_so, *so;
2213 struct mptcb *mp_tp;
2214 boolean_t linger;
2215
2216 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2217 MPTS_LOCK_ASSERT_HELD(mpts);
2218 VERIFY(mpte->mpte_mppcb != NULL);
2219 mp_so = mpte->mpte_mppcb->mpp_socket;
2220 mp_tp = mpte->mpte_mptcb;
2221 so = mpts->mpts_socket;
2222
2223 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2224 !(mp_so->so_flags & SOF_PCBCLEARING));
2225
2226 mptcplog((LOG_DEBUG, "MPTCP Events: "
2227 "%s: cid %d [linger %s]\n", __func__,
2228 mpts->mpts_connid, (linger ? "YES" : "NO")),
2229 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2230
2231 /*
2232 * We got a TCP RST for this subflow connection.
2233 *
2234 * Right now, we simply propagate ECONNREFUSED to the MPTCP socket
2235 * client if the MPTCP connection has not been established or
2236 * if the connection has only one subflow and is a connection being
2237 * resumed. Otherwise we close the socket.
2238 */
2239 mptcp_subflow_disconnect(mpte, mpts, !linger);
2240
2241 MPT_LOCK(mp_tp);
2242 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2243 mpts->mpts_soerror = mp_so->so_error = ECONNREFUSED;
2244 } else if (mpte->mpte_nummpcapflows < 1) {
2245 mpts->mpts_soerror = mp_so->so_error = ECONNRESET;
2246 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNRESET;
2247 }
2248 MPT_UNLOCK(mp_tp);
2249
2250 /*
2251 * Keep the subflow socket around, unless the MPTCP socket has
2252 * been detached or the subflow has been disconnected explicitly,
2253 * in which case it should be deleted right away.
2254 */
2255 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2256 }
2257
2258 /*
2259 * Handle SO_FILT_HINT_CANTRCVMORE subflow socket event.
2260 */
2261 static ev_ret_t
2262 mptcp_subflow_cantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
2263 uint64_t *p_mpsofilt_hint)
2264 {
2265 #pragma unused(p_mpsofilt_hint)
2266 struct socket *so;
2267
2268 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2269 MPTS_LOCK_ASSERT_HELD(mpts);
2270
2271 so = mpts->mpts_socket;
2272
2273 mptcplog((LOG_DEBUG, "MPTCP Events: "
2274 "%s: cid %d\n", __func__, mpts->mpts_connid),
2275 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2276
2277 /*
2278 * We got a FIN for this subflow connection. This subflow socket
2279 * is no longer available for receiving data;
2280 * The FIN may arrive with data. The data is handed up to the
2281 * mptcp socket and the subflow is disconnected.
2282 */
2283
2284 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2285 }
2286
2287 /*
2288 * Handle SO_FILT_HINT_CANTSENDMORE subflow socket event.
2289 */
2290 static ev_ret_t
2291 mptcp_subflow_cantsendmore_ev(struct mptses *mpte, struct mptsub *mpts,
2292 uint64_t *p_mpsofilt_hint)
2293 {
2294 #pragma unused(p_mpsofilt_hint)
2295 struct socket *so;
2296
2297 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2298 MPTS_LOCK_ASSERT_HELD(mpts);
2299
2300 so = mpts->mpts_socket;
2301
2302 mptcplog((LOG_DEBUG, "MPTCP Events: "
2303 "%s: cid %d\n", __func__, mpts->mpts_connid),
2304 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2305
2306 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2307 }
2308
2309 /*
2310 * Handle SO_FILT_HINT_TIMEOUT subflow socket event.
2311 */
2312 static ev_ret_t
2313 mptcp_subflow_timeout_ev(struct mptses *mpte, struct mptsub *mpts,
2314 uint64_t *p_mpsofilt_hint)
2315 {
2316 #pragma unused(p_mpsofilt_hint)
2317 struct socket *mp_so, *so;
2318 struct mptcb *mp_tp;
2319 boolean_t linger;
2320
2321 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2322 MPTS_LOCK_ASSERT_HELD(mpts);
2323 VERIFY(mpte->mpte_mppcb != NULL);
2324 mp_so = mpte->mpte_mppcb->mpp_socket;
2325 mp_tp = mpte->mpte_mptcb;
2326 so = mpts->mpts_socket;
2327
2328 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2329 !(mp_so->so_flags & SOF_PCBCLEARING));
2330
2331 mptcplog((LOG_NOTICE, "MPTCP Events: "
2332 "%s: cid %d [linger %s]\n", __func__,
2333 mpts->mpts_connid, (linger ? "YES" : "NO")),
2334 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2335
2336 if (mpts->mpts_soerror == 0)
2337 mpts->mpts_soerror = ETIMEDOUT;
2338
2339 /*
2340 * The subflow connection has timed out.
2341 *
2342 * Right now, we simply propagate ETIMEDOUT to the MPTCP socket
2343 * client if the MPTCP connection has not been established. Otherwise
2344 * drop it.
2345 */
2346 mptcp_subflow_disconnect(mpte, mpts, !linger);
2347
2348 MPT_LOCK(mp_tp);
2349 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2350 mp_so->so_error = ETIMEDOUT;
2351 }
2352 MPT_UNLOCK(mp_tp);
2353
2354 /*
2355 * Keep the subflow socket around, unless the MPTCP socket has
2356 * been detached or the subflow has been disconnected explicitly,
2357 * in which case it should be deleted right away.
2358 */
2359 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2360 }
2361
2362 /*
2363 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
2364 */
2365 static ev_ret_t
2366 mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
2367 uint64_t *p_mpsofilt_hint)
2368 {
2369 #pragma unused(p_mpsofilt_hint)
2370 struct socket *mp_so, *so;
2371 struct mptcb *mp_tp;
2372 boolean_t linger;
2373 struct tcpcb *tp = NULL;
2374
2375 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2376 MPTS_LOCK_ASSERT_HELD(mpts);
2377
2378 VERIFY(mpte->mpte_mppcb != NULL);
2379 mp_so = mpte->mpte_mppcb->mpp_socket;
2380 mp_tp = mpte->mpte_mptcb;
2381 so = mpts->mpts_socket;
2382
2383 /* Not grabbing socket lock as t_local_aid is write once only */
2384 tp = intotcpcb(sotoinpcb(so));
2385 /*
2386 * This overwrites any previous mpte_lost_aid to avoid storing
2387 * too much state when the typical case has only two subflows.
2388 */
2389 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
2390 mpte->mpte_lost_aid = tp->t_local_aid;
2391
2392 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2393 !(mp_so->so_flags & SOF_PCBCLEARING));
2394
2395 mptcplog((LOG_DEBUG, "MPTCP Events: "
2396 "%s cid %d [linger %s]\n", __func__,
2397 mpts->mpts_connid, (linger ? "YES" : "NO")),
2398 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2399
2400 if (mpts->mpts_soerror == 0)
2401 mpts->mpts_soerror = EADDRNOTAVAIL;
2402
2403 /*
2404 * The subflow connection has lost its source address.
2405 *
2406 * Right now, we simply propagate EADDRNOTAVAIL to the MPTCP socket
2407 * client if the MPTCP connection has not been established. If it
2408 * has been established with one subflow , we keep the MPTCP
2409 * connection valid without any subflows till closed by application.
2410 * This lets tcp connection manager decide whether to close this or
2411 * not as it reacts to reachability changes too.
2412 */
2413 mptcp_subflow_disconnect(mpte, mpts, !linger);
2414
2415 MPT_LOCK(mp_tp);
2416 if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) &&
2417 (mp_so->so_flags & SOF_NOADDRAVAIL)) {
2418 mp_so->so_error = EADDRNOTAVAIL;
2419 }
2420 MPT_UNLOCK(mp_tp);
2421
2422 /*
2423 * Keep the subflow socket around, unless the MPTCP socket has
2424 * been detached or the subflow has been disconnected explicitly,
2425 * in which case it should be deleted right away.
2426 */
2427 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2428 }
2429
2430 /*
2431 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
2432 * indicates that the remote side sent a Data FIN
2433 */
2434 static ev_ret_t
2435 mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
2436 uint64_t *p_mpsofilt_hint)
2437 {
2438 struct socket *so, *mp_so;
2439 struct mptcb *mp_tp;
2440
2441 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2442 MPTS_LOCK_ASSERT_HELD(mpts);
2443 mp_so = mpte->mpte_mppcb->mpp_socket;
2444 so = mpts->mpts_socket;
2445 mp_tp = mpte->mpte_mptcb;
2446
2447 mptcplog((LOG_DEBUG, "MPTCP Events: "
2448 "%s: cid %d\n", __func__, mpts->mpts_connid),
2449 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2450
2451 /*
2452 * We got a Data FIN for the MPTCP connection.
2453 * The FIN may arrive with data. The data is handed up to the
2454 * mptcp socket and the user is notified so that it may close
2455 * the socket if needed.
2456 */
2457 MPT_LOCK(mp_tp);
2458 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
2459 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE;
2460 }
2461 MPT_UNLOCK(mp_tp);
2462 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2463 }
2464
2465 /*
2466 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
2467 */
2468 static ev_ret_t
2469 mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
2470 uint64_t *p_mpsofilt_hint)
2471 {
2472 struct mptsub *mpts_alt = NULL;
2473 struct socket *so = NULL;
2474 struct socket *mp_so;
2475 int altpath_exists = 0;
2476
2477 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2478 MPTS_LOCK_ASSERT_HELD(mpts);
2479 mp_so = mpte->mpte_mppcb->mpp_socket;
2480 mptcplog((LOG_NOTICE, "MPTCP Events: "
2481 "%s: mp_so 0x%llx\n", __func__,
2482 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
2483 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2484
2485 MPTS_UNLOCK(mpts);
2486 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
2487
2488 /*
2489 * If there is no alternate eligible subflow, ignore the
2490 * failover hint.
2491 */
2492 if (mpts_alt == NULL) {
2493 mptcplog((LOG_WARNING, "MPTCP Events: "
2494 "%s: no alternate path\n", __func__),
2495 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2496
2497 if (mptcp_delayed_subf_start) {
2498 mpts_alt = mptcp_get_pending_subflow(mpte, mpts);
2499 if (mpts_alt != NULL) {
2500 MPTS_LOCK(mpts_alt);
2501 (void) mptcp_subflow_soconnectx(mpte,
2502 mpts_alt);
2503 MPTS_UNLOCK(mpts_alt);
2504 }
2505 }
2506 MPTS_LOCK(mpts);
2507 goto done;
2508 }
2509 MPTS_LOCK(mpts_alt);
2510 altpath_exists = 1;
2511 so = mpts_alt->mpts_socket;
2512 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
2513 socket_lock(so, 1);
2514 /* All data acknowledged and no RTT spike */
2515 if ((so->so_snd.sb_cc == 0) &&
2516 (mptcp_no_rto_spike(so))) {
2517 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2518 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
2519 } else {
2520 /* no alternate path available */
2521 altpath_exists = 0;
2522 }
2523 socket_unlock(so, 1);
2524 }
2525 if (altpath_exists) {
2526 mptcplog((LOG_INFO, "MPTCP Events: "
2527 "%s: cid = %d\n",
2528 __func__, mpts_alt->mpts_connid),
2529 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2530 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
2531 mpts_alt->mpts_peerswitch = 0;
2532 struct mptcb *mp_tp = mpte->mpte_mptcb;
2533 /* Bring the subflow's notion of snd_nxt into the send window */
2534 MPT_LOCK(mp_tp);
2535 mpts_alt->mpts_sndnxt = mp_tp->mpt_snduna;
2536 MPT_UNLOCK(mp_tp);
2537 mpte->mpte_active_sub = mpts_alt;
2538 socket_lock(so, 1);
2539 sowwakeup(so);
2540 socket_unlock(so, 1);
2541 }
2542 MPTS_UNLOCK(mpts_alt);
2543
2544 if (altpath_exists) {
2545 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
2546 mptcplog((LOG_NOTICE, "MPTCP Events: "
2547 "%s: mp_so 0x%llx switched from "
2548 "%d to %d\n", __func__,
2549 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2550 mpts->mpts_connid, mpts_alt->mpts_connid),
2551 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2552 tcpstat.tcps_mp_switches++;
2553 }
2554
2555 MPTS_LOCK(mpts);
2556 if (altpath_exists) {
2557 mpts->mpts_flags |= MPTSF_FAILINGOVER;
2558 mpts->mpts_flags &= ~MPTSF_ACTIVE;
2559 } else {
2560 mptcplog((LOG_DEBUG, "MPTCP Events %s: no alt cid = %d\n",
2561 __func__, mpts->mpts_connid),
2562 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2563 done:
2564 so = mpts->mpts_socket;
2565 socket_lock(so, 1);
2566 so->so_flags &= ~SOF_MP_TRYFAILOVER;
2567 socket_unlock(so, 1);
2568 }
2569 MPTS_LOCK_ASSERT_HELD(mpts);
2570 return (MPTS_EVRET_OK);
2571 }
2572
2573 /*
2574 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
2575 */
2576 static ev_ret_t
2577 mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
2578 uint64_t *p_mpsofilt_hint)
2579 {
2580 struct socket *mp_so, *so;
2581 struct mptcb *mp_tp;
2582 boolean_t linger;
2583
2584 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2585 MPTS_LOCK_ASSERT_HELD(mpts);
2586 VERIFY(mpte->mpte_mppcb != NULL);
2587 mp_so = mpte->mpte_mppcb->mpp_socket;
2588 mp_tp = mpte->mpte_mptcb;
2589 so = mpts->mpts_socket;
2590
2591 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2592 !(mp_so->so_flags & SOF_PCBCLEARING));
2593
2594 mptcplog((LOG_DEBUG, "MPTCP Events: "
2595 "%s: cid %d [linger %s]\n", __func__,
2596 mpts->mpts_connid, (linger ? "YES" : "NO")),
2597 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2598
2599 if (mpts->mpts_soerror == 0)
2600 mpts->mpts_soerror = EHOSTUNREACH;
2601
2602 /*
2603 * The subflow connection cannot use the outgoing interface.
2604 *
2605 * Right now, we simply propagate EHOSTUNREACH to the MPTCP socket
2606 * client if the MPTCP connection has not been established. If it
2607 * has been established, let the upper layer call disconnectx.
2608 */
2609 mptcp_subflow_disconnect(mpte, mpts, !linger);
2610 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_IFDENIED;
2611
2612 MPT_LOCK(mp_tp);
2613 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2614 mp_so->so_error = EHOSTUNREACH;
2615 }
2616 MPT_UNLOCK(mp_tp);
2617
2618 /*
2619 * Keep the subflow socket around, unless the MPTCP socket has
2620 * been detached or the subflow has been disconnected explicitly,
2621 * in which case it should be deleted right away.
2622 */
2623 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
2624 }
2625
2626 /*
2627 * Handle SO_FILT_HINT_SUSPEND subflow socket event.
2628 */
2629 static ev_ret_t
2630 mptcp_subflow_suspend_ev(struct mptses *mpte, struct mptsub *mpts,
2631 uint64_t *p_mpsofilt_hint)
2632 {
2633 #pragma unused(p_mpsofilt_hint)
2634 struct socket *so;
2635
2636 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2637 MPTS_LOCK_ASSERT_HELD(mpts);
2638
2639 so = mpts->mpts_socket;
2640
2641 /* the subflow connection is being flow controlled */
2642 mpts->mpts_flags |= MPTSF_SUSPENDED;
2643
2644 mptcplog((LOG_DEBUG, "MPTCP Events: "
2645 "%s: cid %d\n", __func__,
2646 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2647
2648 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2649 }
2650
2651 /*
2652 * Handle SO_FILT_HINT_RESUME subflow socket event.
2653 */
2654 static ev_ret_t
2655 mptcp_subflow_resume_ev(struct mptses *mpte, struct mptsub *mpts,
2656 uint64_t *p_mpsofilt_hint)
2657 {
2658 #pragma unused(p_mpsofilt_hint)
2659 struct socket *so;
2660
2661 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2662 MPTS_LOCK_ASSERT_HELD(mpts);
2663
2664 so = mpts->mpts_socket;
2665
2666 /* the subflow connection is no longer flow controlled */
2667 mpts->mpts_flags &= ~MPTSF_SUSPENDED;
2668
2669 mptcplog((LOG_DEBUG, "MPTCP Events: "
2670 "%s: cid %d\n", __func__, mpts->mpts_connid),
2671 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2672
2673 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2674 }
2675
2676 /*
2677 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
2678 */
2679 static ev_ret_t
2680 mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
2681 uint64_t *p_mpsofilt_hint)
2682 {
2683 char buf0[MAX_IPv6_STR_LEN], buf1[MAX_IPv6_STR_LEN];
2684 struct sockaddr_entry *src_se, *dst_se;
2685 struct sockaddr_storage src;
2686 struct socket *mp_so, *so;
2687 struct mptcb *mp_tp;
2688 struct ifnet *outifp;
2689 int af, error = 0;
2690 boolean_t mpok = FALSE;
2691 boolean_t cell = FALSE;
2692 boolean_t wifi = FALSE;
2693 boolean_t wired = FALSE;
2694
2695 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2696 VERIFY(mpte->mpte_mppcb != NULL);
2697 mp_so = mpte->mpte_mppcb->mpp_socket;
2698 mp_tp = mpte->mpte_mptcb;
2699
2700 MPTS_LOCK_ASSERT_HELD(mpts);
2701 so = mpts->mpts_socket;
2702 af = mpts->mpts_family;
2703
2704 if (mpts->mpts_flags & MPTSF_CONNECTED)
2705 return (MPTS_EVRET_OK);
2706
2707 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
2708 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
2709 socket_lock(so, 0);
2710 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2711 (so->so_state & SS_ISCONNECTED)) {
2712 mptcplog((LOG_DEBUG, "MPTCP Events: "
2713 "%s: cid %d disconnect before tcp connect\n",
2714 __func__, mpts->mpts_connid),
2715 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
2716 (void) soshutdownlock(so, SHUT_RD);
2717 (void) soshutdownlock(so, SHUT_WR);
2718 (void) sodisconnectlocked(so);
2719 }
2720 socket_unlock(so, 0);
2721 return (MPTS_EVRET_OK);
2722 }
2723
2724 /*
2725 * The subflow connection has been connected. Find out whether it
2726 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
2727 *
2728 * a. If MPTCP connection is not yet established, then this must be
2729 * the first subflow connection. If MPTCP failed to negotiate,
2730 * indicate to the MPTCP socket client via EPROTO, that the
2731 * underlying TCP connection may be peeled off via peeloff(2).
2732 * Otherwise, mark the MPTCP socket as connected.
2733 *
2734 * b. If MPTCP connection has been established, then this must be
2735 * one of the subsequent subflow connections. If MPTCP failed
2736 * to negotiate, disconnect the connection since peeloff(2)
2737 * is no longer possible.
2738 *
2739 * Right now, we simply unblock any waiters at the MPTCP socket layer
2740 * if the MPTCP connection has not been established.
2741 */
2742 socket_lock(so, 0);
2743
2744 if (so->so_state & SS_ISDISCONNECTED) {
2745 /*
2746 * With MPTCP joins, a connection is connected at the subflow
2747 * level, but the 4th ACK from the server elevates the MPTCP
2748 * subflow to connected state. So there is a small window
2749 * where the subflow could get disconnected before the
2750 * connected event is processed.
2751 */
2752 socket_unlock(so, 0);
2753 return (MPTS_EVRET_OK);
2754 }
2755
2756 mpts->mpts_soerror = 0;
2757 mpts->mpts_flags &= ~MPTSF_CONNECTING;
2758 mpts->mpts_flags |= MPTSF_CONNECTED;
2759
2760 if (!(so->so_flags1 & SOF1_DATA_IDEMPOTENT))
2761 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
2762
2763 struct tcpcb *tp = sototcpcb(so);
2764 if (tp->t_mpflags & TMPF_MPTCP_TRUE)
2765 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
2766
2767 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
2768
2769 VERIFY(mpts->mpts_dst_sl != NULL);
2770 dst_se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
2771 VERIFY(dst_se != NULL && dst_se->se_addr != NULL &&
2772 dst_se->se_addr->sa_family == af);
2773
2774 VERIFY(mpts->mpts_src_sl != NULL);
2775 src_se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
2776 VERIFY(src_se != NULL && src_se->se_addr != NULL &&
2777 src_se->se_addr->sa_family == af);
2778
2779 /* get/check source IP address */
2780 switch (af) {
2781 case AF_INET: {
2782 error = in_getsockaddr_s(so, &src);
2783 if (error == 0) {
2784 struct sockaddr_in *ms = SIN(src_se->se_addr);
2785 struct sockaddr_in *s = SIN(&src);
2786
2787 VERIFY(s->sin_len == ms->sin_len);
2788 VERIFY(ms->sin_family == AF_INET);
2789
2790 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2791 bcmp(&ms->sin_addr, &s->sin_addr,
2792 sizeof (ms->sin_addr)) != 0) {
2793 mptcplog((LOG_ERR, "MPTCP Events: "
2794 "%s: cid %d local "
2795 "address %s (expected %s)\n", __func__,
2796 mpts->mpts_connid, inet_ntop(AF_INET,
2797 (void *)&s->sin_addr.s_addr, buf0,
2798 sizeof (buf0)), inet_ntop(AF_INET,
2799 (void *)&ms->sin_addr.s_addr, buf1,
2800 sizeof (buf1))),
2801 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2802 }
2803 bcopy(s, ms, sizeof (*s));
2804 }
2805 break;
2806 }
2807 #if INET6
2808 case AF_INET6: {
2809 error = in6_getsockaddr_s(so, &src);
2810 if (error == 0) {
2811 struct sockaddr_in6 *ms = SIN6(src_se->se_addr);
2812 struct sockaddr_in6 *s = SIN6(&src);
2813
2814 VERIFY(s->sin6_len == ms->sin6_len);
2815 VERIFY(ms->sin6_family == AF_INET6);
2816
2817 if ((mpts->mpts_flags & MPTSF_BOUND_IP) &&
2818 bcmp(&ms->sin6_addr, &s->sin6_addr,
2819 sizeof (ms->sin6_addr)) != 0) {
2820 mptcplog((LOG_ERR, "MPTCP Events: "
2821 "%s: cid %d local "
2822 "address %s (expected %s)\n", __func__,
2823 mpts->mpts_connid, inet_ntop(AF_INET6,
2824 (void *)&s->sin6_addr, buf0,
2825 sizeof (buf0)), inet_ntop(AF_INET6,
2826 (void *)&ms->sin6_addr, buf1,
2827 sizeof (buf1))),
2828 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2829 }
2830 bcopy(s, ms, sizeof (*s));
2831 }
2832 break;
2833 }
2834 #endif /* INET6 */
2835 default:
2836 VERIFY(0);
2837 /* NOTREACHED */
2838 }
2839
2840 if (error != 0) {
2841 mptcplog((LOG_ERR, "MPTCP Events "
2842 "%s: cid %d getsockaddr failed (%d)\n",
2843 __func__, mpts->mpts_connid, error),
2844 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2845 }
2846
2847 /* get/verify the outbound interface */
2848 outifp = sotoinpcb(so)->inp_last_outifp; /* could be NULL */
2849 if (mpts->mpts_flags & MPTSF_BOUND_IF) {
2850 VERIFY(mpts->mpts_outif != NULL);
2851 if (mpts->mpts_outif != outifp) {
2852 mptcplog((LOG_ERR, "MPTCP Events: %s: cid %d outif %s "
2853 "(expected %s)\n", __func__, mpts->mpts_connid,
2854 ((outifp != NULL) ? outifp->if_xname : "NULL"),
2855 mpts->mpts_outif->if_xname),
2856 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_ERR);
2857
2858 if (outifp == NULL)
2859 outifp = mpts->mpts_outif;
2860 }
2861 } else {
2862 mpts->mpts_outif = outifp;
2863 }
2864
2865 mpts->mpts_srtt = (intotcpcb(sotoinpcb(so)))->t_srtt;
2866 mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(so)))->t_rxtcur;
2867 mpts->mpts_maxseg = (intotcpcb(sotoinpcb(so)))->t_maxseg;
2868
2869 cell = IFNET_IS_CELLULAR(mpts->mpts_outif);
2870 wifi = (!cell && IFNET_IS_WIFI(mpts->mpts_outif));
2871 wired = (!wifi && IFNET_IS_WIRED(mpts->mpts_outif));
2872
2873 if (cell)
2874 mpts->mpts_linktype |= MPTSL_CELL;
2875 else if (wifi)
2876 mpts->mpts_linktype |= MPTSL_WIFI;
2877 else if (wired)
2878 mpts->mpts_linktype |= MPTSL_WIRED;
2879
2880 socket_unlock(so, 0);
2881
2882 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: cid %d "
2883 "establishment srtt %d \n", __func__,
2884 mpts->mpts_connid, (mpts->mpts_srtt >> 5)),
2885 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
2886
2887
2888 mptcplog((LOG_DEBUG, "MPTCP Socket: "
2889 "%s: cid %d outif %s %s[%d] -> %s[%d] "
2890 "is %s\n", __func__, mpts->mpts_connid, ((outifp != NULL) ?
2891 outifp->if_xname : "NULL"), inet_ntop(af, (af == AF_INET) ?
2892 (void *)&SIN(src_se->se_addr)->sin_addr.s_addr :
2893 (void *)&SIN6(src_se->se_addr)->sin6_addr, buf0, sizeof (buf0)),
2894 ((af == AF_INET) ? ntohs(SIN(src_se->se_addr)->sin_port) :
2895 ntohs(SIN6(src_se->se_addr)->sin6_port)),
2896 inet_ntop(af, ((af == AF_INET) ?
2897 (void *)&SIN(dst_se->se_addr)->sin_addr.s_addr :
2898 (void *)&SIN6(dst_se->se_addr)->sin6_addr), buf1, sizeof (buf1)),
2899 ((af == AF_INET) ? ntohs(SIN(dst_se->se_addr)->sin_port) :
2900 ntohs(SIN6(dst_se->se_addr)->sin6_port)),
2901 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ?
2902 "MPTCP capable" : "a regular TCP")),
2903 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
2904
2905 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
2906 MPTS_UNLOCK(mpts);
2907
2908 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
2909
2910 MPT_LOCK(mp_tp);
2911 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
2912 /* case (a) above */
2913 if (!mpok) {
2914 mp_tp->mpt_flags |= MPTCPF_PEEL_OFF;
2915 (void) mptcp_drop(mpte, mp_tp, EPROTO);
2916 MPT_UNLOCK(mp_tp);
2917 } else {
2918 MPT_UNLOCK(mp_tp);
2919 mptcplog((LOG_DEBUG, "MPTCP State: "
2920 "MPTCPS_ESTABLISHED for mp_so 0x%llx \n",
2921 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
2922 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
2923 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
2924 mpte->mpte_associd = mpts->mpts_connid;
2925 DTRACE_MPTCP2(state__change,
2926 struct mptcb *, mp_tp,
2927 uint32_t, 0 /* event */);
2928
2929 (void) mptcp_setconnorder(mpte, mpts->mpts_connid, 1);
2930 soisconnected(mp_so);
2931 }
2932 MPTS_LOCK(mpts);
2933 if (mpok) {
2934 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
2935 mpte->mpte_nummpcapflows++;
2936 MPT_LOCK_SPIN(mp_tp);
2937 /* With TFO, sndnxt may be initialized earlier */
2938 if (mpts->mpts_sndnxt == 0)
2939 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
2940 MPT_UNLOCK(mp_tp);
2941 }
2942 } else if (mpok) {
2943 MPT_UNLOCK(mp_tp);
2944 if (mptcp_rwnotify && (mpte->mpte_nummpcapflows == 0)) {
2945 /* Experimental code, disabled by default. */
2946 sorwakeup(mp_so);
2947 sowwakeup(mp_so);
2948 }
2949 /*
2950 * case (b) above
2951 * In case of additional flows, the MPTCP socket is not
2952 * MPTSF_MP_CAPABLE until an ACK is received from server
2953 * for 3-way handshake. TCP would have guaranteed that this
2954 * is an MPTCP subflow.
2955 */
2956 MPTS_LOCK(mpts);
2957 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
2958 mpts->mpts_flags &= ~MPTSF_FASTJ_REQD;
2959 mpte->mpte_nummpcapflows++;
2960 MPT_LOCK_SPIN(mp_tp);
2961 /* With Fastjoin, sndnxt is updated before connected_ev */
2962 if (mpts->mpts_sndnxt == 0) {
2963 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
2964 mpts->mpts_rel_seq = 1;
2965 }
2966 MPT_UNLOCK(mp_tp);
2967 mptcp_output_needed(mpte, mpts);
2968 } else {
2969 MPT_UNLOCK(mp_tp);
2970 MPTS_LOCK(mpts);
2971 }
2972
2973 MPTS_LOCK_ASSERT_HELD(mpts);
2974
2975 return (MPTS_EVRET_OK); /* keep the subflow socket around */
2976 }
2977
2978 /*
2979 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
2980 */
2981 static ev_ret_t
2982 mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
2983 uint64_t *p_mpsofilt_hint)
2984 {
2985 struct socket *mp_so, *so;
2986 struct mptcb *mp_tp;
2987 boolean_t linger;
2988
2989 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
2990 MPTS_LOCK_ASSERT_HELD(mpts);
2991 VERIFY(mpte->mpte_mppcb != NULL);
2992 mp_so = mpte->mpte_mppcb->mpp_socket;
2993 mp_tp = mpte->mpte_mptcb;
2994 so = mpts->mpts_socket;
2995
2996 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
2997 !(mp_so->so_flags & SOF_PCBCLEARING));
2998
2999 mptcplog((LOG_DEBUG, "MPTCP Events: "
3000 "%s: cid %d [linger %s]\n", __func__,
3001 mpts->mpts_connid, (linger ? "YES" : "NO")),
3002 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3003
3004 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3005 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3006
3007 /*
3008 * Clear flags that are used by getconninfo to return state.
3009 * Retain like MPTSF_DELETEOK for internal purposes.
3010 */
3011 mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
3012 MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
3013 MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|
3014 MPTSF_SUSPENDED|MPTSF_ACTIVE);
3015 mpts->mpts_flags |= MPTSF_DISCONNECTED;
3016
3017 /*
3018 * The subflow connection has been disconnected.
3019 *
3020 * Right now, we simply unblock any waiters at the MPTCP socket layer
3021 * if the MPTCP connection has not been established.
3022 */
3023 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED;
3024
3025 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
3026 mpte->mpte_nummpcapflows--;
3027 if (mpte->mpte_active_sub == mpts) {
3028 mpte->mpte_active_sub = NULL;
3029 mptcplog((LOG_DEBUG, "MPTCP Events: "
3030 "%s: resetting active subflow \n",
3031 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3032 }
3033 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
3034 }
3035
3036 MPT_LOCK(mp_tp);
3037 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
3038 MPT_UNLOCK(mp_tp);
3039 MPTS_UNLOCK(mpts);
3040 soisdisconnected(mp_so);
3041 MPTS_LOCK(mpts);
3042 } else {
3043 MPT_UNLOCK(mp_tp);
3044 }
3045
3046 /*
3047 * The underlying subflow socket has been disconnected;
3048 * it is no longer useful to us. Keep the subflow socket
3049 * around, unless the MPTCP socket has been detached or
3050 * the subflow has been disconnected explicitly, in which
3051 * case it should be deleted right away.
3052 */
3053 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3054 }
3055
3056 /*
3057 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
3058 */
3059 static ev_ret_t
3060 mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
3061 uint64_t *p_mpsofilt_hint)
3062 {
3063 struct socket *mp_so, *so;
3064 struct mptcb *mp_tp;
3065 ev_ret_t ret = MPTS_EVRET_OK;
3066
3067 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3068 VERIFY(mpte->mpte_mppcb != NULL);
3069 mp_so = mpte->mpte_mppcb->mpp_socket;
3070 mp_tp = mpte->mpte_mptcb;
3071
3072 MPTS_LOCK_ASSERT_HELD(mpts);
3073 so = mpts->mpts_socket;
3074
3075 socket_lock(so, 0);
3076 MPT_LOCK(mp_tp);
3077
3078 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
3079 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3080 else
3081 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
3082
3083 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
3084 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
3085 goto done;
3086 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3087 }
3088 else
3089 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
3090
3091 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
3092 mpts->mpts_flags |= MPTSF_MP_READY;
3093 else
3094 mpts->mpts_flags &= ~MPTSF_MP_READY;
3095
3096 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3097 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
3098 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
3099 }
3100
3101 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
3102 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
3103 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
3104 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
3105 SO_FILT_HINT_CONNINFO_UPDATED;
3106 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
3107 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
3108 ret = MPTS_EVRET_CONNECT_PENDING;
3109 } else {
3110 *p_mpsofilt_hint |= SO_FILT_HINT_LOCKED |
3111 SO_FILT_HINT_CONNINFO_UPDATED;
3112 }
3113
3114 mptcplog((LOG_DEBUG, "MPTCP Events: "
3115 "%s: mp_so 0x%llx mpt_flags=%b cid %d "
3116 "mptsf=%b\n", __func__,
3117 (u_int64_t)VM_KERNEL_ADDRPERM(mpte->mpte_mppcb->mpp_socket),
3118 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
3119 mpts->mpts_flags, MPTSF_BITS),
3120 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3121
3122 done:
3123 MPT_UNLOCK(mp_tp);
3124 socket_unlock(so, 0);
3125 return (ret);
3126 }
3127
3128 /*
3129 * Handle SO_FILT_HINT_MUSTRST subflow socket event
3130 */
3131 static ev_ret_t
3132 mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
3133 uint64_t *p_mpsofilt_hint)
3134 {
3135 struct socket *mp_so, *so;
3136 struct mptcb *mp_tp;
3137 boolean_t linger;
3138
3139
3140 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3141 MPTS_LOCK_ASSERT_HELD(mpts);
3142 VERIFY(mpte->mpte_mppcb != NULL);
3143 mp_so = mpte->mpte_mppcb->mpp_socket;
3144 mp_tp = mpte->mpte_mptcb;
3145 so = mpts->mpts_socket;
3146
3147 linger = (!(mpts->mpts_flags & MPTSF_DELETEOK) &&
3148 !(mp_so->so_flags & SOF_PCBCLEARING));
3149
3150 if (mpts->mpts_soerror == 0)
3151 mpts->mpts_soerror = ECONNABORTED;
3152
3153 /* We got an invalid option or a fast close */
3154 socket_lock(so, 0);
3155 struct tcptemp *t_template;
3156 struct inpcb *inp = sotoinpcb(so);
3157 struct tcpcb *tp = NULL;
3158
3159 tp = intotcpcb(inp);
3160 so->so_error = ECONNABORTED;
3161
3162 t_template = tcp_maketemplate(tp);
3163 if (t_template) {
3164 struct tcp_respond_args tra;
3165
3166 bzero(&tra, sizeof(tra));
3167 if (inp->inp_flags & INP_BOUND_IF)
3168 tra.ifscope = inp->inp_boundifp->if_index;
3169 else
3170 tra.ifscope = IFSCOPE_NONE;
3171 tra.awdl_unrestricted = 1;
3172
3173 tcp_respond(tp, t_template->tt_ipgen,
3174 &t_template->tt_t, (struct mbuf *)NULL,
3175 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
3176 (void) m_free(dtom(t_template));
3177 mptcplog((LOG_DEBUG, "MPTCP Events: "
3178 "%s: mp_so 0x%llx cid %d \n",
3179 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3180 so, mpts->mpts_connid),
3181 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3182 }
3183 socket_unlock(so, 0);
3184 mptcp_subflow_disconnect(mpte, mpts, !linger);
3185
3186 *p_mpsofilt_hint |= (SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNINFO_UPDATED);
3187
3188 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
3189 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
3190
3191 MPT_LOCK(mp_tp);
3192 if ((mp_tp->mpt_state < MPTCPS_ESTABLISHED) ||
3193 (mp_tp->mpt_state == MPTCPS_FASTCLOSE_WAIT)) {
3194 mp_so->so_error = ECONNABORTED;
3195 }
3196 /*
3197 * Ideally there should be a state transition for when a FASTCLOSE
3198 * is received. Right now we keep the connection in MPTCPS_ESTABLISHED
3199 * state and only go to terminal state when the user level code calls
3200 * close after processing the SO_FILT_HINT_CONNRESET event.
3201 */
3202 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
3203 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
3204 MPT_UNLOCK(mp_tp);
3205
3206 /*
3207 * Keep the subflow socket around unless the subflow has been
3208 * disconnected explicitly.
3209 */
3210 return (linger ? MPTS_EVRET_OK : MPTS_EVRET_DELETE);
3211 }
3212
3213 static ev_ret_t
3214 mptcp_fastjoin_ev(struct mptses *mpte, struct mptsub *mpts,
3215 uint64_t *p_mpsofilt_hint)
3216 {
3217 #pragma unused(p_mpsofilt_hint)
3218 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3219 MPTS_LOCK_ASSERT_HELD(mpts);
3220 VERIFY(mpte->mpte_mppcb != NULL);
3221
3222 if (mpte->mpte_nummpcapflows == 0) {
3223 struct mptcb *mp_tp = mpte->mpte_mptcb;
3224 mptcplog((LOG_DEBUG,"MPTCP Events: %s: %llx %llx \n",
3225 __func__, mp_tp->mpt_snduna, mpts->mpts_sndnxt),
3226 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3227
3228 mpte->mpte_active_sub = mpts;
3229 mpts->mpts_flags |= (MPTSF_FASTJ_SEND | MPTSF_ACTIVE);
3230 MPT_LOCK(mp_tp);
3231 /*
3232 * If mptcp_subflow_output is called before fastjoin_ev
3233 * then mpts->mpts_sndnxt is initialized to mp_tp->mpt_snduna
3234 * and further mpts->mpts_sndnxt is incremented by len copied.
3235 */
3236 if (mpts->mpts_sndnxt == 0) {
3237 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
3238 }
3239 MPT_UNLOCK(mp_tp);
3240 }
3241
3242 return (MPTS_EVRET_OK);
3243 }
3244
3245 static ev_ret_t
3246 mptcp_deleteok_ev(struct mptses *mpte, struct mptsub *mpts,
3247 uint64_t *p_mpsofilt_hint)
3248 {
3249 #pragma unused(p_mpsofilt_hint)
3250 MPTE_LOCK_ASSERT_HELD(mpte);
3251 MPTS_LOCK_ASSERT_HELD(mpts);
3252 VERIFY(mpte->mpte_mppcb != NULL);
3253
3254 mptcplog((LOG_DEBUG, "MPTCP Events: "
3255 "%s cid %d\n", __func__, mpts->mpts_connid),
3256 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3257
3258 mpts->mpts_flags |= MPTSF_DELETEOK;
3259 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3260 return (MPTS_EVRET_DELETE);
3261 else
3262 return (MPTS_EVRET_OK);
3263 }
3264
3265 static const char *
3266 mptcp_evret2str(ev_ret_t ret)
3267 {
3268 const char *c = "UNKNOWN";
3269
3270 switch (ret) {
3271 case MPTS_EVRET_DELETE:
3272 c = "MPTS_EVRET_DELETE";
3273 break;
3274 case MPTS_EVRET_CONNECT_PENDING:
3275 c = "MPTS_EVRET_CONNECT_PENDING";
3276 break;
3277 case MPTS_EVRET_DISCONNECT_FALLBACK:
3278 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3279 break;
3280 case MPTS_EVRET_OK:
3281 c = "MPTS_EVRET_OK";
3282 break;
3283 default:
3284 break;
3285 }
3286 return (c);
3287 }
3288
3289 /*
3290 * Add a reference to a subflow structure; used by MPTS_ADDREF().
3291 */
3292 void
3293 mptcp_subflow_addref(struct mptsub *mpts, int locked)
3294 {
3295 if (!locked)
3296 MPTS_LOCK(mpts);
3297 else
3298 MPTS_LOCK_ASSERT_HELD(mpts);
3299
3300 if (++mpts->mpts_refcnt == 0) {
3301 panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
3302 /* NOTREACHED */
3303 }
3304 if (!locked)
3305 MPTS_UNLOCK(mpts);
3306 }
3307
3308 /*
3309 * Remove a reference held on a subflow structure; used by MPTS_REMREF();
3310 */
3311 void
3312 mptcp_subflow_remref(struct mptsub *mpts)
3313 {
3314 MPTS_LOCK(mpts);
3315 if (mpts->mpts_refcnt == 0) {
3316 panic("%s: mpts %p negative refcnt\n", __func__, mpts);
3317 /* NOTREACHED */
3318 }
3319 if (--mpts->mpts_refcnt > 0) {
3320 MPTS_UNLOCK(mpts);
3321 return;
3322 }
3323 /* callee will unlock and destroy lock */
3324 mptcp_subflow_free(mpts);
3325 }
3326
3327 /*
3328 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
3329 * caller must ensure that the option can be issued on subflow sockets, via
3330 * MPOF_SUBFLOW_OK flag.
3331 */
3332 int
3333 mptcp_subflow_sosetopt(struct mptses *mpte, struct socket *so,
3334 struct mptopt *mpo)
3335 {
3336 struct socket *mp_so;
3337 struct sockopt sopt;
3338 char buf[32];
3339 int error;
3340
3341 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3342 mpo->mpo_flags &= ~MPOF_INTERIM;
3343
3344 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3345 mp_so = mpte->mpte_mppcb->mpp_socket;
3346
3347 bzero(&sopt, sizeof (sopt));
3348 sopt.sopt_dir = SOPT_SET;
3349 sopt.sopt_level = mpo->mpo_level;
3350 sopt.sopt_name = mpo->mpo_name;
3351 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3352 sopt.sopt_valsize = sizeof (int);
3353 sopt.sopt_p = kernproc;
3354
3355 error = sosetoptlock(so, &sopt, 0); /* already locked */
3356 if (error == 0) {
3357 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3358 "%s: mp_so 0x%llx sopt %s "
3359 "val %d set successful\n", __func__,
3360 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3361 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3362 buf, sizeof (buf)), mpo->mpo_intval),
3363 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3364 } else {
3365 mptcplog((LOG_ERR, "MPTCP Socket: "
3366 "%s: mp_so 0x%llx sopt %s "
3367 "val %d set error %d\n", __func__,
3368 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3369 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3370 buf, sizeof (buf)), mpo->mpo_intval, error),
3371 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3372 }
3373 return (error);
3374 }
3375
3376 /*
3377 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
3378 * caller must ensure that the option can be issued on subflow sockets, via
3379 * MPOF_SUBFLOW_OK flag.
3380 */
3381 int
3382 mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
3383 struct mptopt *mpo)
3384 {
3385 struct socket *mp_so;
3386 struct sockopt sopt;
3387 char buf[32];
3388 int error;
3389
3390 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
3391 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3392 mp_so = mpte->mpte_mppcb->mpp_socket;
3393
3394 bzero(&sopt, sizeof (sopt));
3395 sopt.sopt_dir = SOPT_GET;
3396 sopt.sopt_level = mpo->mpo_level;
3397 sopt.sopt_name = mpo->mpo_name;
3398 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
3399 sopt.sopt_valsize = sizeof (int);
3400 sopt.sopt_p = kernproc;
3401
3402 error = sogetoptlock(so, &sopt, 0); /* already locked */
3403 if (error == 0) {
3404 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3405 "%s: mp_so 0x%llx sopt %s "
3406 "val %d get successful\n", __func__,
3407 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3408 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name,
3409 buf, sizeof (buf)), mpo->mpo_intval),
3410 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3411 } else {
3412 mptcplog((LOG_ERR, "MPTCP Socket: "
3413 "%s: mp_so 0x%llx sopt %s get error %d\n",
3414 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3415 mptcp_sopt2str(mpo->mpo_level,
3416 mpo->mpo_name, buf, sizeof (buf)), error),
3417 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
3418 }
3419 return (error);
3420 }
3421
3422
3423 /*
3424 * MPTCP garbage collector.
3425 *
3426 * This routine is called by the MP domain on-demand, periodic callout,
3427 * which is triggered when a MPTCP socket is closed. The callout will
3428 * repeat as long as this routine returns a non-zero value.
3429 */
3430 static uint32_t
3431 mptcp_gc(struct mppcbinfo *mppi)
3432 {
3433 struct mppcb *mpp, *tmpp;
3434 uint32_t active = 0;
3435
3436 lck_mtx_assert(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
3437
3438 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
3439 struct socket *mp_so;
3440 struct mptses *mpte;
3441 struct mptcb *mp_tp;
3442
3443 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
3444 mp_so = mpp->mpp_socket;
3445 VERIFY(mp_so != NULL);
3446 mpte = mptompte(mpp);
3447 VERIFY(mpte != NULL);
3448 mp_tp = mpte->mpte_mptcb;
3449 VERIFY(mp_tp != NULL);
3450
3451 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3452 "%s: mp_so 0x%llx found "
3453 "(u=%d,r=%d,s=%d)\n", __func__,
3454 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
3455 mp_so->so_retaincnt, mpp->mpp_state),
3456 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3457
3458 if (!lck_mtx_try_lock(&mpp->mpp_lock)) {
3459 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3460 "%s: mp_so 0x%llx skipped "
3461 "(u=%d,r=%d)\n", __func__,
3462 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3463 mp_so->so_usecount, mp_so->so_retaincnt),
3464 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3465 active++;
3466 continue;
3467 }
3468
3469 /* check again under the lock */
3470 if (mp_so->so_usecount > 1) {
3471 boolean_t wakeup = FALSE;
3472 struct mptsub *mpts, *tmpts;
3473
3474 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3475 "%s: mp_so 0x%llx skipped "
3476 "[u=%d,r=%d] %d %d\n", __func__,
3477 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3478 mp_so->so_usecount, mp_so->so_retaincnt,
3479 mp_tp->mpt_gc_ticks,
3480 mp_tp->mpt_state),
3481 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3482
3483 MPT_LOCK(mp_tp);
3484 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
3485 if (mp_tp->mpt_gc_ticks > 0)
3486 mp_tp->mpt_gc_ticks--;
3487 if (mp_tp->mpt_gc_ticks == 0) {
3488 wakeup = TRUE;
3489 if (mp_tp->mpt_localkey != NULL) {
3490 mptcp_free_key(
3491 mp_tp->mpt_localkey);
3492 mp_tp->mpt_localkey = NULL;
3493 }
3494 }
3495 }
3496 MPT_UNLOCK(mp_tp);
3497 if (wakeup) {
3498 TAILQ_FOREACH_SAFE(mpts,
3499 &mpte->mpte_subflows, mpts_entry, tmpts) {
3500 MPTS_LOCK(mpts);
3501 mpts->mpts_flags |= MPTSF_DELETEOK;
3502 if (mpts->mpts_soerror == 0)
3503 mpts->mpts_soerror = ETIMEDOUT;
3504 mptcp_subflow_eupcall(mpts->mpts_socket,
3505 mpts, SO_FILT_HINT_DISCONNECTED);
3506 MPTS_UNLOCK(mpts);
3507 }
3508 }
3509 lck_mtx_unlock(&mpp->mpp_lock);
3510 active++;
3511 continue;
3512 }
3513
3514 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
3515 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3516 "%s: mp_so 0x%llx skipped "
3517 "[u=%d,r=%d,s=%d]\n", __func__,
3518 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3519 mp_so->so_usecount, mp_so->so_retaincnt,
3520 mpp->mpp_state),
3521 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3522 lck_mtx_unlock(&mpp->mpp_lock);
3523 active++;
3524 continue;
3525 }
3526
3527 /*
3528 * The PCB has been detached, and there is exactly 1 refnct
3529 * held by the MPTCP thread. Signal that thread to terminate,
3530 * after which the last refcnt will be released. That will
3531 * allow it to be destroyed below during the next round.
3532 */
3533 if (mp_so->so_usecount == 1) {
3534 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3535 "%s: mp_so 0x%llx scheduled for "
3536 "termination [u=%d,r=%d]\n", __func__,
3537 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3538 mp_so->so_usecount, mp_so->so_retaincnt),
3539 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3540
3541 /* signal MPTCP thread to terminate */
3542 mptcp_thread_terminate_signal(mpte);
3543 lck_mtx_unlock(&mpp->mpp_lock);
3544 active++;
3545 continue;
3546 }
3547
3548 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3549 "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
3550 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3551 mp_so->so_usecount, mp_so->so_retaincnt),
3552 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3553
3554 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
3555 struct sockbuf *, &mp_so->so_rcv,
3556 struct sockbuf *, &mp_so->so_snd,
3557 struct mppcb *, mpp);
3558
3559 mp_pcbdispose(mpp);
3560 }
3561
3562 return (active);
3563 }
3564
3565 /*
3566 * Drop a MPTCP connection, reporting the specified error.
3567 */
3568 struct mptses *
3569 mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
3570 {
3571 struct socket *mp_so;
3572
3573 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3574 MPT_LOCK_ASSERT_HELD(mp_tp);
3575 VERIFY(mpte->mpte_mptcb == mp_tp);
3576 mp_so = mpte->mpte_mppcb->mpp_socket;
3577
3578 mp_tp->mpt_state = MPTCPS_TERMINATE;
3579 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
3580 uint32_t, 0 /* event */);
3581
3582 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
3583 errno = mp_tp->mpt_softerror;
3584 mp_so->so_error = errno;
3585
3586 return (mptcp_close(mpte, mp_tp));
3587 }
3588
3589 /*
3590 * Close a MPTCP control block.
3591 */
3592 struct mptses *
3593 mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
3594 {
3595 struct socket *mp_so = NULL;
3596 struct mptsub *mpts = NULL, *tmpts = NULL;
3597
3598 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
3599 MPT_LOCK_ASSERT_HELD(mp_tp);
3600 VERIFY(mpte->mpte_mptcb == mp_tp);
3601 mp_so = mpte->mpte_mppcb->mpp_socket;
3602 if (mp_tp->mpt_localkey != NULL) {
3603 mptcp_free_key(mp_tp->mpt_localkey);
3604 mp_tp->mpt_localkey = NULL;
3605 }
3606
3607 MPT_UNLOCK(mp_tp);
3608 soisdisconnected(mp_so);
3609
3610 MPT_LOCK(mp_tp);
3611 if (mp_tp->mpt_flags & MPTCPF_PEEL_OFF) {
3612 return (NULL);
3613 }
3614 MPT_UNLOCK(mp_tp);
3615
3616 /* Clean up all subflows */
3617 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3618 MPTS_LOCK(mpts);
3619 mpts->mpts_flags |= MPTSF_USER_DISCONNECT;
3620 mptcp_subflow_disconnect(mpte, mpts, TRUE);
3621 MPTS_UNLOCK(mpts);
3622 mptcp_subflow_del(mpte, mpts, TRUE);
3623 }
3624 MPT_LOCK(mp_tp);
3625
3626 return (NULL);
3627 }
3628
3629 void
3630 mptcp_notify_close(struct socket *so)
3631 {
3632 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
3633 }
3634
3635 /*
3636 * Signal MPTCP thread to wake up.
3637 */
3638 void
3639 mptcp_thread_signal(struct mptses *mpte)
3640 {
3641 lck_mtx_lock(&mpte->mpte_thread_lock);
3642 mptcp_thread_signal_locked(mpte);
3643 lck_mtx_unlock(&mpte->mpte_thread_lock);
3644 }
3645
3646 /*
3647 * Signal MPTCP thread to wake up (locked version)
3648 */
3649 static void
3650 mptcp_thread_signal_locked(struct mptses *mpte)
3651 {
3652 lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3653
3654 mpte->mpte_thread_reqs++;
3655 if (!mpte->mpte_thread_active && mpte->mpte_thread != THREAD_NULL)
3656 wakeup_one((caddr_t)&mpte->mpte_thread);
3657 }
3658
3659 /*
3660 * Signal MPTCP thread to terminate.
3661 */
3662 static void
3663 mptcp_thread_terminate_signal(struct mptses *mpte)
3664 {
3665 lck_mtx_lock(&mpte->mpte_thread_lock);
3666 if (mpte->mpte_thread != THREAD_NULL) {
3667 mpte->mpte_thread = THREAD_NULL;
3668 mpte->mpte_thread_reqs++;
3669 if (!mpte->mpte_thread_active)
3670 wakeup_one((caddr_t)&mpte->mpte_thread);
3671 }
3672 lck_mtx_unlock(&mpte->mpte_thread_lock);
3673 }
3674
3675 /*
3676 * MPTCP thread workloop.
3677 */
3678 static void
3679 mptcp_thread_dowork(struct mptses *mpte)
3680 {
3681 struct socket *mp_so;
3682 struct mptsub *mpts, *tmpts;
3683 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
3684 uint64_t mpsofilt_hint_mask = 0;
3685
3686 MPTE_LOCK(mpte); /* same as MP socket lock */
3687 VERIFY(mpte->mpte_mppcb != NULL);
3688 mp_so = mpte->mpte_mppcb->mpp_socket;
3689 VERIFY(mp_so != NULL);
3690
3691 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3692 ev_ret_t ret;
3693
3694 MPTS_LOCK(mpts);
3695 MPTS_ADDREF_LOCKED(mpts); /* for us */
3696
3697 /* Update process ownership based on parent mptcp socket */
3698 mptcp_update_last_owner(mpts, mp_so);
3699
3700 mptcp_subflow_input(mpte, mpts);
3701
3702 mptcp_get_rtt_measurement(mpts, mpte);
3703
3704 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
3705
3706 if (mpts->mpts_flags & MPTSF_ACTIVE) {
3707 mptcplog((LOG_DEBUG, "MPTCP Socket: "
3708 "%s: cid %d \n", __func__,
3709 mpts->mpts_connid),
3710 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3711 (void) mptcp_subflow_output(mpte, mpts);
3712 }
3713
3714 /*
3715 * If MPTCP socket is closed, disconnect all subflows.
3716 * This will generate a disconnect event which will
3717 * be handled during the next iteration, causing a
3718 * non-zero error to be returned above.
3719 */
3720 if (mp_so->so_flags & SOF_PCBCLEARING)
3721 mptcp_subflow_disconnect(mpte, mpts, FALSE);
3722 MPTS_UNLOCK(mpts);
3723
3724 switch (ret) {
3725 case MPTS_EVRET_OK:
3726 /* nothing to do */
3727 break;
3728 case MPTS_EVRET_DELETE:
3729 mptcp_subflow_del(mpte, mpts, TRUE);
3730 break;
3731 case MPTS_EVRET_CONNECT_PENDING:
3732 connect_pending = TRUE;
3733 break;
3734 case MPTS_EVRET_DISCONNECT_FALLBACK:
3735 disconnect_fallback = TRUE;
3736 break;
3737 default:
3738 mptcplog((LOG_DEBUG,
3739 "MPTCP Socket: %s: mptcp_subflow_events "
3740 "returned invalid value: %d\n", __func__,
3741 ret),
3742 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
3743 break;
3744 }
3745 MPTS_REMREF(mpts); /* ours */
3746 }
3747
3748 if (mpsofilt_hint_mask) {
3749 soevent(mp_so, mpsofilt_hint_mask);
3750 }
3751
3752 if (!connect_pending && !disconnect_fallback) {
3753 MPTE_UNLOCK(mpte);
3754 return;
3755 }
3756
3757 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
3758 MPTS_LOCK(mpts);
3759 if (disconnect_fallback) {
3760 struct socket *so = NULL;
3761 struct inpcb *inp = NULL;
3762 struct tcpcb *tp = NULL;
3763
3764 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3765 MPTS_UNLOCK(mpts);
3766 continue;
3767 }
3768
3769 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3770
3771 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
3772 MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING)) {
3773 MPTS_UNLOCK(mpts);
3774 continue;
3775 }
3776
3777 if (mpts->mpts_flags & MPTSF_TFO_REQD)
3778 mptcp_drop_tfo_data(mpte, mpts);
3779
3780 so = mpts->mpts_socket;
3781
3782 /*
3783 * The MPTCP connection has degraded to a fallback
3784 * mode, so there is no point in keeping this subflow
3785 * regardless of its MPTCP-readiness state, unless it
3786 * is the primary one which we use for fallback. This
3787 * assumes that the subflow used for fallback is the
3788 * ACTIVE one.
3789 */
3790
3791 socket_lock(so, 1);
3792 inp = sotoinpcb(so);
3793 tp = intotcpcb(inp);
3794 tp->t_mpflags &=
3795 ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
3796 tp->t_mpflags |= TMPF_TCP_FALLBACK;
3797
3798 if (mpts->mpts_flags & MPTSF_ACTIVE) {
3799 socket_unlock(so, 1);
3800 MPTS_UNLOCK(mpts);
3801 continue;
3802 }
3803 tp->t_mpflags |= TMPF_RESET;
3804 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3805 socket_unlock(so, 1);
3806
3807 } else if (connect_pending) {
3808 /*
3809 * If delayed subflow start is set and cellular,
3810 * delay the connect till a retransmission timeout
3811 */
3812
3813 if ((mptcp_delayed_subf_start) &&
3814 (IFNET_IS_CELLULAR(mpts->mpts_outif))) {
3815 MPTS_UNLOCK(mpts);
3816 continue;
3817 }
3818
3819 /*
3820 * The MPTCP connection has progressed to a state
3821 * where it supports full multipath semantics; allow
3822 * additional joins to be attempted for all subflows
3823 * that are in the PENDING state.
3824 */
3825 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
3826 (void) mptcp_subflow_soconnectx(mpte, mpts);
3827 }
3828 }
3829 MPTS_UNLOCK(mpts);
3830 }
3831
3832 MPTE_UNLOCK(mpte);
3833 }
3834
3835 /*
3836 * MPTCP thread.
3837 */
3838 static void
3839 mptcp_thread_func(void *v, wait_result_t w)
3840 {
3841 #pragma unused(w)
3842 struct mptses *mpte = v;
3843 struct timespec *ts = NULL;
3844
3845 VERIFY(mpte != NULL);
3846
3847 lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3848
3849 for (;;) {
3850 lck_mtx_assert(&mpte->mpte_thread_lock, LCK_MTX_ASSERT_OWNED);
3851
3852 if (mpte->mpte_thread != THREAD_NULL) {
3853 (void) msleep(&mpte->mpte_thread,
3854 &mpte->mpte_thread_lock, (PZERO - 1) | PSPIN,
3855 __func__, ts);
3856 }
3857
3858 /* MPTCP socket is closed? */
3859 if (mpte->mpte_thread == THREAD_NULL) {
3860 lck_mtx_unlock(&mpte->mpte_thread_lock);
3861 /* callee will destroy thread lock */
3862 mptcp_thread_destroy(mpte);
3863 /* NOTREACHED */
3864 return;
3865 }
3866
3867 mpte->mpte_thread_active = 1;
3868 for (;;) {
3869 uint32_t reqs = mpte->mpte_thread_reqs;
3870
3871 lck_mtx_unlock(&mpte->mpte_thread_lock);
3872 mptcp_thread_dowork(mpte);
3873 lck_mtx_lock_spin(&mpte->mpte_thread_lock);
3874
3875 /* if there's no pending request, we're done */
3876 if (reqs == mpte->mpte_thread_reqs ||
3877 mpte->mpte_thread == THREAD_NULL)
3878 break;
3879 }
3880 mpte->mpte_thread_reqs = 0;
3881 mpte->mpte_thread_active = 0;
3882 }
3883 }
3884
3885 /*
3886 * Destroy a MTCP thread, to be called in the MPTCP thread context
3887 * upon receiving an indication to self-terminate. This routine
3888 * will not return, as the current thread is terminated at the end.
3889 */
3890 static void
3891 mptcp_thread_destroy(struct mptses *mpte)
3892 {
3893 struct socket *mp_so;
3894
3895 MPTE_LOCK(mpte); /* same as MP socket lock */
3896 VERIFY(mpte->mpte_thread == THREAD_NULL);
3897 VERIFY(mpte->mpte_mppcb != NULL);
3898
3899 mptcp_sesdestroy(mpte);
3900
3901 mp_so = mpte->mpte_mppcb->mpp_socket;
3902 VERIFY(mp_so != NULL);
3903 VERIFY(mp_so->so_usecount != 0);
3904 mp_so->so_usecount--; /* for thread */
3905 mpte->mpte_mppcb->mpp_flags |= MPP_DEFUNCT;
3906 MPTE_UNLOCK(mpte);
3907
3908 /* for the extra refcnt from kernel_thread_start() */
3909 thread_deallocate(current_thread());
3910 /* this is the end */
3911 thread_terminate(current_thread());
3912 /* NOTREACHED */
3913 }
3914
3915 /*
3916 * Protocol pr_lock callback.
3917 */
3918 int
3919 mptcp_lock(struct socket *mp_so, int refcount, void *lr)
3920 {
3921 struct mppcb *mpp = sotomppcb(mp_so);
3922 void *lr_saved;
3923
3924 if (lr == NULL)
3925 lr_saved = __builtin_return_address(0);
3926 else
3927 lr_saved = lr;
3928
3929 if (mpp == NULL) {
3930 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
3931 mp_so, lr_saved, solockhistory_nr(mp_so));
3932 /* NOTREACHED */
3933 }
3934 lck_mtx_lock(&mpp->mpp_lock);
3935
3936 if (mp_so->so_usecount < 0) {
3937 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
3938 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
3939 solockhistory_nr(mp_so));
3940 /* NOTREACHED */
3941 }
3942 if (refcount != 0)
3943 mp_so->so_usecount++;
3944 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
3945 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
3946
3947 return (0);
3948 }
3949
3950 /*
3951 * Protocol pr_unlock callback.
3952 */
3953 int
3954 mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
3955 {
3956 struct mppcb *mpp = sotomppcb(mp_so);
3957 void *lr_saved;
3958
3959 if (lr == NULL)
3960 lr_saved = __builtin_return_address(0);
3961 else
3962 lr_saved = lr;
3963
3964 if (mpp == NULL) {
3965 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
3966 mp_so, mp_so->so_usecount, lr_saved,
3967 solockhistory_nr(mp_so));
3968 /* NOTREACHED */
3969 }
3970 lck_mtx_assert(&mpp->mpp_lock, LCK_MTX_ASSERT_OWNED);
3971
3972 if (refcount != 0)
3973 mp_so->so_usecount--;
3974
3975 if (mp_so->so_usecount < 0) {
3976 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
3977 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
3978 /* NOTREACHED */
3979 }
3980 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
3981 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
3982 lck_mtx_unlock(&mpp->mpp_lock);
3983
3984 return (0);
3985 }
3986
3987 /*
3988 * Protocol pr_getlock callback.
3989 */
3990 lck_mtx_t *
3991 mptcp_getlock(struct socket *mp_so, int locktype)
3992 {
3993 #pragma unused(locktype)
3994 struct mppcb *mpp = sotomppcb(mp_so);
3995
3996 if (mpp == NULL) {
3997 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
3998 solockhistory_nr(mp_so));
3999 /* NOTREACHED */
4000 }
4001 if (mp_so->so_usecount < 0) {
4002 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4003 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4004 /* NOTREACHED */
4005 }
4006 return (&mpp->mpp_lock);
4007 }
4008
4009 /*
4010 * Key generation functions
4011 */
4012 static void
4013 mptcp_generate_unique_key(struct mptcp_key_entry *key_entry)
4014 {
4015 struct mptcp_key_entry *key_elm;
4016 try_again:
4017 read_random(&key_entry->mkey_value, sizeof (key_entry->mkey_value));
4018 if (key_entry->mkey_value == 0)
4019 goto try_again;
4020 mptcp_do_sha1(&key_entry->mkey_value, key_entry->mkey_digest,
4021 sizeof (key_entry->mkey_digest));
4022
4023 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4024 if (key_elm->mkey_value == key_entry->mkey_value) {
4025 goto try_again;
4026 }
4027 if (bcmp(key_elm->mkey_digest, key_entry->mkey_digest, 4) ==
4028 0) {
4029 goto try_again;
4030 }
4031 }
4032 }
4033
4034 static mptcp_key_t *
4035 mptcp_reserve_key(void)
4036 {
4037 struct mptcp_key_entry *key_elm;
4038 struct mptcp_key_entry *found_elm = NULL;
4039
4040 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4041 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4042 if (key_elm->mkey_flags == MKEYF_FREE) {
4043 key_elm->mkey_flags = MKEYF_INUSE;
4044 found_elm = key_elm;
4045 break;
4046 }
4047 }
4048 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4049
4050 if (found_elm) {
4051 return (&found_elm->mkey_value);
4052 }
4053
4054 key_elm = (struct mptcp_key_entry *)
4055 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
4056 key_elm->mkey_flags = MKEYF_INUSE;
4057
4058 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4059 mptcp_generate_unique_key(key_elm);
4060 LIST_INSERT_HEAD(&mptcp_keys_pool, key_elm, mkey_next);
4061 mptcp_keys_pool.mkph_count += 1;
4062 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4063 return (&key_elm->mkey_value);
4064 }
4065
4066 static caddr_t
4067 mptcp_get_stored_digest(mptcp_key_t *key)
4068 {
4069 struct mptcp_key_entry *key_holder;
4070 caddr_t digest = NULL;
4071
4072 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4073 key_holder = (struct mptcp_key_entry *)(void *)((caddr_t)key -
4074 offsetof(struct mptcp_key_entry, mkey_value));
4075 if (key_holder->mkey_flags != MKEYF_INUSE)
4076 panic_plain("%s", __func__);
4077 digest = &key_holder->mkey_digest[0];
4078 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4079 return (digest);
4080 }
4081
4082 void
4083 mptcp_free_key(mptcp_key_t *key)
4084 {
4085 struct mptcp_key_entry *key_holder;
4086 struct mptcp_key_entry *key_elm;
4087 int pt = RandomULong();
4088
4089 lck_mtx_lock(&mptcp_keys_pool.mkph_lock);
4090 key_holder = (struct mptcp_key_entry *)(void*)((caddr_t)key -
4091 offsetof(struct mptcp_key_entry, mkey_value));
4092 key_holder->mkey_flags = MKEYF_FREE;
4093
4094 LIST_REMOVE(key_holder, mkey_next);
4095 mptcp_keys_pool.mkph_count -= 1;
4096
4097 /* Free half the time */
4098 if (pt & 0x01) {
4099 zfree(mptcp_keys_pool.mkph_key_entry_zone, key_holder);
4100 } else {
4101 /* Insert it at random point to avoid early reuse */
4102 int i = 0;
4103 if (mptcp_keys_pool.mkph_count > 1) {
4104 pt = pt % (mptcp_keys_pool.mkph_count - 1);
4105 LIST_FOREACH(key_elm, &mptcp_keys_pool, mkey_next) {
4106 if (++i >= pt) {
4107 LIST_INSERT_AFTER(key_elm, key_holder,
4108 mkey_next);
4109 break;
4110 }
4111 }
4112 if (i < pt)
4113 panic("missed insertion");
4114 } else {
4115 LIST_INSERT_HEAD(&mptcp_keys_pool, key_holder,
4116 mkey_next);
4117 }
4118 mptcp_keys_pool.mkph_count += 1;
4119 }
4120 lck_mtx_unlock(&mptcp_keys_pool.mkph_lock);
4121 }
4122
4123 static void
4124 mptcp_key_pool_init(void)
4125 {
4126 int i;
4127 struct mptcp_key_entry *key_entry;
4128
4129 LIST_INIT(&mptcp_keys_pool);
4130 mptcp_keys_pool.mkph_count = 0;
4131
4132 mptcp_keys_pool.mkph_key_elm_sz = (vm_size_t)
4133 (sizeof (struct mptcp_key_entry));
4134 mptcp_keys_pool.mkph_key_entry_zone = zinit(
4135 mptcp_keys_pool.mkph_key_elm_sz,
4136 MPTCP_MX_KEY_ALLOCS * mptcp_keys_pool.mkph_key_elm_sz,
4137 MPTCP_MX_PREALLOC_ZONE_SZ, "mptkeys");
4138 if (mptcp_keys_pool.mkph_key_entry_zone == NULL) {
4139 panic("%s: unable to allocate MPTCP keys zone \n", __func__);
4140 /* NOTREACHED */
4141 }
4142 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_CALLERACCT, FALSE);
4143 zone_change(mptcp_keys_pool.mkph_key_entry_zone, Z_EXPAND, TRUE);
4144
4145 for (i = 0; i < MPTCP_KEY_PREALLOCS_MX; i++) {
4146 key_entry = (struct mptcp_key_entry *)
4147 zalloc(mptcp_keys_pool.mkph_key_entry_zone);
4148 key_entry->mkey_flags = MKEYF_FREE;
4149 mptcp_generate_unique_key(key_entry);
4150 LIST_INSERT_HEAD(&mptcp_keys_pool, key_entry, mkey_next);
4151 mptcp_keys_pool.mkph_count += 1;
4152 }
4153 lck_mtx_init(&mptcp_keys_pool.mkph_lock, mtcbinfo.mppi_lock_grp,
4154 mtcbinfo.mppi_lock_attr);
4155 }
4156
4157 /*
4158 * MPTCP Join support
4159 */
4160
4161 static void
4162 mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
4163 uint8_t addr_id)
4164 {
4165 struct tcpcb *tp = sototcpcb(so);
4166 struct mptcp_subf_auth_entry *sauth_entry;
4167 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4168
4169 MPT_LOCK_SPIN(mp_tp);
4170 tp->t_mptcb = mp_tp;
4171 /*
4172 * The address ID of the first flow is implicitly 0.
4173 */
4174 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4175 tp->t_local_aid = 0;
4176 } else {
4177 tp->t_local_aid = addr_id;
4178 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4179 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4180 }
4181 MPT_UNLOCK(mp_tp);
4182 sauth_entry = zalloc(mpt_subauth_zone);
4183 sauth_entry->msae_laddr_id = tp->t_local_aid;
4184 sauth_entry->msae_raddr_id = 0;
4185 sauth_entry->msae_raddr_rand = 0;
4186 try_again:
4187 sauth_entry->msae_laddr_rand = RandomULong();
4188 if (sauth_entry->msae_laddr_rand == 0)
4189 goto try_again;
4190 MPT_LOCK_SPIN(mp_tp);
4191 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
4192 MPT_UNLOCK(mp_tp);
4193 }
4194
4195 static void
4196 mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4197 {
4198 struct mptcp_subf_auth_entry *sauth_entry;
4199 struct tcpcb *tp = NULL;
4200 int found = 0;
4201
4202 socket_lock(so, 0);
4203 tp = sototcpcb(so);
4204 if (tp == NULL) {
4205 socket_unlock(so, 0);
4206 return;
4207 }
4208
4209 MPT_LOCK(mp_tp);
4210 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4211 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4212 found = 1;
4213 break;
4214 }
4215 }
4216 if (found) {
4217 LIST_REMOVE(sauth_entry, msae_next);
4218 }
4219 MPT_UNLOCK(mp_tp);
4220
4221 if (found)
4222 zfree(mpt_subauth_zone, sauth_entry);
4223
4224 tp->t_mptcb = NULL;
4225 socket_unlock(so, 0);
4226 }
4227
4228 void
4229 mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4230 u_int32_t *rrand)
4231 {
4232 struct mptcp_subf_auth_entry *sauth_entry;
4233 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4234
4235 MPT_LOCK(mp_tp);
4236 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4237 if (sauth_entry->msae_laddr_id == addr_id) {
4238 if (lrand)
4239 *lrand = sauth_entry->msae_laddr_rand;
4240 if (rrand)
4241 *rrand = sauth_entry->msae_raddr_rand;
4242 break;
4243 }
4244 }
4245 MPT_UNLOCK(mp_tp);
4246 }
4247
4248 void
4249 mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
4250 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
4251 {
4252 struct mptcp_subf_auth_entry *sauth_entry;
4253 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4254
4255 MPT_LOCK(mp_tp);
4256 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4257 if (sauth_entry->msae_laddr_id == laddr_id) {
4258 if ((sauth_entry->msae_raddr_id != 0) &&
4259 (sauth_entry->msae_raddr_id != raddr_id)) {
4260 mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
4261 " address ids %d %d \n", __func__, raddr_id,
4262 sauth_entry->msae_raddr_id),
4263 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4264 MPT_UNLOCK(mp_tp);
4265 return;
4266 }
4267 sauth_entry->msae_raddr_id = raddr_id;
4268 if ((sauth_entry->msae_raddr_rand != 0) &&
4269 (sauth_entry->msae_raddr_rand != raddr_rand)) {
4270 mptcplog((LOG_ERR, "MPTCP Socket: "
4271 "%s: dup SYN_ACK %d %d \n",
4272 __func__, raddr_rand,
4273 sauth_entry->msae_raddr_rand),
4274 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4275 MPT_UNLOCK(mp_tp);
4276 return;
4277 }
4278 sauth_entry->msae_raddr_rand = raddr_rand;
4279 MPT_UNLOCK(mp_tp);
4280 return;
4281 }
4282 }
4283 MPT_UNLOCK(mp_tp);
4284 }
4285
4286 /*
4287 * SHA1 support for MPTCP
4288 */
4289 static int
4290 mptcp_do_sha1(mptcp_key_t *key, char *sha_digest, int digest_len)
4291 {
4292 SHA1_CTX sha1ctxt;
4293 const unsigned char *sha1_base;
4294 int sha1_size;
4295
4296 if (digest_len != SHA1_RESULTLEN) {
4297 return (FALSE);
4298 }
4299
4300 sha1_base = (const unsigned char *) key;
4301 sha1_size = sizeof (mptcp_key_t);
4302 SHA1Init(&sha1ctxt);
4303 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4304 SHA1Final(sha_digest, &sha1ctxt);
4305 return (TRUE);
4306 }
4307
4308 void
4309 mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
4310 u_int32_t rand1, u_int32_t rand2, u_char *digest, int digest_len)
4311 {
4312 SHA1_CTX sha1ctxt;
4313 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4314 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4315 u_int32_t data[2];
4316 int i;
4317
4318 bzero(digest, digest_len);
4319
4320 /* Set up the Key for HMAC */
4321 key_ipad[0] = key1;
4322 key_ipad[1] = key2;
4323
4324 key_opad[0] = key1;
4325 key_opad[1] = key2;
4326
4327 /* Set up the message for HMAC */
4328 data[0] = rand1;
4329 data[1] = rand2;
4330
4331 /* Key is 512 block length, so no need to compute hash */
4332
4333 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4334
4335 for (i = 0; i < 8; i++) {
4336 key_ipad[i] ^= 0x3636363636363636;
4337 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4338 }
4339
4340 /* Perform inner SHA1 */
4341 SHA1Init(&sha1ctxt);
4342 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
4343 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
4344 SHA1Final(digest, &sha1ctxt);
4345
4346 /* Perform outer SHA1 */
4347 SHA1Init(&sha1ctxt);
4348 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
4349 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4350 SHA1Final(digest, &sha1ctxt);
4351 }
4352
4353 /*
4354 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4355 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4356 */
4357 void
4358 mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest,
4359 int digest_len)
4360 {
4361 uint32_t lrand, rrand;
4362 mptcp_key_t localkey, remotekey;
4363 MPT_LOCK_ASSERT_NOTHELD(mp_tp);
4364
4365 if (digest_len != SHA1_RESULTLEN)
4366 return;
4367
4368 lrand = rrand = 0;
4369 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
4370 MPT_LOCK_SPIN(mp_tp);
4371 localkey = *mp_tp->mpt_localkey;
4372 remotekey = mp_tp->mpt_remotekey;
4373 MPT_UNLOCK(mp_tp);
4374 mptcp_hmac_sha1(localkey, remotekey, lrand, rrand, digest,
4375 digest_len);
4376 }
4377
4378 u_int64_t
4379 mptcp_get_trunced_hmac(mptcp_addr_id aid, struct mptcb *mp_tp)
4380 {
4381 u_char digest[SHA1_RESULTLEN];
4382 u_int64_t trunced_digest;
4383
4384 mptcp_get_hmac(aid, mp_tp, &digest[0], sizeof (digest));
4385 bcopy(digest, &trunced_digest, 8);
4386 return (trunced_digest);
4387 }
4388
4389 /*
4390 * Authentication data generation
4391 */
4392 void
4393 mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4394 int token_len)
4395 {
4396 VERIFY(token_len == sizeof (u_int32_t));
4397 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4398
4399 /* Most significant 32 bits of the SHA1 hash */
4400 bcopy(sha_digest, token, sizeof (u_int32_t));
4401 return;
4402 }
4403
4404 void
4405 mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4406 int idsn_len)
4407 {
4408 VERIFY(idsn_len == sizeof (u_int64_t));
4409 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4410
4411 /*
4412 * Least significant 64 bits of the SHA1 hash
4413 */
4414
4415 idsn[7] = sha_digest[12];
4416 idsn[6] = sha_digest[13];
4417 idsn[5] = sha_digest[14];
4418 idsn[4] = sha_digest[15];
4419 idsn[3] = sha_digest[16];
4420 idsn[2] = sha_digest[17];
4421 idsn[1] = sha_digest[18];
4422 idsn[0] = sha_digest[19];
4423 return;
4424 }
4425
4426 static void
4427 mptcp_conn_properties(struct mptcb *mp_tp)
4428 {
4429 /* There is only Version 0 at this time */
4430 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
4431
4432 /* Set DSS checksum flag */
4433 if (mptcp_dss_csum)
4434 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
4435
4436 /* Set up receive window */
4437 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
4438
4439 /* Set up gc ticks */
4440 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
4441 }
4442
4443 static void
4444 mptcp_init_local_parms(struct mptcb *mp_tp)
4445 {
4446 caddr_t local_digest = NULL;
4447
4448 mp_tp->mpt_localkey = mptcp_reserve_key();
4449 local_digest = mptcp_get_stored_digest(mp_tp->mpt_localkey);
4450 mptcp_generate_token(local_digest, SHA1_RESULTLEN,
4451 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
4452 mptcp_generate_idsn(local_digest, SHA1_RESULTLEN,
4453 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
4454
4455 /* The subflow SYN is also first MPTCP byte */
4456 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
4457 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4458
4459 mptcp_conn_properties(mp_tp);
4460 }
4461
4462 int
4463 mptcp_init_remote_parms(struct mptcb *mp_tp)
4464 {
4465 char remote_digest[MPTCP_SHA1_RESULTLEN];
4466 MPT_LOCK_ASSERT_HELD(mp_tp);
4467
4468 /* Only Version 0 is supported for auth purposes */
4469 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
4470 return (-1);
4471
4472 /* Setup local and remote tokens and Initial DSNs */
4473
4474 if (!mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest,
4475 SHA1_RESULTLEN)) {
4476 mptcplog((LOG_ERR, "MPTCP Socket: %s: unexpected failure",
4477 __func__), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4478 return (-1);
4479 }
4480 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
4481 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
4482 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
4483 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
4484 mp_tp->mpt_rcvatmark = mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
4485
4486 return (0);
4487 }
4488
4489 /*
4490 * Helper Functions
4491 */
4492 mptcp_token_t
4493 mptcp_get_localtoken(void* mptcb_arg)
4494 {
4495 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4496 return (mp_tp->mpt_localtoken);
4497 }
4498
4499 mptcp_token_t
4500 mptcp_get_remotetoken(void* mptcb_arg)
4501 {
4502 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4503 return (mp_tp->mpt_remotetoken);
4504 }
4505
4506 u_int64_t
4507 mptcp_get_localkey(void* mptcb_arg)
4508 {
4509 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4510 if (mp_tp->mpt_localkey != NULL)
4511 return (*mp_tp->mpt_localkey);
4512 else
4513 return (0);
4514 }
4515
4516 u_int64_t
4517 mptcp_get_remotekey(void* mptcb_arg)
4518 {
4519 struct mptcb *mp_tp = (struct mptcb *)mptcb_arg;
4520 return (mp_tp->mpt_remotekey);
4521 }
4522
4523 void
4524 mptcp_send_dfin(struct socket *so)
4525 {
4526 struct tcpcb *tp = NULL;
4527 struct inpcb *inp = NULL;
4528
4529 inp = sotoinpcb(so);
4530 if (!inp)
4531 return;
4532
4533 tp = intotcpcb(inp);
4534 if (!tp)
4535 return;
4536
4537 if (!(tp->t_mpflags & TMPF_RESET))
4538 tp->t_mpflags |= TMPF_SEND_DFIN;
4539 }
4540
4541 /*
4542 * Data Sequence Mapping routines
4543 */
4544 void
4545 mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
4546 {
4547 struct mptcb *mp_tp;
4548
4549 if (m == NULL)
4550 return;
4551
4552 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
4553 MPT_LOCK(mp_tp);
4554 while (m) {
4555 VERIFY(m->m_flags & M_PKTHDR);
4556 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
4557 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
4558 m->m_pkthdr.mp_rlen = m_pktlen(m);
4559 mp_tp->mpt_sndmax += m_pktlen(m);
4560 m = m->m_next;
4561 }
4562 MPT_UNLOCK(mp_tp);
4563 }
4564
4565 void
4566 mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
4567 {
4568 u_int32_t sub_len = 0;
4569 int rewinding = 0;
4570
4571 if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
4572 /* TFO makes things complicated. */
4573 if (so->so_flags1 & SOF1_TFO_REWIND) {
4574 rewinding = 1;
4575 so->so_flags1 &= ~SOF1_TFO_REWIND;
4576 }
4577 }
4578
4579 while (m) {
4580 VERIFY(m->m_flags & M_PKTHDR);
4581
4582 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4583 sub_len = m->m_pkthdr.mp_rlen;
4584
4585 if (sub_len < len) {
4586 m->m_pkthdr.mp_dsn += sub_len;
4587 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4588 m->m_pkthdr.mp_rseq += sub_len;
4589 }
4590 m->m_pkthdr.mp_rlen = 0;
4591 len -= sub_len;
4592 } else {
4593 /* sub_len >= len */
4594 if (rewinding == 0)
4595 m->m_pkthdr.mp_dsn += len;
4596 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4597 if (rewinding == 0)
4598 m->m_pkthdr.mp_rseq += len;
4599 }
4600 mptcplog((LOG_DEBUG, "MPTCP Sender: "
4601 "%s: dsn 0x%llx ssn %u len %d %d\n",
4602 __func__,
4603 m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rseq,
4604 m->m_pkthdr.mp_rlen, len),
4605 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
4606 m->m_pkthdr.mp_rlen -= len;
4607 return;
4608 }
4609 } else {
4610 panic("%s: MPTCP tag not set", __func__);
4611 /* NOTREACHED */
4612 }
4613 m = m->m_next;
4614 }
4615 }
4616
4617 /* Obtain the DSN mapping stored in the mbuf */
4618 void
4619 mptcp_output_getm_dsnmap32(struct socket *so, int off, uint32_t datalen,
4620 u_int32_t *dsn, u_int32_t *relseq, u_int16_t *data_len, u_int64_t *dsn64p)
4621 {
4622 u_int64_t dsn64;
4623
4624 mptcp_output_getm_dsnmap64(so, off, datalen, &dsn64, relseq, data_len);
4625 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4626 *dsn64p = dsn64;
4627 }
4628
4629 void
4630 mptcp_output_getm_dsnmap64(struct socket *so, int off, uint32_t datalen,
4631 u_int64_t *dsn, u_int32_t *relseq, u_int16_t *data_len)
4632 {
4633 struct mbuf *m = so->so_snd.sb_mb;
4634 struct mbuf *mnext = NULL;
4635 uint32_t runlen = 0;
4636 u_int64_t dsn64;
4637 uint32_t contig_len = 0;
4638
4639 if (m == NULL)
4640 return;
4641
4642 if (off < 0)
4643 return;
4644 /*
4645 * In the subflow socket, the DSN sequencing can be discontiguous,
4646 * but the subflow sequence mapping is contiguous. Use the subflow
4647 * sequence property to find the right mbuf and corresponding dsn
4648 * mapping.
4649 */
4650
4651 while (m) {
4652 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4653 VERIFY(m->m_flags & M_PKTHDR);
4654
4655 if ((unsigned int)off >= m->m_pkthdr.mp_rlen) {
4656 off -= m->m_pkthdr.mp_rlen;
4657 m = m->m_next;
4658 } else {
4659 break;
4660 }
4661 }
4662
4663 if (m == NULL) {
4664 panic("%s: bad offset", __func__);
4665 /* NOTREACHED */
4666 }
4667
4668 dsn64 = m->m_pkthdr.mp_dsn + off;
4669 *dsn = dsn64;
4670 *relseq = m->m_pkthdr.mp_rseq + off;
4671
4672 /*
4673 * Now find the last contiguous byte and its length from
4674 * start.
4675 */
4676 runlen = m->m_pkthdr.mp_rlen - off;
4677 contig_len = runlen;
4678
4679 /* If datalen does not span multiple mbufs, return */
4680 if (datalen <= runlen) {
4681 *data_len = min(datalen, UINT16_MAX);
4682 return;
4683 }
4684
4685 mnext = m->m_next;
4686 while (datalen > runlen) {
4687 if (mnext == NULL) {
4688 panic("%s: bad datalen = %d, %d %d", __func__, datalen,
4689 runlen, off);
4690 /* NOTREACHED */
4691 }
4692 VERIFY(mnext->m_flags & M_PKTHDR);
4693 VERIFY(mnext->m_pkthdr.pkt_flags & PKTF_MPTCP);
4694
4695 /*
4696 * case A. contiguous DSN stream
4697 * case B. discontiguous DSN stream
4698 */
4699 if (mnext->m_pkthdr.mp_dsn == (dsn64 + runlen)) {
4700 /* case A */
4701 runlen += mnext->m_pkthdr.mp_rlen;
4702 contig_len += mnext->m_pkthdr.mp_rlen;
4703 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: contig \n",
4704 __func__), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
4705 } else {
4706 /* case B */
4707 mptcplog((LOG_DEBUG, "MPTCP Sender: "
4708 "%s: discontig datalen %d contig_len %d cc %d \n",
4709 __func__, datalen, contig_len, so->so_snd.sb_cc),
4710 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
4711 break;
4712 }
4713 mnext = mnext->m_next;
4714 }
4715 datalen = min(datalen, UINT16_MAX);
4716 *data_len = min(datalen, contig_len);
4717 mptcplog((LOG_DEBUG, "MPTCP Sender: "
4718 "%s: %llu %u %d %d \n", __func__,
4719 *dsn, *relseq, *data_len, off),
4720 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
4721 }
4722
4723 /*
4724 * MPTCP's notion of the next insequence Data Sequence number is adjusted
4725 * here. It must be called from mptcp_adj_rmap() which is called only after
4726 * reassembly of out of order data. The rcvnxt variable must
4727 * be updated only when atleast some insequence new data is received.
4728 */
4729 static void
4730 mptcp_adj_rcvnxt(struct tcpcb *tp, struct mbuf *m)
4731 {
4732 struct mptcb *mp_tp = tptomptp(tp);
4733
4734 if (mp_tp == NULL)
4735 return;
4736 MPT_LOCK(mp_tp);
4737 if ((MPTCP_SEQ_GEQ(mp_tp->mpt_rcvnxt, m->m_pkthdr.mp_dsn)) &&
4738 (MPTCP_SEQ_LEQ(mp_tp->mpt_rcvnxt, (m->m_pkthdr.mp_dsn +
4739 m->m_pkthdr.mp_rlen)))) {
4740 mp_tp->mpt_rcvnxt = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4741 }
4742 MPT_UNLOCK(mp_tp);
4743 }
4744
4745 /*
4746 * Note that this is called only from tcp_input() via mptcp_input_preproc()
4747 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
4748 * When it trims data tcp_input calls m_adj() which does not remove the
4749 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
4750 * The dsn map insertion cannot be delayed after trim, because data can be in
4751 * the reassembly queue for a while and the DSN option info in tp will be
4752 * overwritten for every new packet received.
4753 * The dsn map will be adjusted just prior to appending to subflow sockbuf
4754 * with mptcp_adj_rmap()
4755 */
4756 void
4757 mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m)
4758 {
4759 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
4760
4761 if (tp->t_mpflags & TMPF_EMBED_DSN) {
4762 VERIFY(m->m_flags & M_PKTHDR);
4763 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
4764 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
4765 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
4766 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
4767 tp->t_mpflags &= ~TMPF_EMBED_DSN;
4768 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
4769 }
4770 }
4771
4772 int
4773 mptcp_adj_rmap(struct socket *so, struct mbuf *m)
4774 {
4775 u_int64_t dsn;
4776 u_int32_t sseq, datalen;
4777 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
4778 u_int32_t old_rcvnxt = 0;
4779
4780 if (m_pktlen(m) == 0)
4781 return 0;
4782
4783 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
4784 VERIFY(m->m_flags & M_PKTHDR);
4785
4786 dsn = m->m_pkthdr.mp_dsn;
4787 sseq = m->m_pkthdr.mp_rseq + tp->irs;
4788 datalen = m->m_pkthdr.mp_rlen;
4789 } else {
4790 /* data arrived without an DSS option mapping */
4791
4792 /* initial subflow can fallback right after SYN handshake */
4793 mptcp_notify_mpfail(so);
4794 return 0;
4795 }
4796
4797 /* In the common case, data is in window and in sequence */
4798 if (m->m_pkthdr.len == (int)datalen) {
4799 mptcp_adj_rcvnxt(tp, m);
4800 return 0;
4801 }
4802
4803 old_rcvnxt = tp->rcv_nxt - m->m_pkthdr.len;
4804 if (SEQ_GT(old_rcvnxt, sseq)) {
4805 /* data trimmed from the left */
4806 int off = old_rcvnxt - sseq;
4807 m->m_pkthdr.mp_dsn += off;
4808 m->m_pkthdr.mp_rseq += off;
4809 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
4810 } else if (old_rcvnxt == sseq) {
4811 /*
4812 * data was trimmed from the right
4813 */
4814 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
4815 } else {
4816 mptcp_notify_mpfail(so);
4817 return (-1);
4818 }
4819 mptcp_adj_rcvnxt(tp, m);
4820 return 0;
4821 }
4822
4823 /*
4824 * Following routines help with failure detection and failover of data
4825 * transfer from one subflow to another.
4826 */
4827 void
4828 mptcp_act_on_txfail(struct socket *so)
4829 {
4830 struct tcpcb *tp = NULL;
4831 struct inpcb *inp = sotoinpcb(so);
4832
4833 if (inp == NULL)
4834 return;
4835
4836 tp = intotcpcb(inp);
4837 if (tp == NULL)
4838 return;
4839
4840 if (so->so_flags & SOF_MP_TRYFAILOVER) {
4841 return;
4842 }
4843
4844 so->so_flags |= SOF_MP_TRYFAILOVER;
4845 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
4846 }
4847
4848 /*
4849 * Support for MP_FAIL option
4850 */
4851 int
4852 mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
4853 {
4854 struct mbuf *m = so->so_snd.sb_mb;
4855 u_int64_t dsn;
4856 int off = 0;
4857 u_int32_t datalen;
4858
4859 if (m == NULL)
4860 return (-1);
4861
4862 while (m != NULL) {
4863 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4864 VERIFY(m->m_flags & M_PKTHDR);
4865 dsn = m->m_pkthdr.mp_dsn;
4866 datalen = m->m_pkthdr.mp_rlen;
4867 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
4868 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
4869 off = dsn_fail - dsn;
4870 *tcp_seq = m->m_pkthdr.mp_rseq + off;
4871 mptcplog((LOG_DEBUG, "MPTCP Sender: %s: %llu %llu \n",
4872 __func__, dsn, dsn_fail),
4873 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
4874 return (0);
4875 }
4876
4877 m = m->m_next;
4878 }
4879
4880 /*
4881 * If there was no mbuf data and a fallback to TCP occurred, there's
4882 * not much else to do.
4883 */
4884
4885 mptcplog((LOG_ERR, "MPTCP Sender: "
4886 "%s: %llu not found \n", __func__, dsn_fail),
4887 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
4888 return (-1);
4889 }
4890
4891 /*
4892 * Support for sending contiguous MPTCP bytes in subflow
4893 * Also for preventing sending data with ACK in 3-way handshake
4894 */
4895 int32_t
4896 mptcp_adj_sendlen(struct socket *so, int32_t off, int32_t len)
4897 {
4898 u_int64_t mdss_dsn = 0;
4899 u_int32_t mdss_subflow_seq = 0;
4900 u_int16_t mdss_data_len = 0;
4901
4902 if (len == 0)
4903 return (len);
4904
4905 mptcp_output_getm_dsnmap64(so, off, (u_int32_t)len,
4906 &mdss_dsn, &mdss_subflow_seq, &mdss_data_len);
4907
4908 /*
4909 * Special case handling for Fast Join. We want to send data right
4910 * after ACK of the 3-way handshake, but not piggyback the data
4911 * with the 3rd ACK of the 3WHS. TMPF_FASTJOINBY2_SEND and
4912 * mdss_data_len control this.
4913 */
4914 struct tcpcb *tp = NULL;
4915 tp = intotcpcb(sotoinpcb(so));
4916 if ((tp->t_mpflags & TMPF_JOINED_FLOW) &&
4917 (tp->t_mpflags & TMPF_PREESTABLISHED) &&
4918 (!(tp->t_mpflags & TMPF_RECVD_JOIN)) &&
4919 (tp->t_mpflags & TMPF_SENT_JOIN) &&
4920 (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
4921 (!(tp->t_mpflags & TMPF_FASTJOINBY2_SEND))) {
4922 mdss_data_len = 0;
4923 tp->t_mpflags |= TMPF_FASTJOINBY2_SEND;
4924 }
4925
4926 if ((tp->t_state > TCPS_SYN_SENT) &&
4927 (tp->t_mpflags & TMPF_TFO_REQUEST)) {
4928 mdss_data_len = 0;
4929 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4930 }
4931 return (mdss_data_len);
4932 }
4933
4934 int32_t
4935 mptcp_sbspace(struct mptcb *mpt)
4936 {
4937 struct sockbuf *sb;
4938 uint32_t rcvbuf;
4939 int32_t space;
4940
4941 MPT_LOCK_ASSERT_HELD(mpt);
4942 MPTE_LOCK_ASSERT_HELD(mpt->mpt_mpte);
4943
4944 sb = &mpt->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
4945 rcvbuf = sb->sb_hiwat;
4946 space = ((int32_t)imin((rcvbuf - sb->sb_cc),
4947 (sb->sb_mbmax - sb->sb_mbcnt)));
4948 if (space < 0)
4949 space = 0;
4950 /* XXX check if it's too small? */
4951
4952 return (space);
4953 }
4954
4955 /*
4956 * Support Fallback to Regular TCP
4957 */
4958 void
4959 mptcp_notify_mpready(struct socket *so)
4960 {
4961 struct tcpcb *tp = NULL;
4962
4963 if (so == NULL)
4964 return;
4965
4966 tp = intotcpcb(sotoinpcb(so));
4967
4968 if (tp == NULL)
4969 return;
4970
4971 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
4972 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
4973 struct tcpcb *, tp);
4974
4975 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
4976 return;
4977
4978 if (tp->t_mpflags & TMPF_MPTCP_READY)
4979 return;
4980
4981 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
4982 tp->t_mpflags |= TMPF_MPTCP_READY;
4983
4984 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
4985 }
4986
4987 void
4988 mptcp_notify_mpfail(struct socket *so)
4989 {
4990 struct tcpcb *tp = NULL;
4991
4992 if (so == NULL)
4993 return;
4994
4995 tp = intotcpcb(sotoinpcb(so));
4996
4997 if (tp == NULL)
4998 return;
4999
5000 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5001 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5002 struct tcpcb *, tp);
5003
5004 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
5005 return;
5006
5007 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
5008 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5009
5010 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5011 }
5012
5013 /*
5014 * Keepalive helper function
5015 */
5016 boolean_t
5017 mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5018 {
5019 boolean_t ret = 1;
5020 VERIFY(mp_tp != NULL);
5021 MPT_LOCK(mp_tp);
5022 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5023 ret = 0;
5024 }
5025 MPT_UNLOCK(mp_tp);
5026 return (ret);
5027 }
5028
5029 /*
5030 * MPTCP t_maxseg adjustment function
5031 */
5032 int
5033 mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5034 {
5035 int mss_lower = 0;
5036 struct mptcb *mp_tp = tptomptp(tp);
5037
5038 #define MPTCP_COMPUTE_LEN { \
5039 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5040 MPT_LOCK(mp_tp); \
5041 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5042 mss_lower += 2; \
5043 else \
5044 /* adjust to 32-bit boundary + EOL */ \
5045 mss_lower += 2; \
5046 MPT_UNLOCK(mp_tp); \
5047 }
5048 if (mp_tp == NULL)
5049 return (0);
5050
5051 /*
5052 * For the first subflow and subsequent subflows, adjust mss for
5053 * most common MPTCP option size, for case where tcp_mss is called
5054 * during option processing and MTU discovery.
5055 */
5056 if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
5057 (!(tp->t_mpflags & TMPF_JOINED_FLOW))) {
5058 MPTCP_COMPUTE_LEN;
5059 }
5060
5061 if ((tp->t_mpflags & TMPF_PREESTABLISHED) &&
5062 (tp->t_mpflags & TMPF_SENT_JOIN)) {
5063 MPTCP_COMPUTE_LEN;
5064 }
5065
5066 if ((mtudisc) && (tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5067 MPTCP_COMPUTE_LEN;
5068 }
5069
5070 return (mss_lower);
5071 }
5072
5073 /*
5074 * Update the pid, upid, uuid of the subflow so, based on parent so
5075 */
5076 void
5077 mptcp_update_last_owner(struct mptsub *mpts, struct socket *parent_mpso)
5078 {
5079 struct socket *subflow_so = mpts->mpts_socket;
5080
5081 MPTS_LOCK_ASSERT_HELD(mpts);
5082
5083 socket_lock(subflow_so, 0);
5084 if ((subflow_so->last_pid != parent_mpso->last_pid) ||
5085 (subflow_so->last_upid != parent_mpso->last_upid)) {
5086 subflow_so->last_upid = parent_mpso->last_upid;
5087 subflow_so->last_pid = parent_mpso->last_pid;
5088 uuid_copy(subflow_so->last_uuid, parent_mpso->last_uuid);
5089 }
5090 so_update_policy(subflow_so);
5091 socket_unlock(subflow_so, 0);
5092 }
5093
5094 static void
5095 fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5096 {
5097 struct inpcb *inp;
5098
5099 tcp_getconninfo(so, &flow->flow_ci);
5100 inp = sotoinpcb(so);
5101 #if INET6
5102 if ((inp->inp_vflag & INP_IPV6) != 0) {
5103 flow->flow_src.ss_family = AF_INET6;
5104 flow->flow_dst.ss_family = AF_INET6;
5105 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5106 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5107 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5108 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5109 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5110 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
5111 } else
5112 #endif
5113 if ((inp->inp_vflag & INP_IPV4) != 0) {
5114 flow->flow_src.ss_family = AF_INET;
5115 flow->flow_dst.ss_family = AF_INET;
5116 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5117 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5118 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5119 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5120 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5121 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5122 }
5123 flow->flow_len = sizeof(*flow);
5124 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
5125 flow->flow_flags = mpts->mpts_flags;
5126 flow->flow_cid = mpts->mpts_connid;
5127 flow->flow_sndnxt = mpts->mpts_sndnxt;
5128 flow->flow_relseq = mpts->mpts_rel_seq;
5129 flow->flow_soerror = mpts->mpts_soerror;
5130 flow->flow_probecnt = mpts->mpts_probecnt;
5131 flow->flow_peerswitch = mpts->mpts_peerswitch;
5132 }
5133
5134 static int
5135 mptcp_pcblist SYSCTL_HANDLER_ARGS
5136 {
5137 #pragma unused(oidp, arg1, arg2)
5138 int error = 0, f;
5139 size_t n, len;
5140 struct mppcb *mpp;
5141 struct mptses *mpte;
5142 struct mptcb *mp_tp;
5143 struct mptsub *mpts;
5144 struct socket *so;
5145 conninfo_mptcp_t mptcpci;
5146 mptcp_flow_t *flows = NULL;
5147
5148 if (req->newptr != USER_ADDR_NULL)
5149 return (EPERM);
5150
5151 lck_mtx_lock(&mtcbinfo.mppi_lock);
5152 n = mtcbinfo.mppi_count;
5153 if (req->oldptr == USER_ADDR_NULL) {
5154 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5155 req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
5156 4 * (n + n/8) * sizeof(mptcp_flow_t);
5157 return (0);
5158 }
5159 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5160 flows = NULL;
5161 lck_mtx_lock(&mpp->mpp_lock);
5162 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
5163 if (mpp->mpp_flags & MPP_DEFUNCT) {
5164 lck_mtx_unlock(&mpp->mpp_lock);
5165 continue;
5166 }
5167 mpte = mptompte(mpp);
5168 VERIFY(mpte != NULL);
5169 mp_tp = mpte->mpte_mptcb;
5170 VERIFY(mp_tp != NULL);
5171
5172 bzero(&mptcpci, sizeof(mptcpci));
5173 MPT_LOCK(mp_tp);
5174 mptcpci.mptcpci_state = mp_tp->mpt_state;
5175 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5176 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5177 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5178 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5179 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5180 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5181 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5182 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5183 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5184 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5185 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvatmark;
5186 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5187 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
5188 MPT_UNLOCK(mp_tp);
5189
5190 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
5191 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5192 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5193 mptcpci.mptcpci_flow_offset =
5194 offsetof(conninfo_mptcp_t, mptcpci_flows);
5195
5196 len = sizeof(*flows) * mpte->mpte_numflows;
5197 if (mpte->mpte_numflows != 0) {
5198 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
5199 if (flows == NULL) {
5200 lck_mtx_unlock(&mpp->mpp_lock);
5201 break;
5202 }
5203 mptcpci.mptcpci_len = sizeof(mptcpci) +
5204 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
5205 error = SYSCTL_OUT(req, &mptcpci,
5206 sizeof(mptcpci) - sizeof(mptcp_flow_t));
5207 } else {
5208 mptcpci.mptcpci_len = sizeof(mptcpci);
5209 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
5210 }
5211 if (error) {
5212 lck_mtx_unlock(&mpp->mpp_lock);
5213 FREE(flows, M_TEMP);
5214 break;
5215 }
5216 f = 0;
5217 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5218 MPTS_LOCK(mpts);
5219 so = mpts->mpts_socket;
5220 socket_lock(so, 0);
5221 fill_mptcp_subflow(so, &flows[f], mpts);
5222 socket_unlock(so, 0);
5223 MPTS_UNLOCK(mpts);
5224 f++;
5225 }
5226 lck_mtx_unlock(&mpp->mpp_lock);
5227 if (flows) {
5228 error = SYSCTL_OUT(req, flows, len);
5229 FREE(flows, M_TEMP);
5230 if (error)
5231 break;
5232 }
5233 }
5234 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5235
5236 return (error);
5237 }
5238
5239 SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
5240 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
5241 "List of active MPTCP connections");
5242
5243 /*
5244 * Check the health of the other subflows and do an mptcp_output if
5245 * there is no other active or functional subflow at the time of
5246 * call of this function.
5247 */
5248 static void
5249 mptcp_output_needed(struct mptses *mpte, struct mptsub *to_mpts)
5250 {
5251 struct mptsub *from_mpts = NULL;
5252
5253 MPTE_LOCK_ASSERT_HELD(mpte);
5254
5255 MPTS_UNLOCK(to_mpts);
5256
5257 from_mpts = mpte->mpte_active_sub;
5258
5259 if (from_mpts == NULL)
5260 goto output_needed;
5261
5262 MPTS_LOCK(from_mpts);
5263
5264 if ((from_mpts->mpts_flags & MPTSF_DISCONNECTED) ||
5265 (from_mpts->mpts_flags & MPTSF_DISCONNECTING)) {
5266 MPTS_UNLOCK(from_mpts);
5267 goto output_needed;
5268 }
5269
5270 MPTS_UNLOCK(from_mpts);
5271 MPTS_LOCK(to_mpts);
5272 return;
5273
5274 output_needed:
5275 mptcp_output(mpte);
5276 MPTS_LOCK(to_mpts);
5277 }
5278
5279 /*
5280 * Set notsent lowat mark on the MPTCB
5281 */
5282 int
5283 mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
5284 {
5285 struct mptcb *mp_tp = NULL;
5286 int error = 0;
5287
5288 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5289 mp_tp = mpte->mpte_mptcb;
5290
5291 if (mp_tp)
5292 mp_tp->mpt_notsent_lowat = optval;
5293 else
5294 error = EINVAL;
5295
5296 return error;
5297 }
5298
5299 u_int32_t
5300 mptcp_get_notsent_lowat(struct mptses *mpte)
5301 {
5302 struct mptcb *mp_tp = NULL;
5303
5304 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
5305 mp_tp = mpte->mpte_mptcb;
5306
5307 if (mp_tp)
5308 return mp_tp->mpt_notsent_lowat;
5309 else
5310 return 0;
5311 }
5312
5313 int
5314 mptcp_notsent_lowat_check(struct socket *so) {
5315 struct mptses *mpte;
5316 struct mppcb *mpp;
5317 struct mptcb *mp_tp;
5318 struct mptsub *mpts;
5319
5320 int notsent = 0;
5321
5322 mpp = sotomppcb(so);
5323 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
5324 return (0);
5325 }
5326
5327 mpte = mptompte(mpp);
5328 mp_tp = mpte->mpte_mptcb;
5329
5330 MPT_LOCK(mp_tp);
5331 notsent = so->so_snd.sb_cc;
5332
5333 if ((notsent == 0) ||
5334 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
5335 mp_tp->mpt_notsent_lowat)) {
5336 mptcplog((LOG_DEBUG, "MPTCP Sender: "
5337 "lowat %d notsent %d actual %d \n",
5338 mp_tp->mpt_notsent_lowat, notsent,
5339 notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
5340 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
5341 MPT_UNLOCK(mp_tp);
5342 return (1);
5343 }
5344 MPT_UNLOCK(mp_tp);
5345
5346 /* When Nagle's algorithm is not disabled, it is better
5347 * to wakeup the client even before there is atleast one
5348 * maxseg of data to write.
5349 */
5350 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5351 int retval = 0;
5352 MPTS_LOCK(mpts);
5353 if (mpts->mpts_flags & MPTSF_ACTIVE) {
5354 struct socket *subf_so = mpts->mpts_socket;
5355 socket_lock(subf_so, 0);
5356 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
5357
5358 notsent = so->so_snd.sb_cc -
5359 (tp->snd_nxt - tp->snd_una);
5360
5361 if ((tp->t_flags & TF_NODELAY) == 0 &&
5362 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
5363 retval = 1;
5364 }
5365 mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
5366 " nodelay false \n",
5367 mp_tp->mpt_notsent_lowat, notsent),
5368 MPTCP_SENDER_DBG , MPTCP_LOGLVL_VERBOSE);
5369 socket_unlock(subf_so, 0);
5370 MPTS_UNLOCK(mpts);
5371 return (retval);
5372 }
5373 MPTS_UNLOCK(mpts);
5374 }
5375 return (0);
5376 }
5377
5378 static void
5379 mptcp_get_rtt_measurement(struct mptsub *mpts, struct mptses *mpte)
5380 {
5381 MPTE_LOCK_ASSERT_HELD(mpte);
5382 MPTS_LOCK_ASSERT_HELD(mpts);
5383
5384 struct socket *subflow_so = mpts->mpts_socket;
5385 socket_lock(subflow_so, 0);
5386 mpts->mpts_srtt = (intotcpcb(sotoinpcb(subflow_so)))->t_srtt;
5387 mpts->mpts_rxtcur = (intotcpcb(sotoinpcb(subflow_so)))->t_rxtcur;
5388 socket_unlock(subflow_so, 0);
5389 }
5390
5391 /* Using Symptoms Advisory to detect poor WiFi or poor Cell */
5392 static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
5393 static uint32_t mptcp_kern_skt_inuse = 0;
5394 symptoms_advisory_t mptcp_advisory;
5395
5396 static errno_t
5397 mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
5398 void **unitinfo)
5399 {
5400 #pragma unused(kctlref, sac, unitinfo)
5401 /*
5402 * We don't need to do anything here. But we can atleast ensure
5403 * only one user opens the MPTCP_KERN_CTL_NAME control socket.
5404 */
5405 if (OSCompareAndSwap(0, 1, &mptcp_kern_skt_inuse))
5406 return (0);
5407 else
5408 return (EALREADY);
5409 }
5410
5411 static errno_t
5412 mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
5413 void *unitinfo)
5414 {
5415 #pragma unused(kctlref, kcunit, unitinfo)
5416 if (OSCompareAndSwap(1, 0, &mptcp_kern_skt_inuse)) {
5417 /* TBD needs to be locked if the size grows more than an int */
5418 bzero(&mptcp_advisory, sizeof(mptcp_advisory));
5419 return (0);
5420 }
5421 else {
5422 return (EINVAL);
5423 }
5424 }
5425
5426 static errno_t
5427 mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
5428 mbuf_t m, int flags)
5429 {
5430 #pragma unused(kctlref, kcunit, unitinfo, flags)
5431 symptoms_advisory_t *sa = NULL;
5432
5433 if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
5434 mbuf_freem(m);
5435 return (EINVAL);
5436 }
5437
5438 if (mbuf_len(m) >= sizeof(*sa))
5439 sa = mbuf_data(m);
5440 else
5441 return (EINVAL);
5442
5443 if (mptcp_advisory.sa_nwk_status_int != sa->sa_nwk_status_int) {
5444 /*
5445 * we could use this notification to notify all mptcp pcbs
5446 * of the change in network status. But its difficult to
5447 * define if sending REMOVE_ADDR or MP_PRIO is appropriate
5448 * given that these are only soft indicators of the network
5449 * state. Leaving this as TBD for now.
5450 */
5451 }
5452
5453 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT) {
5454 mptcplog((LOG_DEBUG, "MPTCP Events: %s wifi %d,%d cell %d,%d\n",
5455 __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
5456 sa->sa_cell_status, mptcp_advisory.sa_cell_status),
5457 MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG,
5458 MPTCP_LOGLVL_LOG);
5459
5460 if ((sa->sa_wifi_status &
5461 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
5462 (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) {
5463 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
5464 }
5465
5466 if ((sa->sa_cell_status &
5467 (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) !=
5468 (SYMPTOMS_ADVISORY_CELL_BAD | SYMPTOMS_ADVISORY_CELL_OK)) {
5469 mptcp_advisory.sa_cell_status = sa->sa_cell_status;
5470 }
5471 } else {
5472 mptcplog((LOG_DEBUG, "MPTCP Events: %s NOCOMMENT "
5473 "wifi %d cell %d\n", __func__,
5474 mptcp_advisory.sa_wifi_status,
5475 mptcp_advisory.sa_cell_status),
5476 MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
5477 }
5478 return (0);
5479 }
5480
5481 void
5482 mptcp_control_register(void)
5483 {
5484 /* Set up the advisory control socket */
5485 struct kern_ctl_reg mptcp_kern_ctl;
5486
5487 bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
5488 strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
5489 sizeof(mptcp_kern_ctl.ctl_name));
5490 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
5491 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
5492 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
5493 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
5494
5495 (void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
5496 }
5497
5498 int
5499 mptcp_is_wifi_unusable(void)
5500 {
5501 /* a false return val indicates there is no info or wifi is ok */
5502 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD);
5503 }
5504
5505 int
5506 mptcp_is_cell_unusable(void)
5507 {
5508 /* a false return val indicates there is no info or cell is ok */
5509 return (mptcp_advisory.sa_cell_status & SYMPTOMS_ADVISORY_CELL_BAD);
5510 }
5511
5512 struct mptsub*
5513 mptcp_use_symptoms_hints(struct mptsub* best, struct mptsub *second_best)
5514 {
5515 struct mptsub *cellsub = NULL;
5516 struct mptsub *wifisub = NULL;
5517 struct mptsub *wiredsub = NULL;
5518
5519 VERIFY ((best != NULL) && (second_best != NULL));
5520
5521 if (!mptcp_use_symptomsd)
5522 return (NULL);
5523
5524 if (!mptcp_kern_skt_inuse)
5525 return (NULL);
5526
5527 /*
5528 * There could be devices with more than one wifi interface or
5529 * more than one wired or cell interfaces.
5530 * TBD: SymptomsD is unavailable on such platforms as of now.
5531 * Try to prefer best when possible in general.
5532 * Also, SymptomsD sends notifications about wifi only when it
5533 * is primary.
5534 */
5535 if (best->mpts_linktype & MPTSL_WIFI)
5536 wifisub = best;
5537 else if (best->mpts_linktype & MPTSL_CELL)
5538 cellsub = best;
5539 else if (best->mpts_linktype & MPTSL_WIRED)
5540 wiredsub = best;
5541
5542 /*
5543 * On platforms with wired paths, don't use hints about wifi or cell.
5544 * Currently, SymptomsD is not available on platforms with wired paths.
5545 */
5546 if (wiredsub)
5547 return (NULL);
5548
5549 if ((wifisub == NULL) && (second_best->mpts_linktype & MPTSL_WIFI))
5550 wifisub = second_best;
5551
5552 if ((cellsub == NULL) && (second_best->mpts_linktype & MPTSL_CELL))
5553 cellsub = second_best;
5554
5555 if ((wiredsub == NULL) && (second_best->mpts_linktype & MPTSL_WIRED))
5556 wiredsub = second_best;
5557
5558 if ((wifisub == best) && mptcp_is_wifi_unusable()) {
5559 tcpstat.tcps_mp_sel_symtomsd++;
5560 if (mptcp_is_cell_unusable()) {
5561 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5562 " suggests both Wifi and Cell are bad. Wired %s.",
5563 (wiredsub == NULL) ? "none" : "present"),
5564 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5565 return (wiredsub);
5566 } else {
5567 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5568 " suggests Wifi bad, Cell good. Wired %s.",
5569 (wiredsub == NULL) ? "none" : "present"),
5570 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5571 return ((wiredsub != NULL) ? wiredsub : cellsub);
5572 }
5573 }
5574
5575 if ((cellsub == best) && (mptcp_is_cell_unusable())) {
5576 tcpstat.tcps_mp_sel_symtomsd++;
5577 if (mptcp_is_wifi_unusable()) {
5578 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5579 " suggests both Cell and Wifi are bad. Wired %s.",
5580 (wiredsub == NULL) ? "none" : "present"),
5581 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5582 return (wiredsub);
5583 } else {
5584 mptcplog((LOG_DEBUG, "MPTCP Sender: SymptomsD hint"
5585 " suggests Cell bad, Wifi good. Wired %s.",
5586 (wiredsub == NULL) ? "none" : "present"),
5587 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5588 return ((wiredsub != NULL) ? wiredsub : wifisub);
5589 }
5590 }
5591
5592 /* little is known about the state of the network or wifi is good */
5593 return (NULL);
5594 }
5595
5596 /* If TFO data is succesfully acked, it must be dropped from the mptcp so */
5597 static void
5598 mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
5599 {
5600 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5601 struct socket *so = mpts->mpts_socket;
5602 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
5603 struct mptcb *mp_tp = mpte->mpte_mptcb;
5604
5605 /* If data was sent with SYN, rewind state */
5606 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
5607 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
5608 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
5609 MPT_LOCK(mp_tp);
5610 u_int64_t mp_droplen = mpts->mpts_sndnxt - mp_tp->mpt_snduna;
5611 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
5612 VERIFY(mp_droplen <= (UINT_MAX));
5613 VERIFY(mp_droplen >= tcp_droplen);
5614
5615 if (mp_droplen > tcp_droplen) {
5616 /* handle partial TCP ack */
5617 mp_so->so_flags1 |= SOF1_TFO_REWIND;
5618 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
5619 mpts->mpts_sndnxt = mp_tp->mpt_sndnxt;
5620 mp_droplen = tcp_droplen;
5621 } else {
5622 /* all data on SYN was acked */
5623 mpts->mpts_rel_seq = 1;
5624 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5625 mpts->mpts_sndnxt = mp_tp->mpt_snduna;
5626 }
5627 mp_tp->mpt_sndmax -= tcp_droplen;
5628
5629 MPT_UNLOCK(mp_tp);
5630 if (mp_droplen != 0) {
5631 VERIFY(mp_so->so_snd.sb_mb != NULL);
5632 sbdrop(&mp_so->so_snd, (int)mp_droplen);
5633 }
5634 mptcplog((LOG_ERR, "MPTCP Sender: %s mp_so 0x%llx cid %d "
5635 "TFO tcp len %d mptcp len %d\n", __func__,
5636 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpts->mpts_connid,
5637 tcp_droplen, mp_droplen),
5638 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5639 }
5640 }
5641