]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/mptcp_usrreq.c
8749d1a0a40e5e9fa195f57e42970ac88d2310d6
[apple/xnu.git] / bsd / netinet / mptcp_usrreq.c
1 /*
2 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/socket.h>
33 #include <sys/socketvar.h>
34 #include <sys/protosw.h>
35 #include <sys/mcache.h>
36 #include <sys/syslog.h>
37 #include <sys/proc.h>
38 #include <sys/proc_internal.h>
39 #include <sys/resourcevar.h>
40 #include <sys/kauth.h>
41 #include <sys/priv.h>
42
43 #include <net/if.h>
44 #include <netinet/in.h>
45 #include <netinet/in_var.h>
46 #include <netinet/tcp.h>
47 #include <netinet/tcp_fsm.h>
48 #include <netinet/tcp_seq.h>
49 #include <netinet/tcp_var.h>
50 #include <netinet/tcp_timer.h>
51 #include <netinet/mptcp_var.h>
52 #include <netinet/mptcp_timer.h>
53
54 #include <mach/sdt.h>
55
56 static int mptcp_usr_attach(struct socket *, int, struct proc *);
57 static int mptcp_usr_detach(struct socket *);
58 static int mptcp_attach(struct socket *, struct proc *);
59 static int mptcp_usr_connectx(struct socket *, struct sockaddr *,
60 struct sockaddr *, struct proc *, uint32_t, sae_associd_t,
61 sae_connid_t *, uint32_t, void *, uint32_t, struct uio *, user_ssize_t *);
62 static int mptcp_getassocids(struct mptses *, uint32_t *, user_addr_t);
63 static int mptcp_getconnids(struct mptses *, sae_associd_t, uint32_t *,
64 user_addr_t);
65 static int mptcp_getconninfo(struct mptses *, sae_connid_t *, uint32_t *,
66 uint32_t *, int32_t *, user_addr_t, socklen_t *, user_addr_t, socklen_t *,
67 uint32_t *, user_addr_t, uint32_t *);
68 static int mptcp_usr_control(struct socket *, u_long, caddr_t, struct ifnet *,
69 struct proc *);
70 static int mptcp_disconnect(struct mptses *);
71 static int mptcp_usr_disconnect(struct socket *);
72 static int mptcp_usr_disconnectx(struct socket *, sae_associd_t, sae_connid_t);
73 static struct mptses *mptcp_usrclosed(struct mptses *);
74 static int mptcp_usr_rcvd(struct socket *, int);
75 static int mptcp_usr_send(struct socket *, int, struct mbuf *,
76 struct sockaddr *, struct mbuf *, struct proc *);
77 static int mptcp_usr_shutdown(struct socket *);
78 static int mptcp_usr_sosend(struct socket *, struct sockaddr *, struct uio *,
79 struct mbuf *, struct mbuf *, int);
80 static int mptcp_usr_socheckopt(struct socket *, struct sockopt *);
81 static int mptcp_setopt(struct mptses *, struct sockopt *);
82 static int mptcp_getopt(struct mptses *, struct sockopt *);
83 static int mptcp_default_tcp_optval(struct mptses *, struct sockopt *, int *);
84 static int mptcp_usr_preconnect(struct socket *so);
85
86 struct pr_usrreqs mptcp_usrreqs = {
87 .pru_attach = mptcp_usr_attach,
88 .pru_connectx = mptcp_usr_connectx,
89 .pru_control = mptcp_usr_control,
90 .pru_detach = mptcp_usr_detach,
91 .pru_disconnect = mptcp_usr_disconnect,
92 .pru_disconnectx = mptcp_usr_disconnectx,
93 .pru_peeraddr = mp_getpeeraddr,
94 .pru_rcvd = mptcp_usr_rcvd,
95 .pru_send = mptcp_usr_send,
96 .pru_shutdown = mptcp_usr_shutdown,
97 .pru_sockaddr = mp_getsockaddr,
98 .pru_sosend = mptcp_usr_sosend,
99 .pru_soreceive = soreceive,
100 .pru_socheckopt = mptcp_usr_socheckopt,
101 .pru_preconnect = mptcp_usr_preconnect,
102 };
103
104
105 #if (DEVELOPMENT || DEBUG)
106 static int mptcp_disable_entitlements = 0;
107 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, disable_entitlements, CTLFLAG_RW | CTLFLAG_LOCKED,
108 &mptcp_disable_entitlements, 0, "Disable Multipath TCP Entitlement Checking");
109 #endif
110
111 int mptcp_developer_mode = 0;
112 SYSCTL_INT(_net_inet_mptcp, OID_AUTO, allow_aggregate, CTLFLAG_RW | CTLFLAG_LOCKED,
113 &mptcp_developer_mode, 0, "Allow the Multipath aggregation mode");
114
115
116 /*
117 * Attaches an MPTCP control block to a socket.
118 */
119 static int
120 mptcp_usr_attach(struct socket *mp_so, int proto, struct proc *p)
121 {
122 #pragma unused(proto)
123 int error;
124
125 VERIFY(mpsotomppcb(mp_so) == NULL);
126
127 error = mptcp_attach(mp_so, p);
128 if (error != 0)
129 goto out;
130 /*
131 * XXX: adi@apple.com
132 *
133 * Might want to use a different SO_LINGER timeout than TCP's?
134 */
135 if ((mp_so->so_options & SO_LINGER) && mp_so->so_linger == 0)
136 mp_so->so_linger = TCP_LINGERTIME * hz;
137 out:
138 return (error);
139 }
140
141 /*
142 * Detaches an MPTCP control block from a socket.
143 */
144 static int
145 mptcp_usr_detach(struct socket *mp_so)
146 {
147 struct mptses *mpte = mpsotompte(mp_so);
148 struct mppcb *mpp = mpsotomppcb(mp_so);
149
150 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
151 mptcplog((LOG_ERR, "%s state: %d\n", __func__,
152 mpp ? mpp->mpp_state : -1),
153 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
154 return (EINVAL);
155 }
156
157 /*
158 * We are done with this MPTCP socket (it has been closed);
159 * trigger all subflows to be disconnected, if not already,
160 * by initiating the PCB detach sequence (SOF_PCBCLEARING
161 * will be set.)
162 */
163 mp_pcbdetach(mp_so);
164
165 mptcp_disconnect(mpte);
166
167 return (0);
168 }
169
170 /*
171 * Attach MPTCP protocol to socket, allocating MP control block,
172 * MPTCP session, control block, buffer space, etc.
173 */
174 static int
175 mptcp_attach(struct socket *mp_so, struct proc *p)
176 {
177 #pragma unused(p)
178 struct mptses *mpte = NULL;
179 struct mptcb *mp_tp = NULL;
180 struct mppcb *mpp = NULL;
181 int error = 0;
182
183 if (mp_so->so_snd.sb_hiwat == 0 || mp_so->so_rcv.sb_hiwat == 0) {
184 error = soreserve(mp_so, tcp_sendspace, tcp_recvspace);
185 if (error != 0)
186 goto out;
187 }
188
189 if (mp_so->so_snd.sb_preconn_hiwat == 0) {
190 soreserve_preconnect(mp_so, 2048);
191 }
192
193 if ((mp_so->so_rcv.sb_flags & SB_USRSIZE) == 0)
194 mp_so->so_rcv.sb_flags |= SB_AUTOSIZE;
195 if ((mp_so->so_snd.sb_flags & SB_USRSIZE) == 0)
196 mp_so->so_snd.sb_flags |= SB_AUTOSIZE;
197
198 /*
199 * MPTCP socket buffers cannot be compressed, due to the
200 * fact that each mbuf chained via m_next is a M_PKTHDR
201 * which carries some MPTCP metadata.
202 */
203 mp_so->so_snd.sb_flags |= SB_NOCOMPRESS;
204 mp_so->so_rcv.sb_flags |= SB_NOCOMPRESS;
205
206 if ((error = mp_pcballoc(mp_so, &mtcbinfo)) != 0) {
207 goto out;
208 }
209
210 mpp = mpsotomppcb(mp_so);
211 VERIFY(mpp != NULL);
212 mpte = (struct mptses *)mpp->mpp_pcbe;
213 VERIFY(mpte != NULL);
214 mp_tp = mpte->mpte_mptcb;
215 VERIFY(mp_tp != NULL);
216 out:
217 return (error);
218 }
219
220 static int
221 mptcp_entitlement_check(struct socket *mp_so)
222 {
223 struct mptses *mpte = mpsotompte(mp_so);
224
225 if (soopt_cred_check(mp_so, PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED, TRUE) == 0) {
226 /*
227 * This means the app has the extended entitlement. Thus,
228 * it's a first party app and can run without restrictions.
229 */
230 mpte->mpte_flags |= MPTE_FIRSTPARTY;
231 goto grant;
232 }
233
234 #if (DEVELOPMENT || DEBUG)
235 if (mptcp_disable_entitlements)
236 goto grant;
237 #endif
238
239 if (soopt_cred_check(mp_so, PRIV_NET_PRIVILEGED_MULTIPATH, TRUE)) {
240 mptcplog((LOG_NOTICE, "%s Multipath Capability needed\n", __func__),
241 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
242 return (-1);
243 }
244
245 if (mpte->mpte_svctype > MPTCP_SVCTYPE_INTERACTIVE &&
246 mptcp_developer_mode == 0) {
247 mptcplog((LOG_NOTICE, "%s need to set allow_aggregate sysctl\n",
248 __func__), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
249 return (-1);
250 }
251
252 grant:
253 mptcplog((LOG_NOTICE, "%s entitlement granted for %u\n", __func__, mpte->mpte_svctype),
254 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
255
256 return (0);
257 }
258
259 /*
260 * Common subroutine to open a MPTCP connection to one of the remote hosts
261 * specified by dst_sl. This includes allocating and establishing a
262 * subflow TCP connection, either initially to establish MPTCP connection,
263 * or to join an existing one. Returns a connection handle upon success.
264 */
265 static int
266 mptcp_connectx(struct mptses *mpte, struct sockaddr *src,
267 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
268 {
269 struct socket *mp_so = mptetoso(mpte);
270 int error = 0;
271
272 VERIFY(dst != NULL);
273 VERIFY(pcid != NULL);
274
275 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__,
276 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
277 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
278 DTRACE_MPTCP2(connectx, struct mptses *, mpte, struct socket *, mp_so);
279
280 error = mptcp_subflow_add(mpte, src, dst, ifscope, pcid);
281
282 return (error);
283 }
284
285 /*
286 * User-protocol pru_connectx callback.
287 */
288 static int
289 mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src,
290 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
291 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
292 uint32_t arglen, struct uio *auio, user_ssize_t *bytes_written)
293 {
294 #pragma unused(p, aid, flags, arg, arglen)
295 struct mppcb *mpp = mpsotomppcb(mp_so);
296 struct mptses *mpte = NULL;
297 struct mptcb *mp_tp = NULL;
298 user_ssize_t datalen;
299 int error = 0;
300
301 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
302 mptcplog((LOG_ERR, "%s state %d\n", __func__,
303 mpp ? mpp->mpp_state : -1),
304 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
305 error = EINVAL;
306 goto out;
307 }
308 mpte = mptompte(mpp);
309 VERIFY(mpte != NULL);
310 mpte_lock_assert_held(mpte);
311
312 mp_tp = mpte->mpte_mptcb;
313 VERIFY(mp_tp != NULL);
314
315 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
316 mptcplog((LOG_ERR, "%s fell back to TCP\n", __func__),
317 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
318 error = EINVAL;
319 goto out;
320 }
321
322 if (dst->sa_family == AF_INET &&
323 dst->sa_len != sizeof(mpte->__mpte_dst_v4)) {
324 mptcplog((LOG_ERR, "%s IPv4 dst len %u\n", __func__,
325 dst->sa_len),
326 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
327 error = EINVAL;
328 goto out;
329 }
330
331 if (dst->sa_family == AF_INET6 &&
332 dst->sa_len != sizeof(mpte->__mpte_dst_v6)) {
333 mptcplog((LOG_ERR, "%s IPv6 dst len %u\n", __func__,
334 dst->sa_len),
335 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
336 error = EINVAL;
337 goto out;
338 }
339
340 if (!(mpte->mpte_flags & MPTE_SVCTYPE_CHECKED)) {
341 if (mptcp_entitlement_check(mp_so) < 0) {
342 error = EPERM;
343 goto out;
344 }
345
346 mpte->mpte_flags |= MPTE_SVCTYPE_CHECKED;
347 }
348
349 if ((mp_so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
350 memcpy(&mpte->mpte_dst, dst, dst->sa_len);
351 }
352
353 if (src) {
354 if (src->sa_family == AF_INET &&
355 src->sa_len != sizeof(mpte->__mpte_src_v4)) {
356 mptcplog((LOG_ERR, "%s IPv4 src len %u\n", __func__,
357 src->sa_len),
358 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
359 error = EINVAL;
360 goto out;
361 }
362
363 if (src->sa_family == AF_INET6 &&
364 src->sa_len != sizeof(mpte->__mpte_src_v6)) {
365 mptcplog((LOG_ERR, "%s IPv6 src len %u\n", __func__,
366 src->sa_len),
367 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
368 error = EINVAL;
369 goto out;
370 }
371
372 if ((mp_so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
373 memcpy(&mpte->mpte_src, src, src->sa_len);
374 }
375 }
376
377 error = mptcp_connectx(mpte, src, dst, ifscope, pcid);
378
379 /* If there is data, copy it */
380 if (auio != NULL) {
381 datalen = uio_resid(auio);
382 socket_unlock(mp_so, 0);
383 error = mp_so->so_proto->pr_usrreqs->pru_sosend(mp_so, NULL,
384 (uio_t) auio, NULL, NULL, 0);
385
386 if (error == 0 || error == EWOULDBLOCK)
387 *bytes_written = datalen - uio_resid(auio);
388
389 if (error == EWOULDBLOCK)
390 error = EINPROGRESS;
391
392 socket_lock(mp_so, 0);
393 }
394
395 out:
396 return (error);
397 }
398
399 /*
400 * Handle SIOCGASSOCIDS ioctl for PF_MULTIPATH domain.
401 */
402 static int
403 mptcp_getassocids(struct mptses *mpte, uint32_t *cnt, user_addr_t aidp)
404 {
405 mpte_lock_assert_held(mpte); /* same as MP socket lock */
406
407 /* MPTCP has at most 1 association */
408 *cnt = (mpte->mpte_associd != SAE_ASSOCID_ANY) ? 1 : 0;
409
410 /* just asking how many there are? */
411 if (aidp == USER_ADDR_NULL)
412 return (0);
413
414 return (copyout(&mpte->mpte_associd, aidp,
415 sizeof (mpte->mpte_associd)));
416 }
417
418 /*
419 * Handle SIOCGCONNIDS ioctl for PF_MULTIPATH domain.
420 */
421 static int
422 mptcp_getconnids(struct mptses *mpte, sae_associd_t aid, uint32_t *cnt,
423 user_addr_t cidp)
424 {
425 struct mptsub *mpts;
426 int error = 0;
427
428 mpte_lock_assert_held(mpte); /* same as MP socket lock */
429
430 if (aid != SAE_ASSOCID_ANY && aid != SAE_ASSOCID_ALL &&
431 aid != mpte->mpte_associd)
432 return (EINVAL);
433
434 *cnt = mpte->mpte_numflows;
435
436 /* just asking how many there are? */
437 if (cidp == USER_ADDR_NULL)
438 return (0);
439
440 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
441 if ((error = copyout(&mpts->mpts_connid, cidp,
442 sizeof (mpts->mpts_connid))) != 0)
443 break;
444
445 cidp += sizeof (mpts->mpts_connid);
446 }
447
448 return (error);
449 }
450
451 /*
452 * Handle SIOCGCONNINFO ioctl for PF_MULTIPATH domain.
453 */
454 static int
455 mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags,
456 uint32_t *ifindex, int32_t *soerror, user_addr_t src, socklen_t *src_len,
457 user_addr_t dst, socklen_t *dst_len, uint32_t *aux_type,
458 user_addr_t aux_data, uint32_t *aux_len)
459 {
460 struct socket *so;
461 struct inpcb *inp;
462 struct mptsub *mpts;
463 int error = 0;
464
465 *flags = 0;
466 *aux_type = 0;
467 *ifindex = 0;
468 *soerror = 0;
469
470 if (*cid == SAE_CONNID_ALL) {
471 struct socket *mp_so = mptetoso(mpte);
472 struct mptcb *mp_tp = mpte->mpte_mptcb;
473 struct conninfo_multipathtcp mptcp_ci;
474
475 if (*aux_len != 0 && *aux_len != sizeof(mptcp_ci))
476 return (EINVAL);
477
478 if (mp_so->so_state & SS_ISCONNECTING)
479 *flags |= CIF_CONNECTING;
480 if (mp_so->so_state & SS_ISCONNECTED)
481 *flags |= CIF_CONNECTED;
482 if (mp_so->so_state & SS_ISDISCONNECTING)
483 *flags |= CIF_DISCONNECTING;
484 if (mp_so->so_state & SS_ISDISCONNECTED)
485 *flags |= CIF_DISCONNECTED;
486 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
487 *flags |= CIF_MP_CAPABLE;
488 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)
489 *flags |= CIF_MP_DEGRADED;
490
491 *src_len = 0;
492 *dst_len = 0;
493
494 *aux_type = CIAUX_MPTCP;
495 *aux_len = sizeof(mptcp_ci);
496
497 if (aux_data != USER_ADDR_NULL) {
498 unsigned long i = 0;
499 int initial_info_set = 0;
500
501 bzero(&mptcp_ci, sizeof (mptcp_ci));
502 mptcp_ci.mptcpci_subflow_count = mpte->mpte_numflows;
503 mptcp_ci.mptcpci_switch_count = mpte->mpte_subflow_switches;
504
505 VERIFY(sizeof(mptcp_ci.mptcpci_itfstats) == sizeof(mpte->mpte_itfstats));
506 memcpy(mptcp_ci.mptcpci_itfstats, mpte->mpte_itfstats, sizeof(mptcp_ci.mptcpci_itfstats));
507
508 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
509 if (i >= sizeof(mptcp_ci.mptcpci_subflow_connids) / sizeof(sae_connid_t))
510 break;
511 mptcp_ci.mptcpci_subflow_connids[i] = mpts->mpts_connid;
512
513 if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
514 inp = sotoinpcb(mpts->mpts_socket);
515
516 mptcp_ci.mptcpci_init_rxbytes = inp->inp_stat->rxbytes;
517 mptcp_ci.mptcpci_init_txbytes = inp->inp_stat->txbytes;
518 initial_info_set = 1;
519 }
520
521 mptcpstats_update(mptcp_ci.mptcpci_itfstats, mpts);
522
523 i++;
524 }
525
526 if (initial_info_set == 0) {
527 mptcp_ci.mptcpci_init_rxbytes = mpte->mpte_init_rxbytes;
528 mptcp_ci.mptcpci_init_txbytes = mpte->mpte_init_txbytes;
529 }
530
531 if (mpte->mpte_flags & MPTE_FIRSTPARTY)
532 mptcp_ci.mptcpci_flags |= MPTCPCI_FIRSTPARTY;
533
534 error = copyout(&mptcp_ci, aux_data, sizeof(mptcp_ci));
535 if (error != 0) {
536 mptcplog((LOG_ERR, "%s copyout failed: %d\n",
537 __func__, error),
538 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
539 return (error);
540 }
541 }
542
543 return (0);
544 }
545
546 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
547 if (mpts->mpts_connid == *cid || *cid == SAE_CONNID_ANY)
548 break;
549 }
550 if (mpts == NULL)
551 return ((*cid == SAE_CONNID_ANY) ? ENXIO : EINVAL);
552
553 so = mpts->mpts_socket;
554 inp = sotoinpcb(so);
555
556 if (inp->inp_vflag & INP_IPV4)
557 error = in_getconninfo(so, SAE_CONNID_ANY, flags, ifindex,
558 soerror, src, src_len, dst, dst_len,
559 aux_type, aux_data, aux_len);
560 else
561 error = in6_getconninfo(so, SAE_CONNID_ANY, flags, ifindex,
562 soerror, src, src_len, dst, dst_len,
563 aux_type, aux_data, aux_len);
564
565 if (error != 0) {
566 mptcplog((LOG_ERR, "%s error from in_getconninfo %d\n",
567 __func__, error),
568 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
569 return (error);
570 }
571
572 if (mpts->mpts_flags & MPTSF_MP_CAPABLE)
573 *flags |= CIF_MP_CAPABLE;
574 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
575 *flags |= CIF_MP_DEGRADED;
576 if (mpts->mpts_flags & MPTSF_MP_READY)
577 *flags |= CIF_MP_READY;
578 if (mpts->mpts_flags & MPTSF_ACTIVE)
579 *flags |= CIF_MP_ACTIVE;
580
581 mptcplog((LOG_DEBUG, "%s: cid %d flags %x \n", __func__,
582 mpts->mpts_connid, mpts->mpts_flags),
583 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
584
585 return (0);
586 }
587
588 /*
589 * User-protocol pru_control callback.
590 */
591 static int
592 mptcp_usr_control(struct socket *mp_so, u_long cmd, caddr_t data,
593 struct ifnet *ifp, struct proc *p)
594 {
595 #pragma unused(ifp, p)
596 struct mppcb *mpp = mpsotomppcb(mp_so);
597 struct mptses *mpte;
598 int error = 0;
599
600 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
601 error = EINVAL;
602 goto out;
603 }
604 mpte = mptompte(mpp);
605 VERIFY(mpte != NULL);
606
607 mpte_lock_assert_held(mpte); /* same as MP socket lock */
608
609 switch (cmd) {
610 case SIOCGASSOCIDS32: { /* struct so_aidreq32 */
611 struct so_aidreq32 aidr;
612 bcopy(data, &aidr, sizeof (aidr));
613 error = mptcp_getassocids(mpte, &aidr.sar_cnt,
614 aidr.sar_aidp);
615 if (error == 0)
616 bcopy(&aidr, data, sizeof (aidr));
617 break;
618 }
619
620 case SIOCGASSOCIDS64: { /* struct so_aidreq64 */
621 struct so_aidreq64 aidr;
622 bcopy(data, &aidr, sizeof (aidr));
623 error = mptcp_getassocids(mpte, &aidr.sar_cnt,
624 aidr.sar_aidp);
625 if (error == 0)
626 bcopy(&aidr, data, sizeof (aidr));
627 break;
628 }
629
630 case SIOCGCONNIDS32: { /* struct so_cidreq32 */
631 struct so_cidreq32 cidr;
632 bcopy(data, &cidr, sizeof (cidr));
633 error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt,
634 cidr.scr_cidp);
635 if (error == 0)
636 bcopy(&cidr, data, sizeof (cidr));
637 break;
638 }
639
640 case SIOCGCONNIDS64: { /* struct so_cidreq64 */
641 struct so_cidreq64 cidr;
642 bcopy(data, &cidr, sizeof (cidr));
643 error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt,
644 cidr.scr_cidp);
645 if (error == 0)
646 bcopy(&cidr, data, sizeof (cidr));
647 break;
648 }
649
650 case SIOCGCONNINFO32: { /* struct so_cinforeq32 */
651 struct so_cinforeq32 cifr;
652 bcopy(data, &cifr, sizeof (cifr));
653 error = mptcp_getconninfo(mpte, &cifr.scir_cid,
654 &cifr.scir_flags, &cifr.scir_ifindex, &cifr.scir_error,
655 cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst,
656 &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data,
657 &cifr.scir_aux_len);
658 if (error == 0)
659 bcopy(&cifr, data, sizeof (cifr));
660 break;
661 }
662
663 case SIOCGCONNINFO64: { /* struct so_cinforeq64 */
664 struct so_cinforeq64 cifr;
665 bcopy(data, &cifr, sizeof (cifr));
666 error = mptcp_getconninfo(mpte, &cifr.scir_cid,
667 &cifr.scir_flags, &cifr.scir_ifindex, &cifr.scir_error,
668 cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst,
669 &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data,
670 &cifr.scir_aux_len);
671 if (error == 0)
672 bcopy(&cifr, data, sizeof (cifr));
673 break;
674 }
675
676 default:
677 error = EOPNOTSUPP;
678 break;
679 }
680 out:
681 return (error);
682 }
683
684 static int
685 mptcp_disconnect(struct mptses *mpte)
686 {
687 struct socket *mp_so;
688 struct mptcb *mp_tp;
689 int error = 0;
690
691 mpte_lock_assert_held(mpte); /* same as MP socket lock */
692
693 mp_so = mptetoso(mpte);
694 mp_tp = mpte->mpte_mptcb;
695
696 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx %d\n", __func__,
697 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_error),
698 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
699
700 DTRACE_MPTCP3(disconnectx, struct mptses *, mpte,
701 struct socket *, mp_so, struct mptcb *, mp_tp);
702
703 /* if we're not detached, go thru socket state checks */
704 if (!(mp_so->so_flags & SOF_PCBCLEARING)) {
705 if (!(mp_so->so_state & (SS_ISCONNECTED|
706 SS_ISCONNECTING))) {
707 error = ENOTCONN;
708 goto out;
709 }
710 if (mp_so->so_state & SS_ISDISCONNECTING) {
711 error = EALREADY;
712 goto out;
713 }
714 }
715
716 mptcp_cancel_all_timers(mp_tp);
717 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
718 mptcp_close(mpte, mp_tp);
719 } else if ((mp_so->so_options & SO_LINGER) &&
720 mp_so->so_linger == 0) {
721 mptcp_drop(mpte, mp_tp, 0);
722 } else {
723 soisdisconnecting(mp_so);
724 sbflush(&mp_so->so_rcv);
725 if (mptcp_usrclosed(mpte) != NULL)
726 mptcp_output(mpte);
727 }
728
729 if (error == 0)
730 mptcp_subflow_workloop(mpte);
731
732 out:
733 return (error);
734 }
735
/*
 * pru_disconnect callback: look up the session that belongs to the MP
 * socket and hand it to mptcp_disconnect().
 */
static int
mptcp_usr_disconnect(struct socket *mp_so)
{
	struct mptses *mpte = mpsotompte(mp_so);

	return (mptcp_disconnect(mpte));
}
744
745 /*
746 * User-protocol pru_disconnectx callback.
747 */
748 static int
749 mptcp_usr_disconnectx(struct socket *mp_so, sae_associd_t aid, sae_connid_t cid)
750 {
751 if (aid != SAE_ASSOCID_ANY && aid != SAE_ASSOCID_ALL)
752 return (EINVAL);
753
754 if (cid != SAE_CONNID_ANY && cid != SAE_CONNID_ALL)
755 return (EINVAL);
756
757 return (mptcp_usr_disconnect(mp_so));
758 }
759
760 void
761 mptcp_finish_usrclosed(struct mptses *mpte)
762 {
763 struct mptcb *mp_tp = mpte->mpte_mptcb;
764 struct socket *mp_so = mptetoso(mpte);
765
766 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
767 mpte = mptcp_close(mpte, mp_tp);
768 } else if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
769 soisdisconnected(mp_so);
770 } else {
771 struct mptsub *mpts;
772
773 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
774 if ((mp_so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
775 (SS_CANTRCVMORE | SS_CANTSENDMORE))
776 mptcp_subflow_disconnect(mpte, mpts);
777 else
778 mptcp_subflow_shutdown(mpte, mpts);
779 }
780 }
781 }
782
783 /*
784 * User issued close, and wish to trail thru shutdown states.
785 */
786 static struct mptses *
787 mptcp_usrclosed(struct mptses *mpte)
788 {
789 struct mptcb *mp_tp = mpte->mpte_mptcb;
790
791 mptcp_close_fsm(mp_tp, MPCE_CLOSE);
792
793 /* Not everything has been acknowledged - don't close the subflows! */
794 if (mp_tp->mpt_sndnxt + 1 != mp_tp->mpt_sndmax)
795 return (mpte);
796
797 mptcp_finish_usrclosed(mpte);
798
799 return (mpte);
800 }
801
802 /*
803 * After a receive, possible send some update to peer.
804 */
805 static int
806 mptcp_usr_rcvd(struct socket *mp_so, int flags)
807 {
808 #pragma unused(flags)
809 struct mppcb *mpp = mpsotomppcb(mp_so);
810 struct mptses *mpte;
811 int error = 0;
812
813 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
814 error = EINVAL;
815 goto out;
816 }
817 mpte = mptompte(mpp);
818 VERIFY(mpte != NULL);
819
820 error = mptcp_output(mpte);
821 out:
822 return (error);
823 }
824
825 /*
826 * Do a send by putting data in the output queue.
827 */
828 static int
829 mptcp_usr_send(struct socket *mp_so, int prus_flags, struct mbuf *m,
830 struct sockaddr *nam, struct mbuf *control, struct proc *p)
831 {
832 #pragma unused(nam, p)
833 struct mppcb *mpp = mpsotomppcb(mp_so);
834 struct mptses *mpte;
835 int error = 0;
836
837 if (prus_flags & (PRUS_OOB|PRUS_EOF)) {
838 error = EOPNOTSUPP;
839 goto out;
840 }
841
842 if (nam != NULL) {
843 error = EOPNOTSUPP;
844 goto out;
845 }
846
847 if (control != NULL && control->m_len != 0) {
848 error = EOPNOTSUPP;
849 goto out;
850 }
851
852 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
853 error = ECONNRESET;
854 goto out;
855 }
856 mpte = mptompte(mpp);
857 VERIFY(mpte != NULL);
858
859 if (!(mp_so->so_state & SS_ISCONNECTED) &&
860 !(mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
861 error = ENOTCONN;
862 goto out;
863 }
864
865 mptcp_insert_dsn(mpp, m);
866 VERIFY(mp_so->so_snd.sb_flags & SB_NOCOMPRESS);
867 sbappendstream(&mp_so->so_snd, m);
868 m = NULL;
869
870 error = mptcp_output(mpte);
871 if (error != 0)
872 goto out;
873
874 if (mp_so->so_state & SS_ISCONNECTING) {
875 if (mp_so->so_state & SS_NBIO)
876 error = EWOULDBLOCK;
877 else
878 error = sbwait(&mp_so->so_snd);
879 }
880
881 out:
882 if (error) {
883 if (m != NULL)
884 m_freem(m);
885 if (control != NULL)
886 m_freem(control);
887 }
888 return (error);
889 }
890
891 /*
892 * Mark the MPTCP connection as being incapable of further output.
893 */
894 static int
895 mptcp_usr_shutdown(struct socket *mp_so)
896 {
897 struct mppcb *mpp = mpsotomppcb(mp_so);
898 struct mptses *mpte;
899 int error = 0;
900
901 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
902 error = EINVAL;
903 goto out;
904 }
905 mpte = mptompte(mpp);
906 VERIFY(mpte != NULL);
907
908 socantsendmore(mp_so);
909
910 mpte = mptcp_usrclosed(mpte);
911 if (mpte != NULL)
912 error = mptcp_output(mpte);
913 out:
914 return (error);
915 }
916
917 /*
918 * Copy the contents of uio into a properly sized mbuf chain.
919 */
920 static int
921 mptcp_uiotombuf(struct uio *uio, int how, int space, uint32_t align,
922 struct mbuf **top)
923 {
924 struct mbuf *m, *mb, *nm = NULL, *mtail = NULL;
925 user_ssize_t resid, tot, len, progress; /* must be user_ssize_t */
926 int error;
927
928 VERIFY(top != NULL && *top == NULL);
929
930 /*
931 * space can be zero or an arbitrary large value bound by
932 * the total data supplied by the uio.
933 */
934 resid = uio_resid(uio);
935 if (space > 0)
936 tot = imin(resid, space);
937 else
938 tot = resid;
939
940 /*
941 * The smallest unit is a single mbuf with pkthdr.
942 * We can't align past it.
943 */
944 if (align >= MHLEN)
945 return (EINVAL);
946
947 /*
948 * Give us the full allocation or nothing.
949 * If space is zero return the smallest empty mbuf.
950 */
951 if ((len = tot + align) == 0)
952 len = 1;
953
954 /* Loop and append maximum sized mbufs to the chain tail. */
955 while (len > 0) {
956 uint32_t m_needed = 1;
957
958 if (njcl > 0 && len > MBIGCLBYTES)
959 mb = m_getpackets_internal(&m_needed, 1,
960 how, 1, M16KCLBYTES);
961 else if (len > MCLBYTES)
962 mb = m_getpackets_internal(&m_needed, 1,
963 how, 1, MBIGCLBYTES);
964 else if (len >= (signed)MINCLSIZE)
965 mb = m_getpackets_internal(&m_needed, 1,
966 how, 1, MCLBYTES);
967 else
968 mb = m_gethdr(how, MT_DATA);
969
970 /* Fail the whole operation if one mbuf can't be allocated. */
971 if (mb == NULL) {
972 if (nm != NULL)
973 m_freem(nm);
974 return (ENOBUFS);
975 }
976
977 /* Book keeping. */
978 VERIFY(mb->m_flags & M_PKTHDR);
979 len -= ((mb->m_flags & M_EXT) ? mb->m_ext.ext_size : MHLEN);
980 if (mtail != NULL)
981 mtail->m_next = mb;
982 else
983 nm = mb;
984 mtail = mb;
985 }
986
987 m = nm;
988 m->m_data += align;
989
990 progress = 0;
991 /* Fill all mbufs with uio data and update header information. */
992 for (mb = m; mb != NULL; mb = mb->m_next) {
993 len = imin(M_TRAILINGSPACE(mb), tot - progress);
994
995 error = uiomove(mtod(mb, char *), len, uio);
996 if (error != 0) {
997 m_freem(m);
998 return (error);
999 }
1000
1001 /* each mbuf is M_PKTHDR chained via m_next */
1002 mb->m_len = len;
1003 mb->m_pkthdr.len = len;
1004
1005 progress += len;
1006 }
1007 VERIFY(progress == tot);
1008 *top = m;
1009 return (0);
1010 }
1011
1012 /*
1013 * MPTCP socket protocol-user socket send routine, derived from sosend().
1014 */
1015 static int
1016 mptcp_usr_sosend(struct socket *mp_so, struct sockaddr *addr, struct uio *uio,
1017 struct mbuf *top, struct mbuf *control, int flags)
1018 {
1019 #pragma unused(addr)
1020 int32_t space;
1021 user_ssize_t resid;
1022 int error, sendflags;
1023 struct proc *p = current_proc();
1024 int sblocked = 0;
1025
1026 /* UIO is required for now, due to per-mbuf M_PKTHDR constrains */
1027 if (uio == NULL || top != NULL) {
1028 error = EINVAL;
1029 goto out;
1030 }
1031 resid = uio_resid(uio);
1032
1033 socket_lock(mp_so, 1);
1034 so_update_last_owner_locked(mp_so, p);
1035 so_update_policy(mp_so);
1036
1037 VERIFY(mp_so->so_type == SOCK_STREAM);
1038 VERIFY(!(mp_so->so_flags & SOF_MP_SUBFLOW));
1039
1040 if ((flags & (MSG_OOB|MSG_DONTROUTE|MSG_HOLD|MSG_SEND|MSG_FLUSH)) ||
1041 (mp_so->so_flags & SOF_ENABLE_MSGS)) {
1042 error = EOPNOTSUPP;
1043 socket_unlock(mp_so, 1);
1044 goto out;
1045 }
1046
1047 /*
1048 * In theory resid should be unsigned. However, space must be
1049 * signed, as it might be less than 0 if we over-committed, and we
1050 * must use a signed comparison of space and resid. On the other
1051 * hand, a negative resid causes us to loop sending 0-length
1052 * segments to the protocol.
1053 */
1054 if (resid < 0 || (flags & MSG_EOR) || control != NULL) {
1055 error = EINVAL;
1056 socket_unlock(mp_so, 1);
1057 goto out;
1058 }
1059
1060 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1061
1062 do {
1063 error = sosendcheck(mp_so, NULL, resid, 0, 0, flags,
1064 &sblocked, NULL);
1065 if (error != 0)
1066 goto release;
1067
1068 space = sbspace(&mp_so->so_snd);
1069 do {
1070 socket_unlock(mp_so, 0);
1071 /*
1072 * Copy the data from userland into an mbuf chain.
1073 */
1074 error = mptcp_uiotombuf(uio, M_WAITOK, space, 0, &top);
1075 if (error != 0) {
1076 socket_lock(mp_so, 0);
1077 goto release;
1078 }
1079 VERIFY(top != NULL);
1080 space -= resid - uio_resid(uio);
1081 resid = uio_resid(uio);
1082 socket_lock(mp_so, 0);
1083
1084 /*
1085 * Compute flags here, for pru_send and NKEs.
1086 */
1087 sendflags = (resid > 0 && space > 0) ?
1088 PRUS_MORETOCOME : 0;
1089
1090 /*
1091 * Socket filter processing
1092 */
1093 VERIFY(control == NULL);
1094 error = sflt_data_out(mp_so, NULL, &top, &control, 0);
1095 if (error != 0) {
1096 if (error == EJUSTRETURN) {
1097 error = 0;
1098 top = NULL;
1099 /* always free control if any */
1100 }
1101 goto release;
1102 }
1103 if (control != NULL) {
1104 m_freem(control);
1105 control = NULL;
1106 }
1107
1108 /*
1109 * Pass data to protocol.
1110 */
1111 error = (*mp_so->so_proto->pr_usrreqs->pru_send)
1112 (mp_so, sendflags, top, NULL, NULL, p);
1113
1114 top = NULL;
1115 if (error != 0)
1116 goto release;
1117 } while (resid != 0 && space > 0);
1118 } while (resid != 0);
1119
1120 release:
1121 if (sblocked)
1122 sbunlock(&mp_so->so_snd, FALSE); /* will unlock socket */
1123 else
1124 socket_unlock(mp_so, 1);
1125 out:
1126 if (top != NULL)
1127 m_freem(top);
1128 if (control != NULL)
1129 m_freem(control);
1130
1131 soclearfastopen(mp_so);
1132
1133 return (error);
1134 }
1135
1136 /*
1137 * Called to filter SOPT_{SET,GET} for SOL_SOCKET level socket options.
1138 * This routine simply indicates to the caller whether or not to proceed
1139 * further with the given socket option. This is invoked by sosetoptlock()
1140 * and sogetoptlock().
1141 */
1142 static int
1143 mptcp_usr_socheckopt(struct socket *mp_so, struct sockopt *sopt)
1144 {
1145 #pragma unused(mp_so)
1146 int error = 0;
1147
1148 VERIFY(sopt->sopt_level == SOL_SOCKET);
1149
1150 /*
1151 * We could check for sopt_dir (set/get) here, but we'll just
1152 * let the caller deal with it as appropriate; therefore the
1153 * following is a superset of the socket options which we
1154 * allow for set/get.
1155 *
1156 * XXX: adi@apple.com
1157 *
1158 * Need to consider the following cases:
1159 *
1160 * a. Certain socket options don't have a clear definition
1161 * on the expected behavior post connect(2). At the time
1162 * those options are issued on the MP socket, there may
1163 * be existing subflow sockets that are already connected.
1164 */
1165 switch (sopt->sopt_name) {
1166 case SO_LINGER: /* MP */
1167 case SO_LINGER_SEC: /* MP */
1168 case SO_TYPE: /* MP */
1169 case SO_NREAD: /* MP */
1170 case SO_NWRITE: /* MP */
1171 case SO_ERROR: /* MP */
1172 case SO_SNDBUF: /* MP */
1173 case SO_RCVBUF: /* MP */
1174 case SO_SNDLOWAT: /* MP */
1175 case SO_RCVLOWAT: /* MP */
1176 case SO_SNDTIMEO: /* MP */
1177 case SO_RCVTIMEO: /* MP */
1178 case SO_NKE: /* MP */
1179 case SO_NOSIGPIPE: /* MP */
1180 case SO_NOADDRERR: /* MP */
1181 case SO_LABEL: /* MP */
1182 case SO_PEERLABEL: /* MP */
1183 case SO_DEFUNCTOK: /* MP */
1184 case SO_ISDEFUNCT: /* MP */
1185 case SO_TRAFFIC_CLASS_DBG: /* MP */
1186 case SO_DELEGATED: /* MP */
1187 case SO_DELEGATED_UUID: /* MP */
1188 #if NECP
1189 case SO_NECP_ATTRIBUTES:
1190 case SO_NECP_CLIENTUUID:
1191 #endif /* NECP */
1192 /*
1193 * Tell the caller that these options are to be processed.
1194 */
1195 break;
1196
1197 case SO_DEBUG: /* MP + subflow */
1198 case SO_KEEPALIVE: /* MP + subflow */
1199 case SO_USELOOPBACK: /* MP + subflow */
1200 case SO_RANDOMPORT: /* MP + subflow */
1201 case SO_TRAFFIC_CLASS: /* MP + subflow */
1202 case SO_RECV_TRAFFIC_CLASS: /* MP + subflow */
1203 case SO_PRIVILEGED_TRAFFIC_CLASS: /* MP + subflow */
1204 case SO_RECV_ANYIF: /* MP + subflow */
1205 case SO_RESTRICTIONS: /* MP + subflow */
1206 case SO_FLUSH: /* MP + subflow */
1207 case SO_NOWAKEFROMSLEEP:
1208 case SO_NOAPNFALLBK:
1209 case SO_MARK_CELLFALLBACK:
1210 /*
1211 * Tell the caller that these options are to be processed;
1212 * these will also be recorded later by mptcp_setopt().
1213 *
1214 * NOTE: Only support integer option value for now.
1215 */
1216 if (sopt->sopt_valsize != sizeof (int))
1217 error = EINVAL;
1218 break;
1219
1220 default:
1221 /*
1222 * Tell the caller to stop immediately and return an error.
1223 */
1224 error = ENOPROTOOPT;
1225 break;
1226 }
1227
1228 return (error);
1229 }
1230
1231 /*
1232 * Issue SOPT_SET for all MPTCP subflows (for integer option values.)
1233 */
1234 static int
1235 mptcp_setopt_apply(struct mptses *mpte, struct mptopt *mpo)
1236 {
1237 struct socket *mp_so;
1238 struct mptsub *mpts;
1239 struct mptopt smpo;
1240 int error = 0;
1241
1242 /* just bail now if this isn't applicable to subflow sockets */
1243 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1244 error = ENOPROTOOPT;
1245 goto out;
1246 }
1247
1248 /*
1249 * Skip those that are handled internally; these options
1250 * should not have been recorded and marked with the
1251 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1252 */
1253 if (mpo->mpo_level == SOL_SOCKET &&
1254 (mpo->mpo_name == SO_NOSIGPIPE || mpo->mpo_name == SO_NOADDRERR)) {
1255 error = ENOPROTOOPT;
1256 goto out;
1257 }
1258
1259 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1260 mp_so = mptetoso(mpte);
1261
1262 /*
1263 * Don't bother going further if there's no subflow; mark the option
1264 * with MPOF_INTERIM so that we know whether or not to remove this
1265 * option upon encountering an error while issuing it during subflow
1266 * socket creation.
1267 */
1268 if (mpte->mpte_numflows == 0) {
1269 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows));
1270 mpo->mpo_flags |= MPOF_INTERIM;
1271 /* return success */
1272 goto out;
1273 }
1274
1275 bzero(&smpo, sizeof (smpo));
1276 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1277 smpo.mpo_level = mpo->mpo_level;
1278 smpo.mpo_name = mpo->mpo_name;
1279
1280 /* grab exisiting values in case we need to rollback */
1281 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1282 struct socket *so;
1283
1284 mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG);
1285 mpts->mpts_oldintval = 0;
1286 smpo.mpo_intval = 0;
1287 VERIFY(mpts->mpts_socket != NULL);
1288 so = mpts->mpts_socket;
1289 if (mptcp_subflow_sogetopt(mpte, so, &smpo) == 0) {
1290 mpts->mpts_flags |= MPTSF_SOPT_OLDVAL;
1291 mpts->mpts_oldintval = smpo.mpo_intval;
1292 }
1293 }
1294
1295 /* apply socket option */
1296 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1297 struct socket *so;
1298
1299 mpts->mpts_flags |= MPTSF_SOPT_INPROG;
1300 VERIFY(mpts->mpts_socket != NULL);
1301 so = mpts->mpts_socket;
1302 error = mptcp_subflow_sosetopt(mpte, mpts, mpo);
1303 if (error != 0)
1304 break;
1305 }
1306
1307 /* cleanup, and rollback if needed */
1308 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1309 struct socket *so;
1310
1311 if (!(mpts->mpts_flags & MPTSF_SOPT_INPROG)) {
1312 /* clear in case it's set */
1313 mpts->mpts_flags &= ~MPTSF_SOPT_OLDVAL;
1314 mpts->mpts_oldintval = 0;
1315 continue;
1316 }
1317 if (!(mpts->mpts_flags & MPTSF_SOPT_OLDVAL)) {
1318 mpts->mpts_flags &= ~MPTSF_SOPT_INPROG;
1319 VERIFY(mpts->mpts_oldintval == 0);
1320 continue;
1321 }
1322 /* error during sosetopt, so roll it back */
1323 if (error != 0) {
1324 VERIFY(mpts->mpts_socket != NULL);
1325 so = mpts->mpts_socket;
1326 smpo.mpo_intval = mpts->mpts_oldintval;
1327 mptcp_subflow_sosetopt(mpte, mpts, &smpo);
1328 }
1329 mpts->mpts_oldintval = 0;
1330 mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG);
1331 }
1332
1333 out:
1334 return (error);
1335 }
1336
1337 /*
1338 * Handle SOPT_SET for socket options issued on MP socket.
1339 */
1340 static int
1341 mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
1342 {
1343 int error = 0, optval = 0, level, optname, rec = 1;
1344 struct mptopt smpo, *mpo = NULL;
1345 struct socket *mp_so;
1346
1347 level = sopt->sopt_level;
1348 optname = sopt->sopt_name;
1349
1350 VERIFY(sopt->sopt_dir == SOPT_SET);
1351 VERIFY(level == SOL_SOCKET || level == IPPROTO_TCP);
1352 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1353 mp_so = mptetoso(mpte);
1354
1355 /*
1356 * Record socket options which are applicable to subflow sockets so
1357 * that we can replay them for new ones; see mptcp_usr_socheckopt()
1358 * for the list of eligible socket-level options.
1359 */
1360 if (level == SOL_SOCKET) {
1361 switch (optname) {
1362 case SO_DEBUG:
1363 case SO_KEEPALIVE:
1364 case SO_USELOOPBACK:
1365 case SO_RANDOMPORT:
1366 case SO_TRAFFIC_CLASS:
1367 case SO_RECV_TRAFFIC_CLASS:
1368 case SO_PRIVILEGED_TRAFFIC_CLASS:
1369 case SO_RECV_ANYIF:
1370 case SO_RESTRICTIONS:
1371 case SO_NOWAKEFROMSLEEP:
1372 case SO_NOAPNFALLBK:
1373 case SO_MARK_CELLFALLBACK:
1374 /* record it */
1375 break;
1376 case SO_FLUSH:
1377 /* don't record it */
1378 rec = 0;
1379 break;
1380
1381 /* Next ones, record at MPTCP-level */
1382 #if NECP
1383 case SO_NECP_CLIENTUUID:
1384 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1385 error = EINVAL;
1386 goto out;
1387 }
1388
1389 error = sooptcopyin(sopt, &mpsotomppcb(mp_so)->necp_client_uuid,
1390 sizeof(uuid_t), sizeof(uuid_t));
1391 if (error != 0) {
1392 goto out;
1393 }
1394
1395 mpsotomppcb(mp_so)->necp_cb = mptcp_session_necp_cb;
1396 error = necp_client_register_multipath_cb(mp_so->last_pid,
1397 mpsotomppcb(mp_so)->necp_client_uuid,
1398 mpsotomppcb(mp_so));
1399 if (error)
1400 goto out;
1401
1402 if (uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1403 error = EINVAL;
1404 goto out;
1405 }
1406
1407 goto out;
1408 case SO_NECP_ATTRIBUTES:
1409 #endif /* NECP */
1410 default:
1411 /* nothing to do; just return */
1412 goto out;
1413 }
1414 } else {
1415 switch (optname) {
1416 case TCP_NODELAY:
1417 case TCP_RXT_FINDROP:
1418 case TCP_KEEPALIVE:
1419 case TCP_KEEPINTVL:
1420 case TCP_KEEPCNT:
1421 case TCP_CONNECTIONTIMEOUT:
1422 case TCP_RXT_CONNDROPTIME:
1423 case PERSIST_TIMEOUT:
1424 case TCP_ADAPTIVE_READ_TIMEOUT:
1425 case TCP_ADAPTIVE_WRITE_TIMEOUT:
1426 /* eligible; record it */
1427 break;
1428 case TCP_NOTSENT_LOWAT:
1429 /* record at MPTCP level */
1430 error = sooptcopyin(sopt, &optval, sizeof(optval),
1431 sizeof(optval));
1432 if (error)
1433 goto out;
1434 if (optval < 0) {
1435 error = EINVAL;
1436 goto out;
1437 } else {
1438 if (optval == 0) {
1439 mp_so->so_flags &= ~SOF_NOTSENT_LOWAT;
1440 error = mptcp_set_notsent_lowat(mpte,0);
1441 } else {
1442 mp_so->so_flags |= SOF_NOTSENT_LOWAT;
1443 error = mptcp_set_notsent_lowat(mpte,
1444 optval);
1445 }
1446 }
1447 goto out;
1448 case MPTCP_SERVICE_TYPE:
1449 /* record at MPTCP level */
1450 error = sooptcopyin(sopt, &optval, sizeof(optval),
1451 sizeof(optval));
1452 if (error)
1453 goto out;
1454 if (optval < 0 || optval >= MPTCP_SVCTYPE_MAX) {
1455 error = EINVAL;
1456 goto out;
1457 }
1458
1459 mpte->mpte_svctype = optval;
1460
1461 if (mptcp_entitlement_check(mp_so) < 0) {
1462 error = EACCES;
1463 goto out;
1464 }
1465
1466 mpte->mpte_flags |= MPTE_SVCTYPE_CHECKED;
1467
1468 goto out;
1469 case MPTCP_ALTERNATE_PORT:
1470 /* record at MPTCP level */
1471 error = sooptcopyin(sopt, &optval, sizeof(optval),
1472 sizeof(optval));
1473 if (error)
1474 goto out;
1475
1476 if (optval < 0 || optval > UINT16_MAX) {
1477 error = EINVAL;
1478 goto out;
1479 }
1480
1481 mpte->mpte_alternate_port = optval;
1482
1483 goto out;
1484 default:
1485 /* not eligible */
1486 error = ENOPROTOOPT;
1487 goto out;
1488 }
1489 }
1490
1491 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
1492 sizeof (optval))) != 0)
1493 goto out;
1494
1495 if (rec) {
1496 /* search for an existing one; if not found, allocate */
1497 if ((mpo = mptcp_sopt_find(mpte, sopt)) == NULL)
1498 mpo = mptcp_sopt_alloc(M_WAITOK);
1499
1500 if (mpo == NULL) {
1501 error = ENOBUFS;
1502 } else {
1503 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s val %d %s\n",
1504 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1505 mptcp_sopt2str(level, optname), optval,
1506 (mpo->mpo_flags & MPOF_ATTACHED) ?
1507 "updated" : "recorded"),
1508 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1509
1510 /* initialize or update, as needed */
1511 mpo->mpo_intval = optval;
1512 if (!(mpo->mpo_flags & MPOF_ATTACHED)) {
1513 mpo->mpo_level = level;
1514 mpo->mpo_name = optname;
1515 mptcp_sopt_insert(mpte, mpo);
1516 }
1517 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1518 /* this can be issued on the subflow socket */
1519 mpo->mpo_flags |= MPOF_SUBFLOW_OK;
1520 }
1521 } else {
1522 bzero(&smpo, sizeof (smpo));
1523 mpo = &smpo;
1524 mpo->mpo_flags |= MPOF_SUBFLOW_OK;
1525 mpo->mpo_level = level;
1526 mpo->mpo_name = optname;
1527 mpo->mpo_intval = optval;
1528 }
1529 VERIFY(mpo == NULL || error == 0);
1530
1531 /* issue this socket option on existing subflows */
1532 if (error == 0) {
1533 error = mptcp_setopt_apply(mpte, mpo);
1534 if (error != 0 && (mpo->mpo_flags & MPOF_ATTACHED)) {
1535 VERIFY(mpo != &smpo);
1536 mptcp_sopt_remove(mpte, mpo);
1537 mptcp_sopt_free(mpo);
1538 }
1539 if (mpo == &smpo)
1540 mpo->mpo_flags &= ~MPOF_INTERIM;
1541 }
1542 out:
1543 if (error == 0 && mpo != NULL) {
1544 mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s val %d set %s\n",
1545 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1546 mptcp_sopt2str(level, optname), optval,
1547 (mpo->mpo_flags & MPOF_INTERIM) ?
1548 "pending" : "successful"),
1549 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1550 } else if (error != 0) {
1551 mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s (%d, %d) val %d can't be issued error %d\n",
1552 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1553 mptcp_sopt2str(level, optname), level, optname, optval, error),
1554 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1555 }
1556 return (error);
1557 }
1558
1559 /*
1560 * Handle SOPT_GET for socket options issued on MP socket.
1561 */
1562 static int
1563 mptcp_getopt(struct mptses *mpte, struct sockopt *sopt)
1564 {
1565 int error = 0, optval = 0;
1566
1567 VERIFY(sopt->sopt_dir == SOPT_GET);
1568 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1569
1570 /*
1571 * We only handle SOPT_GET for TCP level socket options; we should
1572 * not get here for socket level options since they are already
1573 * handled at the socket layer.
1574 */
1575 if (sopt->sopt_level != IPPROTO_TCP) {
1576 error = ENOPROTOOPT;
1577 goto out;
1578 }
1579
1580 switch (sopt->sopt_name) {
1581 case TCP_NODELAY:
1582 case TCP_RXT_FINDROP:
1583 case TCP_KEEPALIVE:
1584 case TCP_KEEPINTVL:
1585 case TCP_KEEPCNT:
1586 case TCP_CONNECTIONTIMEOUT:
1587 case TCP_RXT_CONNDROPTIME:
1588 case PERSIST_TIMEOUT:
1589 case TCP_ADAPTIVE_READ_TIMEOUT:
1590 case TCP_ADAPTIVE_WRITE_TIMEOUT:
1591 case TCP_NOTSENT_LOWAT:
1592 case MPTCP_SERVICE_TYPE:
1593 case MPTCP_ALTERNATE_PORT:
1594 /* eligible; get the default value just in case */
1595 error = mptcp_default_tcp_optval(mpte, sopt, &optval);
1596 break;
1597 default:
1598 /* not eligible */
1599 error = ENOPROTOOPT;
1600 break;
1601 }
1602
1603 switch (sopt->sopt_name) {
1604 case TCP_NOTSENT_LOWAT:
1605 if (mptetoso(mpte)->so_flags & SOF_NOTSENT_LOWAT)
1606 optval = mptcp_get_notsent_lowat(mpte);
1607 else
1608 optval = 0;
1609 goto out;
1610 case MPTCP_SERVICE_TYPE:
1611 optval = mpte->mpte_svctype;
1612 goto out;
1613 case MPTCP_ALTERNATE_PORT:
1614 optval = mpte->mpte_alternate_port;
1615 goto out;
1616 }
1617
1618 /*
1619 * Search for a previously-issued TCP level socket option and
1620 * return the recorded option value. This assumes that the
1621 * value did not get modified by the lower layer after it was
1622 * issued at setsockopt(2) time. If not found, we'll return
1623 * the default value obtained ealier.
1624 */
1625 if (error == 0) {
1626 struct mptopt *mpo;
1627
1628 if ((mpo = mptcp_sopt_find(mpte, sopt)) != NULL)
1629 optval = mpo->mpo_intval;
1630
1631 error = sooptcopyout(sopt, &optval, sizeof (int));
1632 }
1633 out:
1634 return (error);
1635 }
1636
1637 /*
1638 * Return default values for TCP socket options. Ideally we would query the
1639 * subflow TCP socket, but that requires creating a subflow socket before
1640 * connectx(2) time. To simplify things, just return the default values
1641 * that we know of.
1642 */
1643 static int
1644 mptcp_default_tcp_optval(struct mptses *mpte, struct sockopt *sopt, int *optval)
1645 {
1646 int error = 0;
1647
1648 VERIFY(sopt->sopt_level == IPPROTO_TCP);
1649 VERIFY(sopt->sopt_dir == SOPT_GET);
1650 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1651
1652 /* try to do what tcp_newtcpcb() does */
1653 switch (sopt->sopt_name) {
1654 case TCP_NODELAY:
1655 case TCP_RXT_FINDROP:
1656 case TCP_KEEPINTVL:
1657 case TCP_KEEPCNT:
1658 case TCP_CONNECTIONTIMEOUT:
1659 case TCP_RXT_CONNDROPTIME:
1660 case TCP_NOTSENT_LOWAT:
1661 case TCP_ADAPTIVE_READ_TIMEOUT:
1662 case TCP_ADAPTIVE_WRITE_TIMEOUT:
1663 case MPTCP_SERVICE_TYPE:
1664 case MPTCP_ALTERNATE_PORT:
1665 *optval = 0;
1666 break;
1667
1668 case TCP_KEEPALIVE:
1669 *optval = mptcp_subflow_keeptime;
1670 break;
1671
1672 case PERSIST_TIMEOUT:
1673 *optval = tcp_max_persist_timeout;
1674 break;
1675
1676 default:
1677 error = ENOPROTOOPT;
1678 break;
1679 }
1680 return (error);
1681 }
1682
1683 /*
1684 * MPTCP SOPT_{SET,GET} socket option handler, for options issued on the MP
1685 * socket, at SOL_SOCKET and IPPROTO_TCP levels. The former is restricted
1686 * to those that are allowed by mptcp_usr_socheckopt().
1687 */
1688 int
1689 mptcp_ctloutput(struct socket *mp_so, struct sockopt *sopt)
1690 {
1691 struct mppcb *mpp = mpsotomppcb(mp_so);
1692 struct mptses *mpte;
1693 int error = 0;
1694
1695 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
1696 error = EINVAL;
1697 goto out;
1698 }
1699 mpte = mptompte(mpp);
1700 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1701
1702 /* we only handle socket and TCP-level socket options for MPTCP */
1703 if (sopt->sopt_level != SOL_SOCKET && sopt->sopt_level != IPPROTO_TCP) {
1704 mptcplog((LOG_DEBUG, "MPTCP Socket: "
1705 "%s: mp_so 0x%llx sopt %s level not "
1706 "handled\n", __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1707 mptcp_sopt2str(sopt->sopt_level, sopt->sopt_name)),
1708 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1709 error = EINVAL;
1710 goto out;
1711 }
1712
1713 switch (sopt->sopt_dir) {
1714 case SOPT_SET:
1715 error = mptcp_setopt(mpte, sopt);
1716 break;
1717
1718 case SOPT_GET:
1719 error = mptcp_getopt(mpte, sopt);
1720 break;
1721 }
1722 out:
1723 return (error);
1724 }
1725
1726 const char *
1727 mptcp_sopt2str(int level, int optname)
1728 {
1729 switch (level) {
1730 case SOL_SOCKET:
1731 switch (optname) {
1732 case SO_LINGER:
1733 return ("SO_LINGER");
1734 case SO_LINGER_SEC:
1735 return ("SO_LINGER_SEC");
1736 case SO_DEBUG:
1737 return ("SO_DEBUG");
1738 case SO_KEEPALIVE:
1739 return ("SO_KEEPALIVE");
1740 case SO_USELOOPBACK:
1741 return ("SO_USELOOPBACK");
1742 case SO_TYPE:
1743 return ("SO_TYPE");
1744 case SO_NREAD:
1745 return ("SO_NREAD");
1746 case SO_NWRITE:
1747 return ("SO_NWRITE");
1748 case SO_ERROR:
1749 return ("SO_ERROR");
1750 case SO_SNDBUF:
1751 return ("SO_SNDBUF");
1752 case SO_RCVBUF:
1753 return ("SO_RCVBUF");
1754 case SO_SNDLOWAT:
1755 return ("SO_SNDLOWAT");
1756 case SO_RCVLOWAT:
1757 return ("SO_RCVLOWAT");
1758 case SO_SNDTIMEO:
1759 return ("SO_SNDTIMEO");
1760 case SO_RCVTIMEO:
1761 return ("SO_RCVTIMEO");
1762 case SO_NKE:
1763 return ("SO_NKE");
1764 case SO_NOSIGPIPE:
1765 return ("SO_NOSIGPIPE");
1766 case SO_NOADDRERR:
1767 return ("SO_NOADDRERR");
1768 case SO_RESTRICTIONS:
1769 return ("SO_RESTRICTIONS");
1770 case SO_LABEL:
1771 return ("SO_LABEL");
1772 case SO_PEERLABEL:
1773 return ("SO_PEERLABEL");
1774 case SO_RANDOMPORT:
1775 return ("SO_RANDOMPORT");
1776 case SO_TRAFFIC_CLASS:
1777 return ("SO_TRAFFIC_CLASS");
1778 case SO_RECV_TRAFFIC_CLASS:
1779 return ("SO_RECV_TRAFFIC_CLASS");
1780 case SO_TRAFFIC_CLASS_DBG:
1781 return ("SO_TRAFFIC_CLASS_DBG");
1782 case SO_PRIVILEGED_TRAFFIC_CLASS:
1783 return ("SO_PRIVILEGED_TRAFFIC_CLASS");
1784 case SO_DEFUNCTOK:
1785 return ("SO_DEFUNCTOK");
1786 case SO_ISDEFUNCT:
1787 return ("SO_ISDEFUNCT");
1788 case SO_OPPORTUNISTIC:
1789 return ("SO_OPPORTUNISTIC");
1790 case SO_FLUSH:
1791 return ("SO_FLUSH");
1792 case SO_RECV_ANYIF:
1793 return ("SO_RECV_ANYIF");
1794 case SO_NOWAKEFROMSLEEP:
1795 return ("SO_NOWAKEFROMSLEEP");
1796 case SO_NOAPNFALLBK:
1797 return ("SO_NOAPNFALLBK");
1798 case SO_MARK_CELLFALLBACK:
1799 return ("SO_CELLFALLBACK");
1800 case SO_DELEGATED:
1801 return ("SO_DELEGATED");
1802 case SO_DELEGATED_UUID:
1803 return ("SO_DELEGATED_UUID");
1804 #if NECP
1805 case SO_NECP_ATTRIBUTES:
1806 return ("SO_NECP_ATTRIBUTES");
1807 case SO_NECP_CLIENTUUID:
1808 return ("SO_NECP_CLIENTUUID");
1809 #endif /* NECP */
1810 }
1811
1812 break;
1813 case IPPROTO_TCP:
1814 switch (optname) {
1815 case TCP_NODELAY:
1816 return ("TCP_NODELAY");
1817 case TCP_KEEPALIVE:
1818 return ("TCP_KEEPALIVE");
1819 case TCP_KEEPINTVL:
1820 return ("TCP_KEEPINTVL");
1821 case TCP_KEEPCNT:
1822 return ("TCP_KEEPCNT");
1823 case TCP_CONNECTIONTIMEOUT:
1824 return ("TCP_CONNECTIONTIMEOUT");
1825 case TCP_RXT_CONNDROPTIME:
1826 return ("TCP_RXT_CONNDROPTIME");
1827 case PERSIST_TIMEOUT:
1828 return ("PERSIST_TIMEOUT");
1829 case TCP_NOTSENT_LOWAT:
1830 return ("NOTSENT_LOWAT");
1831 case TCP_ADAPTIVE_READ_TIMEOUT:
1832 return ("ADAPTIVE_READ_TIMEOUT");
1833 case TCP_ADAPTIVE_WRITE_TIMEOUT:
1834 return ("ADAPTIVE_WRITE_TIMEOUT");
1835 case MPTCP_SERVICE_TYPE:
1836 return ("MPTCP_SERVICE_TYPE");
1837 case MPTCP_ALTERNATE_PORT:
1838 return ("MPTCP_ALTERNATE_PORT");
1839 }
1840
1841 break;
1842 }
1843
1844 return ("unknown");
1845 }
1846
1847 static int
1848 mptcp_usr_preconnect(struct socket *mp_so)
1849 {
1850 struct mptsub *mpts = NULL;
1851 struct mppcb *mpp = mpsotomppcb(mp_so);
1852 struct mptses *mpte;
1853 struct socket *so;
1854 struct tcpcb *tp = NULL;
1855 int error;
1856
1857 mpte = mptompte(mpp);
1858 VERIFY(mpte != NULL);
1859 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1860
1861 mpts = mptcp_get_subflow(mpte, NULL, NULL);
1862 if (mpts == NULL) {
1863 mptcplog((LOG_ERR, "%s: mp_so 0x%llx invalid preconnect ",
1864 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
1865 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1866 return (EINVAL);
1867 }
1868 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
1869 so = mpts->mpts_socket;
1870 tp = intotcpcb(sotoinpcb(so));
1871 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
1872 error = tcp_output(sototcpcb(so));
1873
1874 soclearfastopen(mp_so);
1875
1876 return (error);
1877 }