]> git.saurik.com Git - apple/xnu.git/blame - bsd/netinet/mptcp_usrreq.c
xnu-2422.1.72.tar.gz
[apple/xnu.git] / bsd / netinet / mptcp_usrreq.c
CommitLineData
39236c6e
A
1/*
2 * Copyright (c) 2012-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/kernel.h>
32#include <sys/socket.h>
33#include <sys/socketvar.h>
34#include <sys/protosw.h>
35#include <sys/mcache.h>
36#include <sys/syslog.h>
37#include <sys/proc.h>
38#include <sys/proc_internal.h>
39#include <sys/resourcevar.h>
40
41#include <net/if.h>
42#include <netinet/in.h>
43#include <netinet/in_var.h>
44#include <netinet/tcp.h>
45#include <netinet/tcp_fsm.h>
46#include <netinet/tcp_seq.h>
47#include <netinet/tcp_var.h>
48#include <netinet/tcp_timer.h>
49#include <netinet/mptcp_var.h>
50#include <netinet/mptcp_timer.h>
51
52#include <mach/sdt.h>
53
54static int mptcp_usr_attach(struct socket *, int, struct proc *);
55static int mptcp_usr_detach(struct socket *);
56static int mptcp_attach(struct socket *, struct proc *);
57static int mptcp_detach(struct socket *, struct mppcb *);
58static int mptcp_connectx(struct mptses *, struct sockaddr_list **,
59 struct sockaddr_list **, struct proc *, uint32_t, associd_t, connid_t *,
60 uint32_t, void *, uint32_t);
61static int mptcp_usr_connectx(struct socket *, struct sockaddr_list **,
62 struct sockaddr_list **, struct proc *, uint32_t, associd_t, connid_t *,
63 uint32_t, void *, uint32_t);
64static int mptcp_getassocids(struct mptses *, uint32_t *, user_addr_t);
65static int mptcp_getconnids(struct mptses *, associd_t, uint32_t *,
66 user_addr_t);
67static int mptcp_getconninfo(struct mptses *, connid_t *, uint32_t *,
68 uint32_t *, int32_t *, user_addr_t, socklen_t *, user_addr_t, socklen_t *,
69 uint32_t *, user_addr_t, uint32_t *);
70static int mptcp_usr_control(struct socket *, u_long, caddr_t, struct ifnet *,
71 struct proc *);
72static int mptcp_disconnectx(struct mptses *, associd_t, connid_t);
73static int mptcp_usr_disconnectx(struct socket *, associd_t, connid_t);
74static struct mptses *mptcp_usrclosed(struct mptses *);
75static int mptcp_usr_peeloff(struct socket *, associd_t, struct socket **);
76static int mptcp_peeloff(struct mptses *, associd_t, struct socket **);
77static int mptcp_usr_rcvd(struct socket *, int);
78static int mptcp_usr_send(struct socket *, int, struct mbuf *,
79 struct sockaddr *, struct mbuf *, struct proc *);
80static int mptcp_usr_shutdown(struct socket *);
81static int mptcp_uiotombuf(struct uio *, int, int, uint32_t, struct mbuf **);
82static int mptcp_usr_sosend(struct socket *, struct sockaddr *, struct uio *,
83 struct mbuf *, struct mbuf *, int);
84static int mptcp_usr_socheckopt(struct socket *, struct sockopt *);
85static int mptcp_setopt_apply(struct mptses *, struct mptopt *);
86static int mptcp_setopt(struct mptses *, struct sockopt *);
87static int mptcp_getopt(struct mptses *, struct sockopt *);
88static int mptcp_default_tcp_optval(struct mptses *, struct sockopt *, int *);
89static void mptcp_connorder_helper(struct mptsub *mpts);
90
91struct pr_usrreqs mptcp_usrreqs = {
92 .pru_attach = mptcp_usr_attach,
93 .pru_connectx = mptcp_usr_connectx,
94 .pru_control = mptcp_usr_control,
95 .pru_detach = mptcp_usr_detach,
96 .pru_disconnectx = mptcp_usr_disconnectx,
97 .pru_peeloff = mptcp_usr_peeloff,
98 .pru_rcvd = mptcp_usr_rcvd,
99 .pru_send = mptcp_usr_send,
100 .pru_shutdown = mptcp_usr_shutdown,
101 .pru_sosend = mptcp_usr_sosend,
102 .pru_soreceive = soreceive,
103 .pru_socheckopt = mptcp_usr_socheckopt,
104};
105
106/*
107 * Attaches an MPTCP control block to a socket.
108 */
109static int
110mptcp_usr_attach(struct socket *mp_so, int proto, struct proc *p)
111{
112#pragma unused(proto)
113 int error;
114
115 VERIFY(sotomppcb(mp_so) == NULL);
116
117 error = mptcp_attach(mp_so, p);
118 if (error != 0)
119 goto out;
120 /*
121 * XXX: adi@apple.com
122 *
123 * Might want to use a different SO_LINGER timeout than TCP's?
124 */
125 if ((mp_so->so_options & SO_LINGER) && mp_so->so_linger == 0)
126 mp_so->so_linger = TCP_LINGERTIME * hz;
127out:
128 return (error);
129}
130
131/*
132 * Detaches an MPTCP control block from a socket.
133 */
134static int
135mptcp_usr_detach(struct socket *mp_so)
136{
137 struct mppcb *mpp = sotomppcb(mp_so);
138 int error = 0;
139
140 VERIFY(mpp != NULL);
141 VERIFY(mpp->mpp_socket != NULL);
142
143 error = mptcp_detach(mp_so, mpp);
144 return (error);
145}
146
147/*
148 * Attach MPTCP protocol to socket, allocating MP control block,
149 * MPTCP session, control block, buffer space, etc.
150 */
151static int
152mptcp_attach(struct socket *mp_so, struct proc *p)
153{
154#pragma unused(p)
155 struct mptses *mpte;
156 struct mptcb *mp_tp;
157 struct mppcb *mpp;
158 int error = 0;
159
160 if (mp_so->so_snd.sb_hiwat == 0 || mp_so->so_rcv.sb_hiwat == 0) {
161 error = soreserve(mp_so, tcp_sendspace, MPTCP_RWIN_MAX);
162 if (error != 0)
163 goto out;
164 }
165
166 /*
167 * MPTCP socket buffers cannot be compressed, due to the
168 * fact that each mbuf chained via m_next is a M_PKTHDR
169 * which carries some MPTCP metadata.
170 */
171 mp_so->so_snd.sb_flags |= SB_NOCOMPRESS;
172 mp_so->so_rcv.sb_flags |= SB_NOCOMPRESS;
173
174 /* Disable socket buffer auto-tuning. */
175 mp_so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
176 mp_so->so_snd.sb_flags &= ~SB_AUTOSIZE;
177
178 if ((error = mp_pcballoc(mp_so, &mtcbinfo)) != 0)
179 goto out;
180
181 mpp = sotomppcb(mp_so);
182 VERIFY(mpp != NULL);
183
184 mpte = mptcp_sescreate(mp_so, mpp);
185 if (mpte == NULL) {
186 mp_pcbdetach(mpp);
187 error = ENOBUFS;
188 goto out;
189 }
190 mp_tp = mpte->mpte_mptcb;
191 VERIFY(mp_tp != NULL);
192
193 MPT_LOCK(mp_tp);
194 mp_tp->mpt_state = MPTCPS_CLOSED;
195 MPT_UNLOCK(mp_tp);
196
197out:
198 return (error);
199}
200
201/*
202 * Called when the socket layer loses its final reference to the socket;
203 * at this point, there is only one case in which we will keep things
204 * around: time wait.
205 */
206static int
207mptcp_detach(struct socket *mp_so, struct mppcb *mpp)
208{
209 struct mptses *mpte;
210 struct mppcbinfo *mppi;
211
212 VERIFY(mp_so->so_pcb == mpp);
213 VERIFY(mpp->mpp_socket == mp_so);
214
215 mppi = mpp->mpp_pcbinfo;
216 VERIFY(mppi != NULL);
217
218 mpte = &((struct mpp_mtp *)mpp)->mpp_ses;
219 VERIFY(mpte->mpte_mppcb == mpp);
220
221 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
222
223 /*
224 * We are done with this MPTCP socket (it has been closed);
225 * trigger all subflows to be disconnected, if not already,
226 * by initiating the PCB detach sequence (SOF_PCBCLEARING
227 * will be set.)
228 */
229 mp_pcbdetach(mpp);
230
231 (void) mptcp_disconnectx(mpte, ASSOCID_ALL, CONNID_ALL);
232
233 /*
234 * XXX: adi@apple.com
235 *
236 * Here, we would want to handle time wait state.
237 */
238
239 return (0);
240}
241
242/*
243 * Common subroutine to open a MPTCP connection to one of the remote hosts
244 * specified by dst_sl. This includes allocating and establishing a
245 * subflow TCP connection, either initially to establish MPTCP connection,
246 * or to join an existing one. Returns a connection handle upon success.
247 */
248static int
249mptcp_connectx(struct mptses *mpte, struct sockaddr_list **src_sl,
250 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
251 associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
252 uint32_t arglen)
253{
254#pragma unused(p, aid, flags, arg, arglen)
255 struct mptsub *mpts;
256 struct socket *mp_so;
257 int error = 0;
258
259 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
260 mp_so = mpte->mpte_mppcb->mpp_socket;
261
262 VERIFY(dst_sl != NULL && *dst_sl != NULL);
263 VERIFY(pcid != NULL);
264
265 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__,
266 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));
267 DTRACE_MPTCP3(connectx, struct mptses *, mpte, associd_t, aid,
268 struct socket *, mp_so);
269
270 mpts = mptcp_subflow_alloc(M_WAITOK);
271 if (mpts == NULL) {
272 error = ENOBUFS;
273 goto out;
274 }
275 MPTS_ADDREF(mpts); /* for this routine */
276
277 if (src_sl != NULL) {
278 mpts->mpts_src_sl = *src_sl;
279 *src_sl = NULL;
280 }
281 mpts->mpts_dst_sl = *dst_sl;
282 *dst_sl = NULL;
283
284 error = mptcp_subflow_add(mpte, mpts, p, ifscope);
285 if (error == 0 && pcid != NULL)
286 *pcid = mpts->mpts_connid;
287
288out:
289 if (mpts != NULL) {
290 if ((error != 0) && (error != EWOULDBLOCK)) {
291 MPTS_LOCK(mpts);
292 if (mpts->mpts_flags & MPTSF_ATTACHED) {
293 MPTS_UNLOCK(mpts);
294 MPTS_REMREF(mpts);
295 mptcp_subflow_del(mpte, mpts, TRUE);
296 return (error);
297 }
298 MPTS_UNLOCK(mpts);
299 }
300 MPTS_REMREF(mpts);
301 }
302
303 return (error);
304}
305
306/*
307 * User-protocol pru_connectx callback.
308 */
309static int
310mptcp_usr_connectx(struct socket *mp_so, struct sockaddr_list **src_sl,
311 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
312 associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
313 uint32_t arglen)
314{
315#pragma unused(arg, arglen)
316 struct mppcb *mpp = sotomppcb(mp_so);
317 struct mptses *mpte;
318 int error = 0;
319
320 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
321 error = EINVAL;
322 goto out;
323 }
324 mpte = mptompte(mpp);
325 VERIFY(mpte != NULL);
326
327 error = mptcp_connectx(mpte, src_sl, dst_sl, p, ifscope,
328 aid, pcid, flags, arg, arglen);
329out:
330 return (error);
331}
332
333/*
334 * Handle SIOCGASSOCIDS ioctl for PF_MULTIPATH domain.
335 */
336static int
337mptcp_getassocids(struct mptses *mpte, uint32_t *cnt, user_addr_t aidp)
338{
339 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
340
341 /* MPTCP has at most 1 association */
342 *cnt = (mpte->mpte_associd != ASSOCID_ANY) ? 1 : 0;
343
344 /* just asking how many there are? */
345 if (aidp == USER_ADDR_NULL)
346 return (0);
347
348 return (copyout(&mpte->mpte_associd, aidp,
349 sizeof (mpte->mpte_associd)));
350}
351
352/*
353 * Handle SIOCGCONNIDS ioctl for PF_MULTIPATH domain.
354 */
355static int
356mptcp_getconnids(struct mptses *mpte, associd_t aid, uint32_t *cnt,
357 user_addr_t cidp)
358{
359 struct mptsub *mpts;
360 int error = 0;
361
362 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
363
364 if (aid != ASSOCID_ANY && aid != ASSOCID_ALL &&
365 aid != mpte->mpte_associd)
366 return (EINVAL);
367
368 *cnt = mpte->mpte_numflows;
369
370 /* just asking how many there are? */
371 if (cidp == USER_ADDR_NULL)
372 return (0);
373
374 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
375 if ((error = copyout(&mpts->mpts_connid, cidp,
376 sizeof (mpts->mpts_connid))) != 0)
377 break;
378
379 cidp += sizeof (mpts->mpts_connid);
380 }
381
382 return (error);
383}
384
385/*
386 * Handle SIOCGCONNINFO ioctl for PF_MULTIPATH domain.
387 */
388static int
389mptcp_getconninfo(struct mptses *mpte, connid_t *cid, uint32_t *flags,
390 uint32_t *ifindex, int32_t *soerror, user_addr_t src, socklen_t *src_len,
391 user_addr_t dst, socklen_t *dst_len, uint32_t *aux_type,
392 user_addr_t aux_data, uint32_t *aux_len)
393{
394#pragma unused(aux_data)
395 struct sockaddr_entry *se;
396 struct ifnet *ifp = NULL;
397 struct mptsub *mpts;
398 int error = 0;
399
400 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
401
402 if (*cid == CONNID_ALL)
403 return (EINVAL);
404
405 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
406 if (mpts->mpts_connid == *cid || *cid == CONNID_ANY)
407 break;
408 }
409 if (mpts == NULL)
410 return ((*cid == CONNID_ANY) ? ENXIO : EINVAL);
411
412 MPTS_LOCK(mpts);
413 ifp = mpts->mpts_outif;
414 *cid = mpts->mpts_connid;
415 *ifindex = ((ifp != NULL) ? ifp->if_index : 0);
416 *soerror = mpts->mpts_soerror;
417 *flags = 0;
418 if (mpts->mpts_flags & MPTSF_CONNECTING)
419 *flags |= CIF_CONNECTING;
420 if (mpts->mpts_flags & MPTSF_CONNECTED)
421 *flags |= CIF_CONNECTED;
422 if (mpts->mpts_flags & MPTSF_DISCONNECTING)
423 *flags |= CIF_DISCONNECTING;
424 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
425 *flags |= CIF_DISCONNECTED;
426 if (mpts->mpts_flags & MPTSF_BOUND_IF)
427 *flags |= CIF_BOUND_IF;
428 if (mpts->mpts_flags & MPTSF_BOUND_IP)
429 *flags |= CIF_BOUND_IP;
430 if (mpts->mpts_flags & MPTSF_BOUND_PORT)
431 *flags |= CIF_BOUND_PORT;
432 if (mpts->mpts_flags & MPTSF_PREFERRED)
433 *flags |= CIF_PREFERRED;
434 if (mpts->mpts_flags & MPTSF_MP_CAPABLE)
435 *flags |= CIF_MP_CAPABLE;
436 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
437 *flags |= CIF_MP_DEGRADED;
438 if (mpts->mpts_flags & MPTSF_MP_READY)
439 *flags |= CIF_MP_READY;
440 if (mpts->mpts_flags & MPTSF_ACTIVE)
441 *flags |= CIF_MP_ACTIVE;
442
443 VERIFY(mpts->mpts_src_sl != NULL);
444 se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head);
445 VERIFY(se != NULL && se->se_addr != NULL);
446 *src_len = se->se_addr->sa_len;
447 if (src != USER_ADDR_NULL) {
448 error = copyout(se->se_addr, src, se->se_addr->sa_len);
449 if (error != 0)
450 goto out;
451 }
452
453 VERIFY(mpts->mpts_dst_sl != NULL);
454 se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head);
455 VERIFY(se != NULL && se->se_addr != NULL);
456 *dst_len = se->se_addr->sa_len;
457 if (dst != USER_ADDR_NULL) {
458 error = copyout(se->se_addr, dst, se->se_addr->sa_len);
459 if (error != 0)
460 goto out;
461 }
462
463 *aux_type = 0;
464 *aux_len = 0;
465 if (mpts->mpts_socket != NULL) {
466 struct conninfo_tcp tcp_ci;
467
468 *aux_type = CIAUX_TCP;
469 *aux_len = sizeof (tcp_ci);
470
471 if (aux_data != USER_ADDR_NULL) {
472 struct socket *so = mpts->mpts_socket;
473
474 VERIFY(SOCK_PROTO(so) == IPPROTO_TCP);
475 bzero(&tcp_ci, sizeof (tcp_ci));
476 socket_lock(so, 0);
477 tcp_getconninfo(so, &tcp_ci);
478 socket_unlock(so, 0);
479 error = copyout(&tcp_ci, aux_data, sizeof (tcp_ci));
480 if (error != 0)
481 goto out;
482 }
483 }
484out:
485 MPTS_UNLOCK(mpts);
486 return (error);
487}
488
489/*
490 * Handle SIOCSCONNORDER
491 */
492int
493mptcp_setconnorder(struct mptses *mpte, connid_t cid, uint32_t rank)
494{
495 struct mptsub *mpts, *mpts1;
496 int error = 0;
497
498 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
499 mptcplog((LOG_DEBUG, "%s: cid %d rank %d \n", __func__, cid, rank));
500
501 if (cid == CONNID_ANY || cid == CONNID_ALL) {
502 error = EINVAL;
503 goto out;
504 }
505
506 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
507 if (mpts->mpts_connid == cid)
508 break;
509 }
510 if (mpts == NULL) {
511 error = ENXIO;
512 goto out;
513 }
514
515 if (rank == 0 || rank > 1) {
516 /*
517 * If rank is 0, determine whether this should be the
518 * primary or backup subflow, depending on what we have.
519 *
520 * Otherwise, if greater than 0, make it a backup flow.
521 */
522 TAILQ_FOREACH(mpts1, &mpte->mpte_subflows, mpts_entry) {
523 MPTS_LOCK(mpts1);
524 if (mpts1->mpts_flags & MPTSF_PREFERRED) {
525 MPTS_UNLOCK(mpts1);
526 break;
527 }
528 MPTS_UNLOCK(mpts1);
529 }
530
531 MPTS_LOCK(mpts);
532 mpts->mpts_flags &= ~MPTSF_PREFERRED;
533 mpts->mpts_rank = rank;
534 if (mpts1 != NULL && mpts != mpts1) {
535 /* preferred subflow found; set rank as necessary */
536 if (rank == 0)
537 mpts->mpts_rank = (mpts1->mpts_rank + 1);
538 } else if (rank == 0) {
539 /* no preferred one found; promote this */
540 rank = 1;
541 }
542 MPTS_UNLOCK(mpts);
543 }
544
545 if (rank == 1) {
546 /*
547 * If rank is 1, promote this subflow to be preferred.
548 */
549 TAILQ_FOREACH(mpts1, &mpte->mpte_subflows, mpts_entry) {
550 MPTS_LOCK(mpts1);
551 if (mpts1 != mpts &&
552 (mpts1->mpts_flags & MPTSF_PREFERRED)) {
553 mpts1->mpts_flags &= ~MPTSF_PREFERRED;
554 if (mpte->mpte_nummpcapflows > 1)
555 mptcp_connorder_helper(mpts1);
556 } else if (mpts1 == mpts) {
557 mpts1->mpts_rank = 1;
558 if (mpts1->mpts_flags & MPTSF_MP_CAPABLE) {
559 mpts1->mpts_flags |= MPTSF_PREFERRED;
560 if (mpte->mpte_nummpcapflows > 1)
561 mptcp_connorder_helper(mpts1);
562 }
563 }
564 MPTS_UNLOCK(mpts1);
565 }
566 }
567
568out:
569 return (error);
570}
571
572static void
573mptcp_connorder_helper(struct mptsub *mpts)
574{
575 struct socket *so = mpts->mpts_socket;
576 struct tcpcb *tp = NULL;
577
578 socket_lock(so, 0);
579
580 tp = intotcpcb(sotoinpcb(so));
581 tp->t_mpflags |= TMPF_SND_MPPRIO;
582 if (mpts->mpts_flags & MPTSF_PREFERRED)
583 tp->t_mpflags &= ~TMPF_BACKUP_PATH;
584 else
585 tp->t_mpflags |= TMPF_BACKUP_PATH;
586 mptcplog((LOG_DEBUG, "%s cid %d flags %x", __func__,
587 mpts->mpts_connid, mpts->mpts_flags));
588 socket_unlock(so, 0);
589
590}
591
592/*
593 * Handle SIOCSGONNORDER
594 */
595int
596mptcp_getconnorder(struct mptses *mpte, connid_t cid, uint32_t *rank)
597{
598 struct mptsub *mpts;
599 int error = 0;
600
601 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
602 VERIFY(rank != NULL);
603 *rank = 0;
604
605 if (cid == CONNID_ANY || cid == CONNID_ALL) {
606 error = EINVAL;
607 goto out;
608 }
609
610 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
611 if (mpts->mpts_connid == cid)
612 break;
613 }
614 if (mpts == NULL) {
615 error = ENXIO;
616 goto out;
617 }
618
619 MPTS_LOCK(mpts);
620 *rank = mpts->mpts_rank;
621 MPTS_UNLOCK(mpts);
622out:
623 return (error);
624}
625
626/*
627 * User-protocol pru_control callback.
628 */
629static int
630mptcp_usr_control(struct socket *mp_so, u_long cmd, caddr_t data,
631 struct ifnet *ifp, struct proc *p)
632{
633#pragma unused(ifp, p)
634 struct mppcb *mpp = sotomppcb(mp_so);
635 struct mptses *mpte;
636 int error = 0;
637
638 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
639 error = EINVAL;
640 goto out;
641 }
642 mpte = mptompte(mpp);
643 VERIFY(mpte != NULL);
644
645 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
646
647 switch (cmd) {
648 case SIOCGASSOCIDS32: { /* struct so_aidreq32 */
649 struct so_aidreq32 aidr;
650 bcopy(data, &aidr, sizeof (aidr));
651 error = mptcp_getassocids(mpte, &aidr.sar_cnt,
652 aidr.sar_aidp);
653 if (error == 0)
654 bcopy(&aidr, data, sizeof (aidr));
655 break;
656 }
657
658 case SIOCGASSOCIDS64: { /* struct so_aidreq64 */
659 struct so_aidreq64 aidr;
660 bcopy(data, &aidr, sizeof (aidr));
661 error = mptcp_getassocids(mpte, &aidr.sar_cnt,
662 aidr.sar_aidp);
663 if (error == 0)
664 bcopy(&aidr, data, sizeof (aidr));
665 break;
666 }
667
668 case SIOCGCONNIDS32: { /* struct so_cidreq32 */
669 struct so_cidreq32 cidr;
670 bcopy(data, &cidr, sizeof (cidr));
671 error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt,
672 cidr.scr_cidp);
673 if (error == 0)
674 bcopy(&cidr, data, sizeof (cidr));
675 break;
676 }
677
678 case SIOCGCONNIDS64: { /* struct so_cidreq64 */
679 struct so_cidreq64 cidr;
680 bcopy(data, &cidr, sizeof (cidr));
681 error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt,
682 cidr.scr_cidp);
683 if (error == 0)
684 bcopy(&cidr, data, sizeof (cidr));
685 break;
686 }
687
688 case SIOCGCONNINFO32: { /* struct so_cinforeq32 */
689 struct so_cinforeq32 cifr;
690 bcopy(data, &cifr, sizeof (cifr));
691 error = mptcp_getconninfo(mpte, &cifr.scir_cid,
692 &cifr.scir_flags, &cifr.scir_ifindex, &cifr.scir_error,
693 cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst,
694 &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data,
695 &cifr.scir_aux_len);
696 if (error == 0)
697 bcopy(&cifr, data, sizeof (cifr));
698 break;
699 }
700
701 case SIOCGCONNINFO64: { /* struct so_cinforeq64 */
702 struct so_cinforeq64 cifr;
703 bcopy(data, &cifr, sizeof (cifr));
704 error = mptcp_getconninfo(mpte, &cifr.scir_cid,
705 &cifr.scir_flags, &cifr.scir_ifindex, &cifr.scir_error,
706 cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst,
707 &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data,
708 &cifr.scir_aux_len);
709 if (error == 0)
710 bcopy(&cifr, data, sizeof (cifr));
711 break;
712 }
713
714 case SIOCSCONNORDER: { /* struct so_cordreq */
715 struct so_cordreq cor;
716 bcopy(data, &cor, sizeof (cor));
717 error = mptcp_setconnorder(mpte, cor.sco_cid, cor.sco_rank);
718 if (error == 0)
719 bcopy(&cor, data, sizeof (cor));
720 break;
721 }
722
723 case SIOCGCONNORDER: { /* struct so_cordreq */
724 struct so_cordreq cor;
725 bcopy(data, &cor, sizeof (cor));
726 error = mptcp_getconnorder(mpte, cor.sco_cid, &cor.sco_rank);
727 if (error == 0)
728 bcopy(&cor, data, sizeof (cor));
729 break;
730 }
731
732 default:
733 error = EOPNOTSUPP;
734 break;
735 }
736out:
737 return (error);
738}
739
740/*
741 * Initiate a disconnect. MPTCP-level disconnection is specified by
742 * CONNID_{ANY,ALL}. Otherwise, selectively disconnect a subflow
743 * connection while keeping the MPTCP-level connection (association).
744 */
745static int
746mptcp_disconnectx(struct mptses *mpte, associd_t aid, connid_t cid)
747{
748 struct mptsub *mpts;
749 struct socket *mp_so;
750 struct mptcb *mp_tp;
751 int error = 0;
752
753 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
754
755 mp_so = mpte->mpte_mppcb->mpp_socket;
756 mp_tp = mpte->mpte_mptcb;
757
758 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx aid %d cid %d\n", __func__,
759 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), aid, cid));
760 DTRACE_MPTCP5(disconnectx, struct mptses *, mpte, associd_t, aid,
761 connid_t, cid, struct socket *, mp_so, struct mptcb *, mp_tp);
762
763 VERIFY(aid == ASSOCID_ANY || aid == ASSOCID_ALL ||
764 aid == mpte->mpte_associd);
765
766 /* terminate the association? */
767 if (cid == CONNID_ANY || cid == CONNID_ALL) {
768 /* if we're not detached, go thru socket state checks */
769 if (!(mp_so->so_flags & SOF_PCBCLEARING)) {
770 if (!(mp_so->so_state & (SS_ISCONNECTED|
771 SS_ISCONNECTING))) {
772 error = ENOTCONN;
773 goto out;
774 }
775 if (mp_so->so_state & SS_ISDISCONNECTING) {
776 error = EALREADY;
777 goto out;
778 }
779 }
780 MPT_LOCK(mp_tp);
781 mptcp_cancel_all_timers(mp_tp);
782 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
783 (void) mptcp_close(mpte, mp_tp);
784 MPT_UNLOCK(mp_tp);
785 } else if ((mp_so->so_options & SO_LINGER) &&
786 mp_so->so_linger == 0) {
787 (void) mptcp_drop(mpte, mp_tp, 0);
788 MPT_UNLOCK(mp_tp);
789 } else {
790 MPT_UNLOCK(mp_tp);
791 soisdisconnecting(mp_so);
792 sbflush(&mp_so->so_rcv);
793 if (mptcp_usrclosed(mpte) != NULL)
794 (void) mptcp_output(mpte);
795 }
796 } else {
797 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
798 if (mpts->mpts_connid != cid)
799 continue;
800 MPTS_LOCK(mpts);
801 mptcp_subflow_disconnect(mpte, mpts, FALSE);
802 MPTS_UNLOCK(mpts);
803 break;
804 }
805
806 if (mpts == NULL) {
807 error = EINVAL;
808 goto out;
809 }
810 }
811
812 if (error == 0)
813 mptcp_thread_signal(mpte);
814
815 if ((mp_so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
816 (SS_CANTRCVMORE | SS_CANTSENDMORE)) {
817 /* the socket has been shutdown, no more sockopt's */
818 mptcp_flush_sopts(mpte);
819 }
820
821out:
822 return (error);
823}
824
825/*
826 * User-protocol pru_disconnectx callback.
827 */
828static int
829mptcp_usr_disconnectx(struct socket *mp_so, associd_t aid, connid_t cid)
830{
831 struct mppcb *mpp = sotomppcb(mp_so);
832 struct mptses *mpte;
833 int error = 0;
834
835 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
836 error = EINVAL;
837 goto out;
838 }
839 mpte = mptompte(mpp);
840 VERIFY(mpte != NULL);
841 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
842
843 if (aid != ASSOCID_ANY && aid != ASSOCID_ALL &&
844 aid != mpte->mpte_associd) {
845 error = EINVAL;
846 goto out;
847 }
848
849 error = mptcp_disconnectx(mpte, aid, cid);
850out:
851 return (error);
852}
853
854/*
855 * User issued close, and wish to trail thru shutdown states.
856 */
857static struct mptses *
858mptcp_usrclosed(struct mptses *mpte)
859{
860 struct socket *mp_so;
861 struct mptcb *mp_tp;
862 struct mptsub *mpts;
863
864 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
865 mp_so = mpte->mpte_mppcb->mpp_socket;
866 mp_tp = mpte->mpte_mptcb;
867
868 MPT_LOCK(mp_tp);
869 mptcp_close_fsm(mp_tp, MPCE_CLOSE);
870
871 if (mp_tp->mpt_state == TCPS_CLOSED) {
872 mpte = mptcp_close(mpte, mp_tp);
873 MPT_UNLOCK(mp_tp);
874 } else if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
875 MPT_UNLOCK(mp_tp);
876 soisdisconnected(mp_so);
877 } else {
878 mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
879 MPT_UNLOCK(mp_tp);
880
881 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
882 MPTS_LOCK(mpts);
883 mptcp_subflow_disconnect(mpte, mpts, FALSE);
884 MPTS_UNLOCK(mpts);
885 }
886 }
887 /*
888 * XXX: adi@apple.com
889 *
890 * Do we need to handle time wait specially here? We need to handle
891 * the case where MPTCP has been established, but we have not usable
892 * subflow to use. Do we want to wait a while before forcibly
893 * tearing this MPTCP down, in case we have one or more subflows
894 * that are flow controlled?
895 */
896
897 return (mpte);
898}
899
900/*
901 * User-protocol pru_peeloff callback.
902 */
903static int
904mptcp_usr_peeloff(struct socket *mp_so, associd_t aid, struct socket **psop)
905{
906 struct mppcb *mpp = sotomppcb(mp_so);
907 struct mptses *mpte;
908 int error = 0;
909
910 VERIFY(psop != NULL);
911
912 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
913 error = EINVAL;
914 goto out;
915 }
916 mpte = mptompte(mpp);
917 VERIFY(mpte != NULL);
918
919 error = mptcp_peeloff(mpte, aid, psop);
920out:
921 return (error);
922}
923
924/*
925 * Transform a previously connected TCP subflow connection which has
926 * failed to negotiate MPTCP to its own socket which can be externalized
927 * with a file descriptor. Valid only when the MPTCP socket is not
928 * yet associated (MPTCP-level connection has not been established.)
929 */
930static int
931mptcp_peeloff(struct mptses *mpte, associd_t aid, struct socket **psop)
932{
933 struct socket *so = NULL, *mp_so;
934 struct mptsub *mpts;
935 int error = 0;
936
937 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
938 mp_so = mpte->mpte_mppcb->mpp_socket;
939
940 VERIFY(psop != NULL);
941 *psop = NULL;
942
943 DTRACE_MPTCP3(peeloff, struct mptses *, mpte, associd_t, aid,
944 struct socket *, mp_so);
945
946 /* peeloff cannot happen after an association is established */
947 if (mpte->mpte_associd != ASSOCID_ANY) {
948 error = EINVAL;
949 goto out;
950 }
951
952 if (aid != ASSOCID_ANY && aid != ASSOCID_ALL) {
953 error = EINVAL;
954 goto out;
955 }
956
957 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
958 MPTS_LOCK(mpts);
959 if (mpts->mpts_flags & MPTSF_MP_CAPABLE) {
960 panic("%s: so %p is MPTCP capable but mp_so %p "
961 "aid is %d\n", __func__, so, mp_so,
962 mpte->mpte_associd);
963 /* NOTREACHED */
964 }
965 MPTS_ADDREF_LOCKED(mpts); /* for us */
966 so = mpts->mpts_socket;
967 VERIFY(so != NULL);
968 /*
969 * This subflow socket is about to be externalized; make it
970 * appear as if it has the same properties as the MPTCP socket,
971 * undo what's done earlier in mptcp_subflow_add().
972 */
973 mptcp_subflow_sopeeloff(mpte, mpts, so);
974 MPTS_UNLOCK(mpts);
975
976 mptcp_subflow_del(mpte, mpts, FALSE);
977 MPTS_REMREF(mpts); /* ours */
978 /*
979 * XXX adi@apple.com
980 *
981 * Here we need to make sure the subflow socket is not
982 * flow controlled; need to clear both INP_FLOW_CONTROLLED
983 * and INP_FLOW_SUSPENDED on the subflow socket, since
984 * we will no longer be monitoring its events.
985 */
986 break;
987 }
988
989 if (so == NULL) {
990 error = EINVAL;
991 goto out;
992 }
993 *psop = so;
994
995 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__,
996 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)));
997out:
998 return (error);
999}
1000
1001/*
1002 * After a receive, possible send some update to peer.
1003 */
1004static int
1005mptcp_usr_rcvd(struct socket *mp_so, int flags)
1006{
1007#pragma unused(flags)
1008 struct mppcb *mpp = sotomppcb(mp_so);
1009 struct mptses *mpte;
1010 int error = 0;
1011
1012 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
1013 error = EINVAL;
1014 goto out;
1015 }
1016 mpte = mptompte(mpp);
1017 VERIFY(mpte != NULL);
1018
1019 error = mptcp_output(mpte);
1020out:
1021 return (error);
1022}
1023
1024/*
1025 * Do a send by putting data in the output queue.
1026 */
1027static int
1028mptcp_usr_send(struct socket *mp_so, int prus_flags, struct mbuf *m,
1029 struct sockaddr *nam, struct mbuf *control, struct proc *p)
1030{
1031#pragma unused(nam, p)
1032 struct mppcb *mpp = sotomppcb(mp_so);
1033 struct mptses *mpte;
1034 int error = 0;
1035
1036 if (prus_flags & (PRUS_OOB|PRUS_EOF)) {
1037 error = EOPNOTSUPP;
1038 goto out;
1039 }
1040
1041 if (nam != NULL) {
1042 error = EOPNOTSUPP;
1043 goto out;
1044 }
1045
1046 if (control != NULL && control->m_len != 0) {
1047 error = EOPNOTSUPP;
1048 goto out;
1049 }
1050
1051 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
1052 error = ECONNRESET;
1053 goto out;
1054 }
1055 mpte = mptompte(mpp);
1056 VERIFY(mpte != NULL);
1057
1058 if (!(mp_so->so_state & SS_ISCONNECTED)) {
1059 error = ENOTCONN;
1060 goto out;
1061 }
1062
1063 mptcp_insert_dsn(mpp, m);
1064 VERIFY(mp_so->so_snd.sb_flags & SB_NOCOMPRESS);
1065 (void) sbappendstream(&mp_so->so_snd, m);
1066 m = NULL;
1067
1068 if (mpte != NULL) {
1069 /*
1070 * XXX: adi@apple.com
1071 *
1072 * PRUS_MORETOCOME could be set, but we don't check it now.
1073 */
1074 error = mptcp_output(mpte);
1075 }
1076
1077out:
1078 if (error) {
1079 if (m != NULL)
1080 m_freem(m);
1081 if (control != NULL)
1082 m_freem(control);
1083 }
1084 return (error);
1085}
1086
1087/*
1088 * Mark the MPTCP connection as being incapable of further output.
1089 */
1090static int
1091mptcp_usr_shutdown(struct socket *mp_so)
1092{
1093 struct mppcb *mpp = sotomppcb(mp_so);
1094 struct mptses *mpte;
1095 int error = 0;
1096
1097 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
1098 error = EINVAL;
1099 goto out;
1100 }
1101 mpte = mptompte(mpp);
1102 VERIFY(mpte != NULL);
1103
1104 socantsendmore(mp_so);
1105
1106 mpte = mptcp_usrclosed(mpte);
1107 if (mpte != NULL)
1108 error = mptcp_output(mpte);
1109out:
1110 return (error);
1111}
1112
1113/*
1114 * Copy the contents of uio into a properly sized mbuf chain.
1115 */
1116static int
1117mptcp_uiotombuf(struct uio *uio, int how, int space, uint32_t align,
1118 struct mbuf **top)
1119{
1120 struct mbuf *m, *mb, *nm = NULL, *mtail = NULL;
1121 user_ssize_t resid, tot, len, progress; /* must be user_ssize_t */
1122 int error;
1123
1124 VERIFY(top != NULL && *top == NULL);
1125
1126 /*
1127 * space can be zero or an arbitrary large value bound by
1128 * the total data supplied by the uio.
1129 */
1130 resid = uio_resid(uio);
1131 if (space > 0)
1132 tot = imin(resid, space);
1133 else
1134 tot = resid;
1135
1136 /*
1137 * The smallest unit is a single mbuf with pkthdr.
1138 * We can't align past it.
1139 */
1140 if (align >= MHLEN)
1141 return (EINVAL);
1142
1143 /*
1144 * Give us the full allocation or nothing.
1145 * If space is zero return the smallest empty mbuf.
1146 */
1147 if ((len = tot + align) == 0)
1148 len = 1;
1149
1150 /* Loop and append maximum sized mbufs to the chain tail. */
1151 while (len > 0) {
1152 uint32_t m_needed = 1;
1153
1154 if (njcl > 0 && len > MBIGCLBYTES)
1155 mb = m_getpackets_internal(&m_needed, 1,
1156 how, 1, M16KCLBYTES);
1157 else if (len > MCLBYTES)
1158 mb = m_getpackets_internal(&m_needed, 1,
1159 how, 1, MBIGCLBYTES);
1160 else if (len >= (signed)MINCLSIZE)
1161 mb = m_getpackets_internal(&m_needed, 1,
1162 how, 1, MCLBYTES);
1163 else
1164 mb = m_gethdr(how, MT_DATA);
1165
1166 /* Fail the whole operation if one mbuf can't be allocated. */
1167 if (mb == NULL) {
1168 if (nm != NULL)
1169 m_freem(nm);
1170 return (ENOBUFS);
1171 }
1172
1173 /* Book keeping. */
1174 VERIFY(mb->m_flags & M_PKTHDR);
1175 len -= ((mb->m_flags & M_EXT) ? mb->m_ext.ext_size : MHLEN);
1176 if (mtail != NULL)
1177 mtail->m_next = mb;
1178 else
1179 nm = mb;
1180 mtail = mb;
1181 }
1182
1183 m = nm;
1184 m->m_data += align;
1185
1186 progress = 0;
1187 /* Fill all mbufs with uio data and update header information. */
1188 for (mb = m; mb != NULL; mb = mb->m_next) {
1189 len = imin(M_TRAILINGSPACE(mb), tot - progress);
1190
1191 error = uiomove(mtod(mb, char *), len, uio);
1192 if (error != 0) {
1193 m_freem(m);
1194 return (error);
1195 }
1196
1197 /* each mbuf is M_PKTHDR chained via m_next */
1198 mb->m_len = len;
1199 mb->m_pkthdr.len = len;
1200
1201 progress += len;
1202 }
1203 VERIFY(progress == tot);
1204 *top = m;
1205 return (0);
1206}
1207
1208/*
1209 * MPTCP socket protocol-user socket send routine, derived from sosend().
1210 */
1211static int
1212mptcp_usr_sosend(struct socket *mp_so, struct sockaddr *addr, struct uio *uio,
1213 struct mbuf *top, struct mbuf *control, int flags)
1214{
1215#pragma unused(addr)
1216 int32_t space;
1217 user_ssize_t resid;
1218 int error, sendflags;
1219 struct proc *p = current_proc();
1220 int sblocked = 0;
1221
1222 /* UIO is required for now, due to per-mbuf M_PKTHDR constrains */
1223 if (uio == NULL || top != NULL) {
1224 error = EINVAL;
1225 goto out;
1226 }
1227 resid = uio_resid(uio);
1228
1229 socket_lock(mp_so, 1);
1230 so_update_last_owner_locked(mp_so, p);
1231 so_update_policy(mp_so);
1232
1233 VERIFY(mp_so->so_type == SOCK_STREAM);
1234 VERIFY(!(mp_so->so_flags & SOF_MP_SUBFLOW));
1235
1236 if ((flags & (MSG_OOB|MSG_DONTROUTE|MSG_HOLD|MSG_SEND|MSG_FLUSH)) ||
1237 (mp_so->so_flags & SOF_ENABLE_MSGS)) {
1238 error = EOPNOTSUPP;
1239 socket_unlock(mp_so, 1);
1240 goto out;
1241 }
1242
1243 /*
1244 * In theory resid should be unsigned. However, space must be
1245 * signed, as it might be less than 0 if we over-committed, and we
1246 * must use a signed comparison of space and resid. On the other
1247 * hand, a negative resid causes us to loop sending 0-length
1248 * segments to the protocol.
1249 */
1250 if (resid < 0 || (flags & MSG_EOR) || control != NULL) {
1251 error = EINVAL;
1252 socket_unlock(mp_so, 1);
1253 goto out;
1254 }
1255
1256 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1257
1258 do {
1259 error = sosendcheck(mp_so, NULL, resid, 0, 0, flags,
1260 &sblocked, NULL);
1261 if (error != 0)
1262 goto release;
1263
1264 space = sbspace(&mp_so->so_snd);
1265 do {
1266 socket_unlock(mp_so, 0);
1267 /*
1268 * Copy the data from userland into an mbuf chain.
1269 */
1270 error = mptcp_uiotombuf(uio, M_WAITOK, space, 0, &top);
1271 if (error != 0) {
1272 socket_lock(mp_so, 0);
1273 goto release;
1274 }
1275 VERIFY(top != NULL);
1276 space -= resid - uio_resid(uio);
1277 resid = uio_resid(uio);
1278 socket_lock(mp_so, 0);
1279
1280 /*
1281 * Compute flags here, for pru_send and NKEs.
1282 */
1283 sendflags = (resid > 0 && space > 0) ?
1284 PRUS_MORETOCOME : 0;
1285
1286 /*
1287 * Socket filter processing
1288 */
1289 VERIFY(control == NULL);
1290 error = sflt_data_out(mp_so, NULL, &top, &control, 0);
1291 if (error != 0) {
1292 if (error == EJUSTRETURN) {
1293 error = 0;
1294 top = NULL;
1295 /* always free control if any */
1296 }
1297 goto release;
1298 }
1299 if (control != NULL) {
1300 m_freem(control);
1301 control = NULL;
1302 }
1303
1304 /*
1305 * Pass data to protocol.
1306 */
1307 error = (*mp_so->so_proto->pr_usrreqs->pru_send)
1308 (mp_so, sendflags, top, NULL, NULL, p);
1309
1310 top = NULL;
1311 if (error != 0)
1312 goto release;
1313 } while (resid != 0 && space > 0);
1314 } while (resid != 0);
1315
1316release:
1317 if (sblocked)
1318 sbunlock(&mp_so->so_snd, FALSE); /* will unlock socket */
1319 else
1320 socket_unlock(mp_so, 1);
1321out:
1322 if (top != NULL)
1323 m_freem(top);
1324 if (control != NULL)
1325 m_freem(control);
1326
1327 return (error);
1328}
1329
1330/*
1331 * Called to filter SOPT_{SET,GET} for SOL_SOCKET level socket options.
1332 * This routine simply indicates to the caller whether or not to proceed
1333 * further with the given socket option. This is invoked by sosetoptlock()
1334 * and sogetoptlock().
1335 */
1336static int
1337mptcp_usr_socheckopt(struct socket *mp_so, struct sockopt *sopt)
1338{
1339#pragma unused(mp_so)
1340 int error = 0;
1341
1342 VERIFY(sopt->sopt_level == SOL_SOCKET);
1343
1344 /*
1345 * We could check for sopt_dir (set/get) here, but we'll just
1346 * let the caller deal with it as appropriate; therefore the
1347 * following is a superset of the socket options which we
1348 * allow for set/get.
1349 *
1350 * XXX: adi@apple.com
1351 *
1352 * Need to consider the following cases:
1353 *
1354 * a. In the event peeloff(2) occurs on the subflow socket,
1355 * we may want to issue those options which are now
1356 * handled at the MP socket. In that case, we will need
1357 * to record them in mptcp_setopt() so that they can
1358 * be replayed during peeloff.
1359 *
1360 * b. Certain socket options don't have a clear definition
1361 * on the expected behavior post connect(2). At the time
1362 * those options are issued on the MP socket, there may
1363 * be existing subflow sockets that are already connected.
1364 */
1365 switch (sopt->sopt_name) {
1366 case SO_LINGER: /* MP */
1367 case SO_LINGER_SEC: /* MP */
1368 case SO_TYPE: /* MP */
1369 case SO_NREAD: /* MP */
1370 case SO_NWRITE: /* MP */
1371 case SO_ERROR: /* MP */
1372 case SO_SNDBUF: /* MP */
1373 case SO_RCVBUF: /* MP */
1374 case SO_SNDLOWAT: /* MP */
1375 case SO_RCVLOWAT: /* MP */
1376 case SO_SNDTIMEO: /* MP */
1377 case SO_RCVTIMEO: /* MP */
1378 case SO_NKE: /* MP */
1379 case SO_NOSIGPIPE: /* MP */
1380 case SO_NOADDRERR: /* MP */
1381 case SO_LABEL: /* MP */
1382 case SO_PEERLABEL: /* MP */
1383 case SO_DEFUNCTOK: /* MP */
1384 case SO_ISDEFUNCT: /* MP */
1385 case SO_TRAFFIC_CLASS_DBG: /* MP */
1386 /*
1387 * Tell the caller that these options are to be processed.
1388 */
1389 break;
1390
1391 case SO_DEBUG: /* MP + subflow */
1392 case SO_KEEPALIVE: /* MP + subflow */
1393 case SO_USELOOPBACK: /* MP + subflow */
1394 case SO_RANDOMPORT: /* MP + subflow */
1395 case SO_TRAFFIC_CLASS: /* MP + subflow */
1396 case SO_RECV_TRAFFIC_CLASS: /* MP + subflow */
1397 case SO_PRIVILEGED_TRAFFIC_CLASS: /* MP + subflow */
1398 case SO_RECV_ANYIF: /* MP + subflow */
1399 case SO_RESTRICTIONS: /* MP + subflow */
1400 case SO_FLUSH: /* MP + subflow */
1401 /*
1402 * Tell the caller that these options are to be processed;
1403 * these will also be recorded later by mptcp_setopt().
1404 *
1405 * NOTE: Only support integer option value for now.
1406 */
1407 if (sopt->sopt_valsize != sizeof (int))
1408 error = EINVAL;
1409 break;
1410
1411 default:
1412 /*
1413 * Tell the caller to stop immediately and return an error.
1414 */
1415 error = ENOPROTOOPT;
1416 break;
1417 }
1418
1419 return (error);
1420}
1421
1422/*
1423 * Issue SOPT_SET for all MPTCP subflows (for integer option values.)
1424 */
1425static int
1426mptcp_setopt_apply(struct mptses *mpte, struct mptopt *mpo)
1427{
1428 struct socket *mp_so;
1429 struct mptsub *mpts;
1430 struct mptopt smpo;
1431 int error = 0;
1432
1433 /* just bail now if this isn't applicable to subflow sockets */
1434 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1435 error = ENOPROTOOPT;
1436 goto out;
1437 }
1438
1439 /*
1440 * Skip those that are handled internally; these options
1441 * should not have been recorded and marked with the
1442 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1443 */
1444 if (mpo->mpo_level == SOL_SOCKET &&
1445 (mpo->mpo_name == SO_NOSIGPIPE || mpo->mpo_name == SO_NOADDRERR)) {
1446 error = ENOPROTOOPT;
1447 goto out;
1448 }
1449
1450 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1451 mp_so = mpte->mpte_mppcb->mpp_socket;
1452
1453 /*
1454 * Don't bother going further if there's no subflow; mark the option
1455 * with MPOF_INTERIM so that we know whether or not to remove this
1456 * option upon encountering an error while issuing it during subflow
1457 * socket creation.
1458 */
1459 if (mpte->mpte_numflows == 0) {
1460 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows));
1461 mpo->mpo_flags |= MPOF_INTERIM;
1462 /* return success */
1463 goto out;
1464 }
1465
1466 bzero(&smpo, sizeof (smpo));
1467 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1468 smpo.mpo_level = mpo->mpo_level;
1469 smpo.mpo_name = mpo->mpo_name;
1470
1471 /* grab exisiting values in case we need to rollback */
1472 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1473 struct socket *so;
1474
1475 MPTS_LOCK(mpts);
1476 mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG);
1477 mpts->mpts_oldintval = 0;
1478 smpo.mpo_intval = 0;
1479 VERIFY(mpts->mpts_socket != NULL);
1480 so = mpts->mpts_socket;
1481 socket_lock(so, 0);
1482 if (mptcp_subflow_sogetopt(mpte, so, &smpo) == 0) {
1483 mpts->mpts_flags |= MPTSF_SOPT_OLDVAL;
1484 mpts->mpts_oldintval = smpo.mpo_intval;
1485 }
1486 socket_unlock(so, 0);
1487 MPTS_UNLOCK(mpts);
1488 }
1489
1490 /* apply socket option */
1491 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1492 struct socket *so;
1493
1494 MPTS_LOCK(mpts);
1495 mpts->mpts_flags |= MPTSF_SOPT_INPROG;
1496 VERIFY(mpts->mpts_socket != NULL);
1497 so = mpts->mpts_socket;
1498 socket_lock(so, 0);
1499 error = mptcp_subflow_sosetopt(mpte, so, mpo);
1500 socket_unlock(so, 0);
1501 MPTS_UNLOCK(mpts);
1502 if (error != 0)
1503 break;
1504 }
1505
1506 /* cleanup, and rollback if needed */
1507 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1508 struct socket *so;
1509
1510 MPTS_LOCK(mpts);
1511 if (!(mpts->mpts_flags & MPTSF_SOPT_INPROG)) {
1512 /* clear in case it's set */
1513 mpts->mpts_flags &= ~MPTSF_SOPT_OLDVAL;
1514 mpts->mpts_oldintval = 0;
1515 MPTS_UNLOCK(mpts);
1516 continue;
1517 }
1518 if (!(mpts->mpts_flags & MPTSF_SOPT_OLDVAL)) {
1519 mpts->mpts_flags &= ~MPTSF_SOPT_INPROG;
1520 VERIFY(mpts->mpts_oldintval == 0);
1521 MPTS_UNLOCK(mpts);
1522 continue;
1523 }
1524 /* error during sosetopt, so roll it back */
1525 if (error != 0) {
1526 VERIFY(mpts->mpts_socket != NULL);
1527 so = mpts->mpts_socket;
1528 socket_lock(so, 0);
1529 smpo.mpo_intval = mpts->mpts_oldintval;
1530 (void) mptcp_subflow_sosetopt(mpte, so, &smpo);
1531 socket_unlock(so, 0);
1532 }
1533 mpts->mpts_oldintval = 0;
1534 mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG);
1535 MPTS_UNLOCK(mpts);
1536 }
1537
1538out:
1539 return (error);
1540}
1541
1542/*
1543 * Handle SOPT_SET for socket options issued on MP socket.
1544 */
1545static int
1546mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
1547{
1548 int error = 0, optval, level, optname, rec = 1;
1549 struct mptopt smpo, *mpo = NULL;
1550 struct socket *mp_so;
1551 char buf[32];
1552
1553 level = sopt->sopt_level;
1554 optname = sopt->sopt_name;
1555
1556 VERIFY(sopt->sopt_dir == SOPT_SET);
1557 VERIFY(level == SOL_SOCKET || level == IPPROTO_TCP);
1558 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1559 mp_so = mpte->mpte_mppcb->mpp_socket;
1560
1561 /*
1562 * Record socket options which are applicable to subflow sockets so
1563 * that we can replay them for new ones; see mptcp_usr_socheckopt()
1564 * for the list of eligible socket-level options.
1565 */
1566 if (level == SOL_SOCKET) {
1567 switch (optname) {
1568 case SO_DEBUG:
1569 case SO_KEEPALIVE:
1570 case SO_USELOOPBACK:
1571 case SO_RANDOMPORT:
1572 case SO_TRAFFIC_CLASS:
1573 case SO_RECV_TRAFFIC_CLASS:
1574 case SO_PRIVILEGED_TRAFFIC_CLASS:
1575 case SO_RECV_ANYIF:
1576 case SO_RESTRICTIONS:
1577 /* record it */
1578 break;
1579 case SO_FLUSH:
1580 /* don't record it */
1581 rec = 0;
1582 break;
1583 default:
1584 /* nothing to do; just return success */
1585 goto out;
1586 }
1587 } else {
1588 switch (optname) {
1589 case TCP_NODELAY:
1590 case TCP_RXT_FINDROP:
1591 case TCP_KEEPALIVE:
1592 case TCP_KEEPINTVL:
1593 case TCP_KEEPCNT:
1594 case TCP_CONNECTIONTIMEOUT:
1595 case TCP_RXT_CONNDROPTIME:
1596 case PERSIST_TIMEOUT:
1597 /* eligible; record it */
1598 break;
1599 default:
1600 /* not eligible */
1601 error = ENOPROTOOPT;
1602 goto out;
1603 }
1604 }
1605
1606 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
1607 sizeof (optval))) != 0)
1608 goto out;
1609
1610 if (rec) {
1611 /* search for an existing one; if not found, allocate */
1612 if ((mpo = mptcp_sopt_find(mpte, sopt)) == NULL)
1613 mpo = mptcp_sopt_alloc(M_WAITOK);
1614
1615 if (mpo == NULL) {
1616 error = ENOBUFS;
1617 } else {
1618 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s "
1619 "val %d %s\n", __func__,
1620 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1621 mptcp_sopt2str(level, optname, buf,
1622 sizeof (buf)), optval,
1623 (mpo->mpo_flags & MPOF_ATTACHED) ?
1624 "updated" : "recorded"));
1625
1626 /* initialize or update, as needed */
1627 mpo->mpo_intval = optval;
1628 if (!(mpo->mpo_flags & MPOF_ATTACHED)) {
1629 mpo->mpo_level = level;
1630 mpo->mpo_name = optname;
1631 mptcp_sopt_insert(mpte, mpo);
1632 }
1633 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1634 /* this can be issued on the subflow socket */
1635 mpo->mpo_flags |= MPOF_SUBFLOW_OK;
1636 }
1637 } else {
1638 bzero(&smpo, sizeof (smpo));
1639 mpo = &smpo;
1640 mpo->mpo_flags |= MPOF_SUBFLOW_OK;
1641 mpo->mpo_level = level;
1642 mpo->mpo_name = optname;
1643 mpo->mpo_intval = optval;
1644 }
1645 VERIFY(mpo == NULL || error == 0);
1646
1647 /* issue this socket option on existing subflows */
1648 if (error == 0) {
1649 error = mptcp_setopt_apply(mpte, mpo);
1650 if (error != 0 && (mpo->mpo_flags & MPOF_ATTACHED)) {
1651 VERIFY(mpo != &smpo);
1652 mptcp_sopt_remove(mpte, mpo);
1653 mptcp_sopt_free(mpo);
1654 }
1655 if (mpo == &smpo)
1656 mpo->mpo_flags &= ~MPOF_INTERIM;
1657 }
1658out:
1659 if (error == 0 && mpo != NULL) {
1660 mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s val %d set %s\n",
1661 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1662 mptcp_sopt2str(level, optname, buf,
1663 sizeof (buf)), optval, (mpo->mpo_flags & MPOF_INTERIM) ?
1664 "pending" : "successful"));
1665 } else if (error != 0) {
1666 mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s can't be issued "
1667 "error %d\n", __func__,
1668 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mptcp_sopt2str(level,
1669 optname, buf, sizeof (buf)), error));
1670 }
1671 return (error);
1672}
1673
1674/*
1675 * Handle SOPT_GET for socket options issued on MP socket.
1676 */
1677static int
1678mptcp_getopt(struct mptses *mpte, struct sockopt *sopt)
1679{
1680 int error = 0, optval;
1681
1682 VERIFY(sopt->sopt_dir == SOPT_GET);
1683 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1684
1685 /*
1686 * We only handle SOPT_GET for TCP level socket options; we should
1687 * not get here for socket level options since they are already
1688 * handled at the socket layer.
1689 */
1690 if (sopt->sopt_level != IPPROTO_TCP) {
1691 error = ENOPROTOOPT;
1692 goto out;
1693 }
1694
1695 switch (sopt->sopt_name) {
1696 case TCP_NODELAY:
1697 case TCP_RXT_FINDROP:
1698 case TCP_KEEPALIVE:
1699 case TCP_KEEPINTVL:
1700 case TCP_KEEPCNT:
1701 case TCP_CONNECTIONTIMEOUT:
1702 case TCP_RXT_CONNDROPTIME:
1703 case PERSIST_TIMEOUT:
1704 /* eligible; get the default value just in case */
1705 error = mptcp_default_tcp_optval(mpte, sopt, &optval);
1706 break;
1707 default:
1708 /* not eligible */
1709 error = ENOPROTOOPT;
1710 break;
1711 }
1712
1713 /*
1714 * Search for a previously-issued TCP level socket option and
1715 * return the recorded option value. This assumes that the
1716 * value did not get modified by the lower layer after it was
1717 * issued at setsockopt(2) time. If not found, we'll return
1718 * the default value obtained ealier.
1719 */
1720 if (error == 0) {
1721 struct mptopt *mpo;
1722
1723 if ((mpo = mptcp_sopt_find(mpte, sopt)) != NULL)
1724 optval = mpo->mpo_intval;
1725
1726 error = sooptcopyout(sopt, &optval, sizeof (int));
1727 }
1728out:
1729 return (error);
1730}
1731
1732/*
1733 * Return default values for TCP socket options. Ideally we would query the
1734 * subflow TCP socket, but that requires creating a subflow socket before
1735 * connectx(2) time. To simplify things, just return the default values
1736 * that we know of.
1737 */
1738static int
1739mptcp_default_tcp_optval(struct mptses *mpte, struct sockopt *sopt, int *optval)
1740{
1741 int error = 0;
1742
1743 VERIFY(sopt->sopt_level == IPPROTO_TCP);
1744 VERIFY(sopt->sopt_dir == SOPT_GET);
1745 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1746
1747 /* try to do what tcp_newtcpcb() does */
1748 switch (sopt->sopt_name) {
1749 case TCP_NODELAY:
1750 case TCP_RXT_FINDROP:
1751 case TCP_KEEPINTVL:
1752 case TCP_KEEPCNT:
1753 case TCP_CONNECTIONTIMEOUT:
1754 case TCP_RXT_CONNDROPTIME:
1755 *optval = 0;
1756 break;
1757
1758 case TCP_KEEPALIVE:
1759 *optval = mptcp_subflow_keeptime;
1760 break;
1761
1762 case PERSIST_TIMEOUT:
1763 *optval = tcp_max_persist_timeout;
1764 break;
1765
1766 default:
1767 error = ENOPROTOOPT;
1768 break;
1769 }
1770 return (error);
1771}
1772
1773/*
1774 * MPTCP SOPT_{SET,GET} socket option handler, for options issued on the MP
1775 * socket, at SOL_SOCKET and IPPROTO_TCP levels. The former is restricted
1776 * to those that are allowed by mptcp_usr_socheckopt().
1777 */
1778int
1779mptcp_ctloutput(struct socket *mp_so, struct sockopt *sopt)
1780{
1781 struct mppcb *mpp = sotomppcb(mp_so);
1782 struct mptses *mpte;
1783 int error = 0;
1784
1785 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
1786 error = EINVAL;
1787 goto out;
1788 }
1789 mpte = mptompte(mpp);
1790 MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */
1791
1792 /* we only handle socket and TCP-level socket options for MPTCP */
1793 if (sopt->sopt_level != SOL_SOCKET && sopt->sopt_level != IPPROTO_TCP) {
1794 char buf[32];
1795 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s level not "
1796 "handled\n", __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1797 mptcp_sopt2str(sopt->sopt_level,
1798 sopt->sopt_name, buf, sizeof (buf))));
1799 error = EINVAL;
1800 goto out;
1801 }
1802
1803 switch (sopt->sopt_dir) {
1804 case SOPT_SET:
1805 error = mptcp_setopt(mpte, sopt);
1806 break;
1807
1808 case SOPT_GET:
1809 error = mptcp_getopt(mpte, sopt);
1810 break;
1811 }
1812out:
1813 return (error);
1814}
1815
1816/*
1817 * Return a string representation of <sopt_level,sopt_name>
1818 */
1819const char *
1820mptcp_sopt2str(int level, int optname, char *dst, int size)
1821{
1822 char lbuf[32], obuf[32];
1823 const char *l = lbuf, *o = obuf;
1824
1825 (void) snprintf(lbuf, sizeof (lbuf), "0x%x", level);
1826 (void) snprintf(obuf, sizeof (obuf), "0x%x", optname);
1827
1828 switch (level) {
1829 case SOL_SOCKET:
1830 l = "SOL_SOCKET";
1831 switch (optname) {
1832 case SO_LINGER:
1833 o = "SO_LINGER";
1834 break;
1835 case SO_LINGER_SEC:
1836 o = "SO_LINGER_SEC";
1837 break;
1838 case SO_DEBUG:
1839 o = "SO_DEBUG";
1840 break;
1841 case SO_KEEPALIVE:
1842 o = "SO_KEEPALIVE";
1843 break;
1844 case SO_USELOOPBACK:
1845 o = "SO_USELOOPBACK";
1846 break;
1847 case SO_TYPE:
1848 o = "SO_TYPE";
1849 break;
1850 case SO_NREAD:
1851 o = "SO_NREAD";
1852 break;
1853 case SO_NWRITE:
1854 o = "SO_NWRITE";
1855 break;
1856 case SO_ERROR:
1857 o = "SO_ERROR";
1858 break;
1859 case SO_SNDBUF:
1860 o = "SO_SNDBUF";
1861 break;
1862 case SO_RCVBUF:
1863 o = "SO_RCVBUF";
1864 break;
1865 case SO_SNDLOWAT:
1866 o = "SO_SNDLOWAT";
1867 break;
1868 case SO_RCVLOWAT:
1869 o = "SO_RCVLOWAT";
1870 break;
1871 case SO_SNDTIMEO:
1872 o = "SO_SNDTIMEO";
1873 break;
1874 case SO_RCVTIMEO:
1875 o = "SO_RCVTIMEO";
1876 break;
1877 case SO_NKE:
1878 o = "SO_NKE";
1879 break;
1880 case SO_NOSIGPIPE:
1881 o = "SO_NOSIGPIPE";
1882 break;
1883 case SO_NOADDRERR:
1884 o = "SO_NOADDRERR";
1885 break;
1886 case SO_RESTRICTIONS:
1887 o = "SO_RESTRICTIONS";
1888 break;
1889 case SO_LABEL:
1890 o = "SO_LABEL";
1891 break;
1892 case SO_PEERLABEL:
1893 o = "SO_PEERLABEL";
1894 break;
1895 case SO_RANDOMPORT:
1896 o = "SO_RANDOMPORT";
1897 break;
1898 case SO_TRAFFIC_CLASS:
1899 o = "SO_TRAFFIC_CLASS";
1900 break;
1901 case SO_RECV_TRAFFIC_CLASS:
1902 o = "SO_RECV_TRAFFIC_CLASS";
1903 break;
1904 case SO_TRAFFIC_CLASS_DBG:
1905 o = "SO_TRAFFIC_CLASS_DBG";
1906 break;
1907 case SO_PRIVILEGED_TRAFFIC_CLASS:
1908 o = "SO_PRIVILEGED_TRAFFIC_CLASS";
1909 break;
1910 case SO_DEFUNCTOK:
1911 o = "SO_DEFUNCTOK";
1912 break;
1913 case SO_ISDEFUNCT:
1914 o = "SO_ISDEFUNCT";
1915 break;
1916 case SO_OPPORTUNISTIC:
1917 o = "SO_OPPORTUNISTIC";
1918 break;
1919 case SO_FLUSH:
1920 o = "SO_FLUSH";
1921 break;
1922 case SO_RECV_ANYIF:
1923 o = "SO_RECV_ANYIF";
1924 break;
1925 }
1926 break;
1927 case IPPROTO_TCP:
1928 l = "IPPROTO_TCP";
1929 switch (optname) {
1930 case TCP_KEEPALIVE:
1931 o = "TCP_KEEPALIVE";
1932 break;
1933 case TCP_KEEPINTVL:
1934 o = "TCP_KEEPINTVL";
1935 break;
1936 case TCP_KEEPCNT:
1937 o = "TCP_KEEPCNT";
1938 break;
1939 case TCP_CONNECTIONTIMEOUT:
1940 o = "TCP_CONNECTIONTIMEOUT";
1941 break;
1942 case TCP_RXT_CONNDROPTIME:
1943 o = "TCP_RXT_CONNDROPTIME";
1944 break;
1945 case PERSIST_TIMEOUT:
1946 o = "PERSIST_TIMEOUT";
1947 break;
1948 }
1949 break;
1950 }
1951
1952 (void) snprintf(dst, size, "<%s,%s>", l, o);
1953 return (dst);
1954}