X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/bd504ef0e0b883cdd7917b73b3574eb9ce669905..39236c6e673c41db228275375ab7fdb0f837b292:/bsd/netinet/mptcp_usrreq.c diff --git a/bsd/netinet/mptcp_usrreq.c b/bsd/netinet/mptcp_usrreq.c new file mode 100644 index 000000000..d4ea19cd1 --- /dev/null +++ b/bsd/netinet/mptcp_usrreq.c @@ -0,0 +1,1954 @@ +/* + * Copyright (c) 2012-2013 Apple Inc. All rights reserved. + * + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ + * + * This file contains Original Code and/or Modifications of Original Code + * as defined in and that are subject to the Apple Public Source License + * Version 2.0 (the 'License'). You may not use this file except in + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. + * + * The Original Code and all software distributed under the License are + * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER + * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, + * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. + * Please see the License for the specific language governing rights and + * limitations under the License. 
+ * + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int mptcp_usr_attach(struct socket *, int, struct proc *); +static int mptcp_usr_detach(struct socket *); +static int mptcp_attach(struct socket *, struct proc *); +static int mptcp_detach(struct socket *, struct mppcb *); +static int mptcp_connectx(struct mptses *, struct sockaddr_list **, + struct sockaddr_list **, struct proc *, uint32_t, associd_t, connid_t *, + uint32_t, void *, uint32_t); +static int mptcp_usr_connectx(struct socket *, struct sockaddr_list **, + struct sockaddr_list **, struct proc *, uint32_t, associd_t, connid_t *, + uint32_t, void *, uint32_t); +static int mptcp_getassocids(struct mptses *, uint32_t *, user_addr_t); +static int mptcp_getconnids(struct mptses *, associd_t, uint32_t *, + user_addr_t); +static int mptcp_getconninfo(struct mptses *, connid_t *, uint32_t *, + uint32_t *, int32_t *, user_addr_t, socklen_t *, user_addr_t, socklen_t *, + uint32_t *, user_addr_t, uint32_t *); +static int mptcp_usr_control(struct socket *, u_long, caddr_t, struct ifnet *, + struct proc *); +static int mptcp_disconnectx(struct mptses *, associd_t, connid_t); +static int mptcp_usr_disconnectx(struct socket *, associd_t, connid_t); +static struct mptses *mptcp_usrclosed(struct mptses *); +static int mptcp_usr_peeloff(struct socket *, associd_t, struct socket **); +static int mptcp_peeloff(struct mptses *, associd_t, struct socket **); +static int mptcp_usr_rcvd(struct socket *, int); +static int mptcp_usr_send(struct socket *, int, struct mbuf *, + struct sockaddr *, struct mbuf *, struct proc *); +static int mptcp_usr_shutdown(struct socket *); +static int mptcp_uiotombuf(struct uio *, int, int, uint32_t, struct mbuf **); +static int mptcp_usr_sosend(struct socket *, 
struct sockaddr *, struct uio *, + struct mbuf *, struct mbuf *, int); +static int mptcp_usr_socheckopt(struct socket *, struct sockopt *); +static int mptcp_setopt_apply(struct mptses *, struct mptopt *); +static int mptcp_setopt(struct mptses *, struct sockopt *); +static int mptcp_getopt(struct mptses *, struct sockopt *); +static int mptcp_default_tcp_optval(struct mptses *, struct sockopt *, int *); +static void mptcp_connorder_helper(struct mptsub *mpts); + +struct pr_usrreqs mptcp_usrreqs = { + .pru_attach = mptcp_usr_attach, + .pru_connectx = mptcp_usr_connectx, + .pru_control = mptcp_usr_control, + .pru_detach = mptcp_usr_detach, + .pru_disconnectx = mptcp_usr_disconnectx, + .pru_peeloff = mptcp_usr_peeloff, + .pru_rcvd = mptcp_usr_rcvd, + .pru_send = mptcp_usr_send, + .pru_shutdown = mptcp_usr_shutdown, + .pru_sosend = mptcp_usr_sosend, + .pru_soreceive = soreceive, + .pru_socheckopt = mptcp_usr_socheckopt, +}; + +/* + * Attaches an MPTCP control block to a socket. + */ +static int +mptcp_usr_attach(struct socket *mp_so, int proto, struct proc *p) +{ +#pragma unused(proto) + int error; + + VERIFY(sotomppcb(mp_so) == NULL); + + error = mptcp_attach(mp_so, p); + if (error != 0) + goto out; + /* + * XXX: adi@apple.com + * + * Might want to use a different SO_LINGER timeout than TCP's? + */ + if ((mp_so->so_options & SO_LINGER) && mp_so->so_linger == 0) + mp_so->so_linger = TCP_LINGERTIME * hz; +out: + return (error); +} + +/* + * Detaches an MPTCP control block from a socket. + */ +static int +mptcp_usr_detach(struct socket *mp_so) +{ + struct mppcb *mpp = sotomppcb(mp_so); + int error = 0; + + VERIFY(mpp != NULL); + VERIFY(mpp->mpp_socket != NULL); + + error = mptcp_detach(mp_so, mpp); + return (error); +} + +/* + * Attach MPTCP protocol to socket, allocating MP control block, + * MPTCP session, control block, buffer space, etc. 
+ */ +static int +mptcp_attach(struct socket *mp_so, struct proc *p) +{ +#pragma unused(p) + struct mptses *mpte; + struct mptcb *mp_tp; + struct mppcb *mpp; + int error = 0; + + if (mp_so->so_snd.sb_hiwat == 0 || mp_so->so_rcv.sb_hiwat == 0) { + error = soreserve(mp_so, tcp_sendspace, MPTCP_RWIN_MAX); + if (error != 0) + goto out; + } + + /* + * MPTCP socket buffers cannot be compressed, due to the + * fact that each mbuf chained via m_next is a M_PKTHDR + * which carries some MPTCP metadata. + */ + mp_so->so_snd.sb_flags |= SB_NOCOMPRESS; + mp_so->so_rcv.sb_flags |= SB_NOCOMPRESS; + + /* Disable socket buffer auto-tuning. */ + mp_so->so_rcv.sb_flags &= ~SB_AUTOSIZE; + mp_so->so_snd.sb_flags &= ~SB_AUTOSIZE; + + if ((error = mp_pcballoc(mp_so, &mtcbinfo)) != 0) + goto out; + + mpp = sotomppcb(mp_so); + VERIFY(mpp != NULL); + + mpte = mptcp_sescreate(mp_so, mpp); + if (mpte == NULL) { + mp_pcbdetach(mpp); + error = ENOBUFS; + goto out; + } + mp_tp = mpte->mpte_mptcb; + VERIFY(mp_tp != NULL); + + MPT_LOCK(mp_tp); + mp_tp->mpt_state = MPTCPS_CLOSED; + MPT_UNLOCK(mp_tp); + +out: + return (error); +} + +/* + * Called when the socket layer loses its final reference to the socket; + * at this point, there is only one case in which we will keep things + * around: time wait. + */ +static int +mptcp_detach(struct socket *mp_so, struct mppcb *mpp) +{ + struct mptses *mpte; + struct mppcbinfo *mppi; + + VERIFY(mp_so->so_pcb == mpp); + VERIFY(mpp->mpp_socket == mp_so); + + mppi = mpp->mpp_pcbinfo; + VERIFY(mppi != NULL); + + mpte = &((struct mpp_mtp *)mpp)->mpp_ses; + VERIFY(mpte->mpte_mppcb == mpp); + + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + + /* + * We are done with this MPTCP socket (it has been closed); + * trigger all subflows to be disconnected, if not already, + * by initiating the PCB detach sequence (SOF_PCBCLEARING + * will be set.) 
+ */ + mp_pcbdetach(mpp); + + (void) mptcp_disconnectx(mpte, ASSOCID_ALL, CONNID_ALL); + + /* + * XXX: adi@apple.com + * + * Here, we would want to handle time wait state. + */ + + return (0); +} + +/* + * Common subroutine to open a MPTCP connection to one of the remote hosts + * specified by dst_sl. This includes allocating and establishing a + * subflow TCP connection, either initially to establish MPTCP connection, + * or to join an existing one. Returns a connection handle upon success. + */ +static int +mptcp_connectx(struct mptses *mpte, struct sockaddr_list **src_sl, + struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope, + associd_t aid, connid_t *pcid, uint32_t flags, void *arg, + uint32_t arglen) +{ +#pragma unused(p, aid, flags, arg, arglen) + struct mptsub *mpts; + struct socket *mp_so; + int error = 0; + + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + mp_so = mpte->mpte_mppcb->mpp_socket; + + VERIFY(dst_sl != NULL && *dst_sl != NULL); + VERIFY(pcid != NULL); + + mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__, + (u_int64_t)VM_KERNEL_ADDRPERM(mp_so))); + DTRACE_MPTCP3(connectx, struct mptses *, mpte, associd_t, aid, + struct socket *, mp_so); + + mpts = mptcp_subflow_alloc(M_WAITOK); + if (mpts == NULL) { + error = ENOBUFS; + goto out; + } + MPTS_ADDREF(mpts); /* for this routine */ + + if (src_sl != NULL) { + mpts->mpts_src_sl = *src_sl; + *src_sl = NULL; + } + mpts->mpts_dst_sl = *dst_sl; + *dst_sl = NULL; + + error = mptcp_subflow_add(mpte, mpts, p, ifscope); + if (error == 0 && pcid != NULL) + *pcid = mpts->mpts_connid; + +out: + if (mpts != NULL) { + if ((error != 0) && (error != EWOULDBLOCK)) { + MPTS_LOCK(mpts); + if (mpts->mpts_flags & MPTSF_ATTACHED) { + MPTS_UNLOCK(mpts); + MPTS_REMREF(mpts); + mptcp_subflow_del(mpte, mpts, TRUE); + return (error); + } + MPTS_UNLOCK(mpts); + } + MPTS_REMREF(mpts); + } + + return (error); +} + +/* + * User-protocol pru_connectx callback. 
+ */ +static int +mptcp_usr_connectx(struct socket *mp_so, struct sockaddr_list **src_sl, + struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope, + associd_t aid, connid_t *pcid, uint32_t flags, void *arg, + uint32_t arglen) +{ +#pragma unused(arg, arglen) + struct mppcb *mpp = sotomppcb(mp_so); + struct mptses *mpte; + int error = 0; + + if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { + error = EINVAL; + goto out; + } + mpte = mptompte(mpp); + VERIFY(mpte != NULL); + + error = mptcp_connectx(mpte, src_sl, dst_sl, p, ifscope, + aid, pcid, flags, arg, arglen); +out: + return (error); +} + +/* + * Handle SIOCGASSOCIDS ioctl for PF_MULTIPATH domain. + */ +static int +mptcp_getassocids(struct mptses *mpte, uint32_t *cnt, user_addr_t aidp) +{ + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + + /* MPTCP has at most 1 association */ + *cnt = (mpte->mpte_associd != ASSOCID_ANY) ? 1 : 0; + + /* just asking how many there are? */ + if (aidp == USER_ADDR_NULL) + return (0); + + return (copyout(&mpte->mpte_associd, aidp, + sizeof (mpte->mpte_associd))); +} + +/* + * Handle SIOCGCONNIDS ioctl for PF_MULTIPATH domain. + */ +static int +mptcp_getconnids(struct mptses *mpte, associd_t aid, uint32_t *cnt, + user_addr_t cidp) +{ + struct mptsub *mpts; + int error = 0; + + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + + if (aid != ASSOCID_ANY && aid != ASSOCID_ALL && + aid != mpte->mpte_associd) + return (EINVAL); + + *cnt = mpte->mpte_numflows; + + /* just asking how many there are? */ + if (cidp == USER_ADDR_NULL) + return (0); + + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + if ((error = copyout(&mpts->mpts_connid, cidp, + sizeof (mpts->mpts_connid))) != 0) + break; + + cidp += sizeof (mpts->mpts_connid); + } + + return (error); +} + +/* + * Handle SIOCGCONNINFO ioctl for PF_MULTIPATH domain. 
+ */ +static int +mptcp_getconninfo(struct mptses *mpte, connid_t *cid, uint32_t *flags, + uint32_t *ifindex, int32_t *soerror, user_addr_t src, socklen_t *src_len, + user_addr_t dst, socklen_t *dst_len, uint32_t *aux_type, + user_addr_t aux_data, uint32_t *aux_len) +{ +#pragma unused(aux_data) + struct sockaddr_entry *se; + struct ifnet *ifp = NULL; + struct mptsub *mpts; + int error = 0; + + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + + if (*cid == CONNID_ALL) + return (EINVAL); + + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + if (mpts->mpts_connid == *cid || *cid == CONNID_ANY) + break; + } + if (mpts == NULL) + return ((*cid == CONNID_ANY) ? ENXIO : EINVAL); + + MPTS_LOCK(mpts); + ifp = mpts->mpts_outif; + *cid = mpts->mpts_connid; + *ifindex = ((ifp != NULL) ? ifp->if_index : 0); + *soerror = mpts->mpts_soerror; + *flags = 0; + if (mpts->mpts_flags & MPTSF_CONNECTING) + *flags |= CIF_CONNECTING; + if (mpts->mpts_flags & MPTSF_CONNECTED) + *flags |= CIF_CONNECTED; + if (mpts->mpts_flags & MPTSF_DISCONNECTING) + *flags |= CIF_DISCONNECTING; + if (mpts->mpts_flags & MPTSF_DISCONNECTED) + *flags |= CIF_DISCONNECTED; + if (mpts->mpts_flags & MPTSF_BOUND_IF) + *flags |= CIF_BOUND_IF; + if (mpts->mpts_flags & MPTSF_BOUND_IP) + *flags |= CIF_BOUND_IP; + if (mpts->mpts_flags & MPTSF_BOUND_PORT) + *flags |= CIF_BOUND_PORT; + if (mpts->mpts_flags & MPTSF_PREFERRED) + *flags |= CIF_PREFERRED; + if (mpts->mpts_flags & MPTSF_MP_CAPABLE) + *flags |= CIF_MP_CAPABLE; + if (mpts->mpts_flags & MPTSF_MP_DEGRADED) + *flags |= CIF_MP_DEGRADED; + if (mpts->mpts_flags & MPTSF_MP_READY) + *flags |= CIF_MP_READY; + if (mpts->mpts_flags & MPTSF_ACTIVE) + *flags |= CIF_MP_ACTIVE; + + VERIFY(mpts->mpts_src_sl != NULL); + se = TAILQ_FIRST(&mpts->mpts_src_sl->sl_head); + VERIFY(se != NULL && se->se_addr != NULL); + *src_len = se->se_addr->sa_len; + if (src != USER_ADDR_NULL) { + error = copyout(se->se_addr, src, se->se_addr->sa_len); + if (error != 0) + goto 
out; + } + + VERIFY(mpts->mpts_dst_sl != NULL); + se = TAILQ_FIRST(&mpts->mpts_dst_sl->sl_head); + VERIFY(se != NULL && se->se_addr != NULL); + *dst_len = se->se_addr->sa_len; + if (dst != USER_ADDR_NULL) { + error = copyout(se->se_addr, dst, se->se_addr->sa_len); + if (error != 0) + goto out; + } + + *aux_type = 0; + *aux_len = 0; + if (mpts->mpts_socket != NULL) { + struct conninfo_tcp tcp_ci; + + *aux_type = CIAUX_TCP; + *aux_len = sizeof (tcp_ci); + + if (aux_data != USER_ADDR_NULL) { + struct socket *so = mpts->mpts_socket; + + VERIFY(SOCK_PROTO(so) == IPPROTO_TCP); + bzero(&tcp_ci, sizeof (tcp_ci)); + socket_lock(so, 0); + tcp_getconninfo(so, &tcp_ci); + socket_unlock(so, 0); + error = copyout(&tcp_ci, aux_data, sizeof (tcp_ci)); + if (error != 0) + goto out; + } + } +out: + MPTS_UNLOCK(mpts); + return (error); +} + +/* + * Handle SIOCSCONNORDER + */ +int +mptcp_setconnorder(struct mptses *mpte, connid_t cid, uint32_t rank) +{ + struct mptsub *mpts, *mpts1; + int error = 0; + + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + mptcplog((LOG_DEBUG, "%s: cid %d rank %d \n", __func__, cid, rank)); + + if (cid == CONNID_ANY || cid == CONNID_ALL) { + error = EINVAL; + goto out; + } + + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + if (mpts->mpts_connid == cid) + break; + } + if (mpts == NULL) { + error = ENXIO; + goto out; + } + + if (rank == 0 || rank > 1) { + /* + * If rank is 0, determine whether this should be the + * primary or backup subflow, depending on what we have. + * + * Otherwise, if greater than 0, make it a backup flow. 
+ */ + TAILQ_FOREACH(mpts1, &mpte->mpte_subflows, mpts_entry) { + MPTS_LOCK(mpts1); + if (mpts1->mpts_flags & MPTSF_PREFERRED) { + MPTS_UNLOCK(mpts1); + break; + } + MPTS_UNLOCK(mpts1); + } + + MPTS_LOCK(mpts); + mpts->mpts_flags &= ~MPTSF_PREFERRED; + mpts->mpts_rank = rank; + if (mpts1 != NULL && mpts != mpts1) { + /* preferred subflow found; set rank as necessary */ + if (rank == 0) + mpts->mpts_rank = (mpts1->mpts_rank + 1); + } else if (rank == 0) { + /* no preferred one found; promote this */ + rank = 1; + } + MPTS_UNLOCK(mpts); + } + + if (rank == 1) { + /* + * If rank is 1, promote this subflow to be preferred. + */ + TAILQ_FOREACH(mpts1, &mpte->mpte_subflows, mpts_entry) { + MPTS_LOCK(mpts1); + if (mpts1 != mpts && + (mpts1->mpts_flags & MPTSF_PREFERRED)) { + mpts1->mpts_flags &= ~MPTSF_PREFERRED; + if (mpte->mpte_nummpcapflows > 1) + mptcp_connorder_helper(mpts1); + } else if (mpts1 == mpts) { + mpts1->mpts_rank = 1; + if (mpts1->mpts_flags & MPTSF_MP_CAPABLE) { + mpts1->mpts_flags |= MPTSF_PREFERRED; + if (mpte->mpte_nummpcapflows > 1) + mptcp_connorder_helper(mpts1); + } + } + MPTS_UNLOCK(mpts1); + } + } + +out: + return (error); +} + +static void +mptcp_connorder_helper(struct mptsub *mpts) +{ + struct socket *so = mpts->mpts_socket; + struct tcpcb *tp = NULL; + + socket_lock(so, 0); + + tp = intotcpcb(sotoinpcb(so)); + tp->t_mpflags |= TMPF_SND_MPPRIO; + if (mpts->mpts_flags & MPTSF_PREFERRED) + tp->t_mpflags &= ~TMPF_BACKUP_PATH; + else + tp->t_mpflags |= TMPF_BACKUP_PATH; + mptcplog((LOG_DEBUG, "%s cid %d flags %x", __func__, + mpts->mpts_connid, mpts->mpts_flags)); + socket_unlock(so, 0); + +} + +/* + * Handle SIOCGCONNORDER + */ +int +mptcp_getconnorder(struct mptses *mpte, connid_t cid, uint32_t *rank) +{ + struct mptsub *mpts; + int error = 0; + + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + VERIFY(rank != NULL); + *rank = 0; + + if (cid == CONNID_ANY || cid == CONNID_ALL) { + error = EINVAL; + goto out; + } + + 
TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + if (mpts->mpts_connid == cid) + break; + } + if (mpts == NULL) { + error = ENXIO; + goto out; + } + + MPTS_LOCK(mpts); + *rank = mpts->mpts_rank; + MPTS_UNLOCK(mpts); +out: + return (error); +} + +/* + * User-protocol pru_control callback. + */ +static int +mptcp_usr_control(struct socket *mp_so, u_long cmd, caddr_t data, + struct ifnet *ifp, struct proc *p) +{ +#pragma unused(ifp, p) + struct mppcb *mpp = sotomppcb(mp_so); + struct mptses *mpte; + int error = 0; + + if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { + error = EINVAL; + goto out; + } + mpte = mptompte(mpp); + VERIFY(mpte != NULL); + + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + + switch (cmd) { + case SIOCGASSOCIDS32: { /* struct so_aidreq32 */ + struct so_aidreq32 aidr; + bcopy(data, &aidr, sizeof (aidr)); + error = mptcp_getassocids(mpte, &aidr.sar_cnt, + aidr.sar_aidp); + if (error == 0) + bcopy(&aidr, data, sizeof (aidr)); + break; + } + + case SIOCGASSOCIDS64: { /* struct so_aidreq64 */ + struct so_aidreq64 aidr; + bcopy(data, &aidr, sizeof (aidr)); + error = mptcp_getassocids(mpte, &aidr.sar_cnt, + aidr.sar_aidp); + if (error == 0) + bcopy(&aidr, data, sizeof (aidr)); + break; + } + + case SIOCGCONNIDS32: { /* struct so_cidreq32 */ + struct so_cidreq32 cidr; + bcopy(data, &cidr, sizeof (cidr)); + error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt, + cidr.scr_cidp); + if (error == 0) + bcopy(&cidr, data, sizeof (cidr)); + break; + } + + case SIOCGCONNIDS64: { /* struct so_cidreq64 */ + struct so_cidreq64 cidr; + bcopy(data, &cidr, sizeof (cidr)); + error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt, + cidr.scr_cidp); + if (error == 0) + bcopy(&cidr, data, sizeof (cidr)); + break; + } + + case SIOCGCONNINFO32: { /* struct so_cinforeq32 */ + struct so_cinforeq32 cifr; + bcopy(data, &cifr, sizeof (cifr)); + error = mptcp_getconninfo(mpte, &cifr.scir_cid, + &cifr.scir_flags, &cifr.scir_ifindex, 
&cifr.scir_error, + cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst, + &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data, + &cifr.scir_aux_len); + if (error == 0) + bcopy(&cifr, data, sizeof (cifr)); + break; + } + + case SIOCGCONNINFO64: { /* struct so_cinforeq64 */ + struct so_cinforeq64 cifr; + bcopy(data, &cifr, sizeof (cifr)); + error = mptcp_getconninfo(mpte, &cifr.scir_cid, + &cifr.scir_flags, &cifr.scir_ifindex, &cifr.scir_error, + cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst, + &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data, + &cifr.scir_aux_len); + if (error == 0) + bcopy(&cifr, data, sizeof (cifr)); + break; + } + + case SIOCSCONNORDER: { /* struct so_cordreq */ + struct so_cordreq cor; + bcopy(data, &cor, sizeof (cor)); + error = mptcp_setconnorder(mpte, cor.sco_cid, cor.sco_rank); + if (error == 0) + bcopy(&cor, data, sizeof (cor)); + break; + } + + case SIOCGCONNORDER: { /* struct so_cordreq */ + struct so_cordreq cor; + bcopy(data, &cor, sizeof (cor)); + error = mptcp_getconnorder(mpte, cor.sco_cid, &cor.sco_rank); + if (error == 0) + bcopy(&cor, data, sizeof (cor)); + break; + } + + default: + error = EOPNOTSUPP; + break; + } +out: + return (error); +} + +/* + * Initiate a disconnect. MPTCP-level disconnection is specified by + * CONNID_{ANY,ALL}. Otherwise, selectively disconnect a subflow + * connection while keeping the MPTCP-level connection (association). 
+ */ +static int +mptcp_disconnectx(struct mptses *mpte, associd_t aid, connid_t cid) +{ + struct mptsub *mpts; + struct socket *mp_so; + struct mptcb *mp_tp; + int error = 0; + + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + + mp_so = mpte->mpte_mppcb->mpp_socket; + mp_tp = mpte->mpte_mptcb; + + mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx aid %d cid %d\n", __func__, + (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), aid, cid)); + DTRACE_MPTCP5(disconnectx, struct mptses *, mpte, associd_t, aid, + connid_t, cid, struct socket *, mp_so, struct mptcb *, mp_tp); + + VERIFY(aid == ASSOCID_ANY || aid == ASSOCID_ALL || + aid == mpte->mpte_associd); + + /* terminate the association? */ + if (cid == CONNID_ANY || cid == CONNID_ALL) { + /* if we're not detached, go thru socket state checks */ + if (!(mp_so->so_flags & SOF_PCBCLEARING)) { + if (!(mp_so->so_state & (SS_ISCONNECTED| + SS_ISCONNECTING))) { + error = ENOTCONN; + goto out; + } + if (mp_so->so_state & SS_ISDISCONNECTING) { + error = EALREADY; + goto out; + } + } + MPT_LOCK(mp_tp); + mptcp_cancel_all_timers(mp_tp); + if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) { + (void) mptcp_close(mpte, mp_tp); + MPT_UNLOCK(mp_tp); + } else if ((mp_so->so_options & SO_LINGER) && + mp_so->so_linger == 0) { + (void) mptcp_drop(mpte, mp_tp, 0); + MPT_UNLOCK(mp_tp); + } else { + MPT_UNLOCK(mp_tp); + soisdisconnecting(mp_so); + sbflush(&mp_so->so_rcv); + if (mptcp_usrclosed(mpte) != NULL) + (void) mptcp_output(mpte); + } + } else { + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + if (mpts->mpts_connid != cid) + continue; + MPTS_LOCK(mpts); + mptcp_subflow_disconnect(mpte, mpts, FALSE); + MPTS_UNLOCK(mpts); + break; + } + + if (mpts == NULL) { + error = EINVAL; + goto out; + } + } + + if (error == 0) + mptcp_thread_signal(mpte); + + if ((mp_so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) == + (SS_CANTRCVMORE | SS_CANTSENDMORE)) { + /* the socket has been shutdown, no more sockopt's */ + mptcp_flush_sopts(mpte); + } + +out: + 
return (error); +} + +/* + * User-protocol pru_disconnectx callback. + */ +static int +mptcp_usr_disconnectx(struct socket *mp_so, associd_t aid, connid_t cid) +{ + struct mppcb *mpp = sotomppcb(mp_so); + struct mptses *mpte; + int error = 0; + + if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { + error = EINVAL; + goto out; + } + mpte = mptompte(mpp); + VERIFY(mpte != NULL); + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + + if (aid != ASSOCID_ANY && aid != ASSOCID_ALL && + aid != mpte->mpte_associd) { + error = EINVAL; + goto out; + } + + error = mptcp_disconnectx(mpte, aid, cid); +out: + return (error); +} + +/* + * User issued close, and wish to trail thru shutdown states. + */ +static struct mptses * +mptcp_usrclosed(struct mptses *mpte) +{ + struct socket *mp_so; + struct mptcb *mp_tp; + struct mptsub *mpts; + + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + mp_so = mpte->mpte_mppcb->mpp_socket; + mp_tp = mpte->mpte_mptcb; + + MPT_LOCK(mp_tp); + mptcp_close_fsm(mp_tp, MPCE_CLOSE); + + if (mp_tp->mpt_state == TCPS_CLOSED) { + mpte = mptcp_close(mpte, mp_tp); + MPT_UNLOCK(mp_tp); + } else if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) { + MPT_UNLOCK(mp_tp); + soisdisconnected(mp_so); + } else { + mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */ + MPT_UNLOCK(mp_tp); + + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + MPTS_LOCK(mpts); + mptcp_subflow_disconnect(mpte, mpts, FALSE); + MPTS_UNLOCK(mpts); + } + } + /* + * XXX: adi@apple.com + * + * Do we need to handle time wait specially here? We need to handle + * the case where MPTCP has been established, but we have not usable + * subflow to use. Do we want to wait a while before forcibly + * tearing this MPTCP down, in case we have one or more subflows + * that are flow controlled? + */ + + return (mpte); +} + +/* + * User-protocol pru_peeloff callback. 
+ */ +static int +mptcp_usr_peeloff(struct socket *mp_so, associd_t aid, struct socket **psop) +{ + struct mppcb *mpp = sotomppcb(mp_so); + struct mptses *mpte; + int error = 0; + + VERIFY(psop != NULL); + + if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { + error = EINVAL; + goto out; + } + mpte = mptompte(mpp); + VERIFY(mpte != NULL); + + error = mptcp_peeloff(mpte, aid, psop); +out: + return (error); +} + +/* + * Transform a previously connected TCP subflow connection which has + * failed to negotiate MPTCP to its own socket which can be externalized + * with a file descriptor. Valid only when the MPTCP socket is not + * yet associated (MPTCP-level connection has not been established.) + */ +static int +mptcp_peeloff(struct mptses *mpte, associd_t aid, struct socket **psop) +{ + struct socket *so = NULL, *mp_so; + struct mptsub *mpts; + int error = 0; + + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + mp_so = mpte->mpte_mppcb->mpp_socket; + + VERIFY(psop != NULL); + *psop = NULL; + + DTRACE_MPTCP3(peeloff, struct mptses *, mpte, associd_t, aid, + struct socket *, mp_so); + + /* peeloff cannot happen after an association is established */ + if (mpte->mpte_associd != ASSOCID_ANY) { + error = EINVAL; + goto out; + } + + if (aid != ASSOCID_ANY && aid != ASSOCID_ALL) { + error = EINVAL; + goto out; + } + + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + MPTS_LOCK(mpts); + if (mpts->mpts_flags & MPTSF_MP_CAPABLE) { + panic("%s: so %p is MPTCP capable but mp_so %p " + "aid is %d\n", __func__, so, mp_so, + mpte->mpte_associd); + /* NOTREACHED */ + } + MPTS_ADDREF_LOCKED(mpts); /* for us */ + so = mpts->mpts_socket; + VERIFY(so != NULL); + /* + * This subflow socket is about to be externalized; make it + * appear as if it has the same properties as the MPTCP socket, + * undo what's done earlier in mptcp_subflow_add(). 
+ */ + mptcp_subflow_sopeeloff(mpte, mpts, so); + MPTS_UNLOCK(mpts); + + mptcp_subflow_del(mpte, mpts, FALSE); + MPTS_REMREF(mpts); /* ours */ + /* + * XXX adi@apple.com + * + * Here we need to make sure the subflow socket is not + * flow controlled; need to clear both INP_FLOW_CONTROLLED + * and INP_FLOW_SUSPENDED on the subflow socket, since + * we will no longer be monitoring its events. + */ + break; + } + + if (so == NULL) { + error = EINVAL; + goto out; + } + *psop = so; + + mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__, + (u_int64_t)VM_KERNEL_ADDRPERM(mp_so))); +out: + return (error); +} + +/* + * After a receive, possible send some update to peer. + */ +static int +mptcp_usr_rcvd(struct socket *mp_so, int flags) +{ +#pragma unused(flags) + struct mppcb *mpp = sotomppcb(mp_so); + struct mptses *mpte; + int error = 0; + + if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { + error = EINVAL; + goto out; + } + mpte = mptompte(mpp); + VERIFY(mpte != NULL); + + error = mptcp_output(mpte); +out: + return (error); +} + +/* + * Do a send by putting data in the output queue. 
+ */ +static int +mptcp_usr_send(struct socket *mp_so, int prus_flags, struct mbuf *m, + struct sockaddr *nam, struct mbuf *control, struct proc *p) +{ +#pragma unused(nam, p) + struct mppcb *mpp = sotomppcb(mp_so); + struct mptses *mpte; + int error = 0; + + if (prus_flags & (PRUS_OOB|PRUS_EOF)) { + error = EOPNOTSUPP; + goto out; + } + + if (nam != NULL) { + error = EOPNOTSUPP; + goto out; + } + + if (control != NULL && control->m_len != 0) { + error = EOPNOTSUPP; + goto out; + } + + if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { + error = ECONNRESET; + goto out; + } + mpte = mptompte(mpp); + VERIFY(mpte != NULL); + + if (!(mp_so->so_state & SS_ISCONNECTED)) { + error = ENOTCONN; + goto out; + } + + mptcp_insert_dsn(mpp, m); + VERIFY(mp_so->so_snd.sb_flags & SB_NOCOMPRESS); + (void) sbappendstream(&mp_so->so_snd, m); + m = NULL; + + if (mpte != NULL) { + /* + * XXX: adi@apple.com + * + * PRUS_MORETOCOME could be set, but we don't check it now. + */ + error = mptcp_output(mpte); + } + +out: + if (error) { + if (m != NULL) + m_freem(m); + if (control != NULL) + m_freem(control); + } + return (error); +} + +/* + * Mark the MPTCP connection as being incapable of further output. + */ +static int +mptcp_usr_shutdown(struct socket *mp_so) +{ + struct mppcb *mpp = sotomppcb(mp_so); + struct mptses *mpte; + int error = 0; + + if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { + error = EINVAL; + goto out; + } + mpte = mptompte(mpp); + VERIFY(mpte != NULL); + + socantsendmore(mp_so); + + mpte = mptcp_usrclosed(mpte); + if (mpte != NULL) + error = mptcp_output(mpte); +out: + return (error); +} + +/* + * Copy the contents of uio into a properly sized mbuf chain. 
+ */ +static int +mptcp_uiotombuf(struct uio *uio, int how, int space, uint32_t align, + struct mbuf **top) +{ + struct mbuf *m, *mb, *nm = NULL, *mtail = NULL; + user_ssize_t resid, tot, len, progress; /* must be user_ssize_t */ + int error; + + VERIFY(top != NULL && *top == NULL); + + /* + * space can be zero or an arbitrary large value bound by + * the total data supplied by the uio. + */ + resid = uio_resid(uio); + if (space > 0) + tot = imin(resid, space); + else + tot = resid; + + /* + * The smallest unit is a single mbuf with pkthdr. + * We can't align past it. + */ + if (align >= MHLEN) + return (EINVAL); + + /* + * Give us the full allocation or nothing. + * If space is zero return the smallest empty mbuf. + */ + if ((len = tot + align) == 0) + len = 1; + + /* Loop and append maximum sized mbufs to the chain tail. */ + while (len > 0) { + uint32_t m_needed = 1; + + if (njcl > 0 && len > MBIGCLBYTES) + mb = m_getpackets_internal(&m_needed, 1, + how, 1, M16KCLBYTES); + else if (len > MCLBYTES) + mb = m_getpackets_internal(&m_needed, 1, + how, 1, MBIGCLBYTES); + else if (len >= (signed)MINCLSIZE) + mb = m_getpackets_internal(&m_needed, 1, + how, 1, MCLBYTES); + else + mb = m_gethdr(how, MT_DATA); + + /* Fail the whole operation if one mbuf can't be allocated. */ + if (mb == NULL) { + if (nm != NULL) + m_freem(nm); + return (ENOBUFS); + } + + /* Book keeping. */ + VERIFY(mb->m_flags & M_PKTHDR); + len -= ((mb->m_flags & M_EXT) ? mb->m_ext.ext_size : MHLEN); + if (mtail != NULL) + mtail->m_next = mb; + else + nm = mb; + mtail = mb; + } + + m = nm; + m->m_data += align; + + progress = 0; + /* Fill all mbufs with uio data and update header information. 
*/ + for (mb = m; mb != NULL; mb = mb->m_next) { + len = imin(M_TRAILINGSPACE(mb), tot - progress); + + error = uiomove(mtod(mb, char *), len, uio); + if (error != 0) { + m_freem(m); + return (error); + } + + /* each mbuf is M_PKTHDR chained via m_next */ + mb->m_len = len; + mb->m_pkthdr.len = len; + + progress += len; + } + VERIFY(progress == tot); + *top = m; + return (0); +} + +/* + * MPTCP socket protocol-user socket send routine, derived from sosend(). + */ +static int +mptcp_usr_sosend(struct socket *mp_so, struct sockaddr *addr, struct uio *uio, + struct mbuf *top, struct mbuf *control, int flags) +{ +#pragma unused(addr) + int32_t space; + user_ssize_t resid; + int error, sendflags; + struct proc *p = current_proc(); + int sblocked = 0; + + /* UIO is required for now, due to per-mbuf M_PKTHDR constrains */ + if (uio == NULL || top != NULL) { + error = EINVAL; + goto out; + } + resid = uio_resid(uio); + + socket_lock(mp_so, 1); + so_update_last_owner_locked(mp_so, p); + so_update_policy(mp_so); + + VERIFY(mp_so->so_type == SOCK_STREAM); + VERIFY(!(mp_so->so_flags & SOF_MP_SUBFLOW)); + + if ((flags & (MSG_OOB|MSG_DONTROUTE|MSG_HOLD|MSG_SEND|MSG_FLUSH)) || + (mp_so->so_flags & SOF_ENABLE_MSGS)) { + error = EOPNOTSUPP; + socket_unlock(mp_so, 1); + goto out; + } + + /* + * In theory resid should be unsigned. However, space must be + * signed, as it might be less than 0 if we over-committed, and we + * must use a signed comparison of space and resid. On the other + * hand, a negative resid causes us to loop sending 0-length + * segments to the protocol. 
+ */ + if (resid < 0 || (flags & MSG_EOR) || control != NULL) { + error = EINVAL; + socket_unlock(mp_so, 1); + goto out; + } + + OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd); + + do { + error = sosendcheck(mp_so, NULL, resid, 0, 0, flags, + &sblocked, NULL); + if (error != 0) + goto release; + + space = sbspace(&mp_so->so_snd); + do { + socket_unlock(mp_so, 0); + /* + * Copy the data from userland into an mbuf chain. + */ + error = mptcp_uiotombuf(uio, M_WAITOK, space, 0, &top); + if (error != 0) { + socket_lock(mp_so, 0); + goto release; + } + VERIFY(top != NULL); + space -= resid - uio_resid(uio); + resid = uio_resid(uio); + socket_lock(mp_so, 0); + + /* + * Compute flags here, for pru_send and NKEs. + */ + sendflags = (resid > 0 && space > 0) ? + PRUS_MORETOCOME : 0; + + /* + * Socket filter processing + */ + VERIFY(control == NULL); + error = sflt_data_out(mp_so, NULL, &top, &control, 0); + if (error != 0) { + if (error == EJUSTRETURN) { + error = 0; + top = NULL; + /* always free control if any */ + } + goto release; + } + if (control != NULL) { + m_freem(control); + control = NULL; + } + + /* + * Pass data to protocol. + */ + error = (*mp_so->so_proto->pr_usrreqs->pru_send) + (mp_so, sendflags, top, NULL, NULL, p); + + top = NULL; + if (error != 0) + goto release; + } while (resid != 0 && space > 0); + } while (resid != 0); + +release: + if (sblocked) + sbunlock(&mp_so->so_snd, FALSE); /* will unlock socket */ + else + socket_unlock(mp_so, 1); +out: + if (top != NULL) + m_freem(top); + if (control != NULL) + m_freem(control); + + return (error); +} + +/* + * Called to filter SOPT_{SET,GET} for SOL_SOCKET level socket options. + * This routine simply indicates to the caller whether or not to proceed + * further with the given socket option. This is invoked by sosetoptlock() + * and sogetoptlock(). 
+ */ +static int +mptcp_usr_socheckopt(struct socket *mp_so, struct sockopt *sopt) +{ +#pragma unused(mp_so) + int error = 0; + + VERIFY(sopt->sopt_level == SOL_SOCKET); + + /* + * We could check for sopt_dir (set/get) here, but we'll just + * let the caller deal with it as appropriate; therefore the + * following is a superset of the socket options which we + * allow for set/get. + * + * XXX: adi@apple.com + * + * Need to consider the following cases: + * + * a. In the event peeloff(2) occurs on the subflow socket, + * we may want to issue those options which are now + * handled at the MP socket. In that case, we will need + * to record them in mptcp_setopt() so that they can + * be replayed during peeloff. + * + * b. Certain socket options don't have a clear definition + * on the expected behavior post connect(2). At the time + * those options are issued on the MP socket, there may + * be existing subflow sockets that are already connected. + */ + switch (sopt->sopt_name) { + case SO_LINGER: /* MP */ + case SO_LINGER_SEC: /* MP */ + case SO_TYPE: /* MP */ + case SO_NREAD: /* MP */ + case SO_NWRITE: /* MP */ + case SO_ERROR: /* MP */ + case SO_SNDBUF: /* MP */ + case SO_RCVBUF: /* MP */ + case SO_SNDLOWAT: /* MP */ + case SO_RCVLOWAT: /* MP */ + case SO_SNDTIMEO: /* MP */ + case SO_RCVTIMEO: /* MP */ + case SO_NKE: /* MP */ + case SO_NOSIGPIPE: /* MP */ + case SO_NOADDRERR: /* MP */ + case SO_LABEL: /* MP */ + case SO_PEERLABEL: /* MP */ + case SO_DEFUNCTOK: /* MP */ + case SO_ISDEFUNCT: /* MP */ + case SO_TRAFFIC_CLASS_DBG: /* MP */ + /* + * Tell the caller that these options are to be processed. 
+ */ + break; + + case SO_DEBUG: /* MP + subflow */ + case SO_KEEPALIVE: /* MP + subflow */ + case SO_USELOOPBACK: /* MP + subflow */ + case SO_RANDOMPORT: /* MP + subflow */ + case SO_TRAFFIC_CLASS: /* MP + subflow */ + case SO_RECV_TRAFFIC_CLASS: /* MP + subflow */ + case SO_PRIVILEGED_TRAFFIC_CLASS: /* MP + subflow */ + case SO_RECV_ANYIF: /* MP + subflow */ + case SO_RESTRICTIONS: /* MP + subflow */ + case SO_FLUSH: /* MP + subflow */ + /* + * Tell the caller that these options are to be processed; + * these will also be recorded later by mptcp_setopt(). + * + * NOTE: Only support integer option value for now. + */ + if (sopt->sopt_valsize != sizeof (int)) + error = EINVAL; + break; + + default: + /* + * Tell the caller to stop immediately and return an error. + */ + error = ENOPROTOOPT; + break; + } + + return (error); +} + +/* + * Issue SOPT_SET for all MPTCP subflows (for integer option values.) + */ +static int +mptcp_setopt_apply(struct mptses *mpte, struct mptopt *mpo) +{ + struct socket *mp_so; + struct mptsub *mpts; + struct mptopt smpo; + int error = 0; + + /* just bail now if this isn't applicable to subflow sockets */ + if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) { + error = ENOPROTOOPT; + goto out; + } + + /* + * Skip those that are handled internally; these options + * should not have been recorded and marked with the + * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case. + */ + if (mpo->mpo_level == SOL_SOCKET && + (mpo->mpo_name == SO_NOSIGPIPE || mpo->mpo_name == SO_NOADDRERR)) { + error = ENOPROTOOPT; + goto out; + } + + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + mp_so = mpte->mpte_mppcb->mpp_socket; + + /* + * Don't bother going further if there's no subflow; mark the option + * with MPOF_INTERIM so that we know whether or not to remove this + * option upon encountering an error while issuing it during subflow + * socket creation. 
+ */ + if (mpte->mpte_numflows == 0) { + VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows)); + mpo->mpo_flags |= MPOF_INTERIM; + /* return success */ + goto out; + } + + bzero(&smpo, sizeof (smpo)); + smpo.mpo_flags |= MPOF_SUBFLOW_OK; + smpo.mpo_level = mpo->mpo_level; + smpo.mpo_name = mpo->mpo_name; + + /* grab exisiting values in case we need to rollback */ + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + struct socket *so; + + MPTS_LOCK(mpts); + mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG); + mpts->mpts_oldintval = 0; + smpo.mpo_intval = 0; + VERIFY(mpts->mpts_socket != NULL); + so = mpts->mpts_socket; + socket_lock(so, 0); + if (mptcp_subflow_sogetopt(mpte, so, &smpo) == 0) { + mpts->mpts_flags |= MPTSF_SOPT_OLDVAL; + mpts->mpts_oldintval = smpo.mpo_intval; + } + socket_unlock(so, 0); + MPTS_UNLOCK(mpts); + } + + /* apply socket option */ + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + struct socket *so; + + MPTS_LOCK(mpts); + mpts->mpts_flags |= MPTSF_SOPT_INPROG; + VERIFY(mpts->mpts_socket != NULL); + so = mpts->mpts_socket; + socket_lock(so, 0); + error = mptcp_subflow_sosetopt(mpte, so, mpo); + socket_unlock(so, 0); + MPTS_UNLOCK(mpts); + if (error != 0) + break; + } + + /* cleanup, and rollback if needed */ + TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) { + struct socket *so; + + MPTS_LOCK(mpts); + if (!(mpts->mpts_flags & MPTSF_SOPT_INPROG)) { + /* clear in case it's set */ + mpts->mpts_flags &= ~MPTSF_SOPT_OLDVAL; + mpts->mpts_oldintval = 0; + MPTS_UNLOCK(mpts); + continue; + } + if (!(mpts->mpts_flags & MPTSF_SOPT_OLDVAL)) { + mpts->mpts_flags &= ~MPTSF_SOPT_INPROG; + VERIFY(mpts->mpts_oldintval == 0); + MPTS_UNLOCK(mpts); + continue; + } + /* error during sosetopt, so roll it back */ + if (error != 0) { + VERIFY(mpts->mpts_socket != NULL); + so = mpts->mpts_socket; + socket_lock(so, 0); + smpo.mpo_intval = mpts->mpts_oldintval; + (void) mptcp_subflow_sosetopt(mpte, so, &smpo); + socket_unlock(so, 0); + } + 
mpts->mpts_oldintval = 0; + mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG); + MPTS_UNLOCK(mpts); + } + +out: + return (error); +} + +/* + * Handle SOPT_SET for socket options issued on MP socket. + */ +static int +mptcp_setopt(struct mptses *mpte, struct sockopt *sopt) +{ + int error = 0, optval, level, optname, rec = 1; + struct mptopt smpo, *mpo = NULL; + struct socket *mp_so; + char buf[32]; + + level = sopt->sopt_level; + optname = sopt->sopt_name; + + VERIFY(sopt->sopt_dir == SOPT_SET); + VERIFY(level == SOL_SOCKET || level == IPPROTO_TCP); + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + mp_so = mpte->mpte_mppcb->mpp_socket; + + /* + * Record socket options which are applicable to subflow sockets so + * that we can replay them for new ones; see mptcp_usr_socheckopt() + * for the list of eligible socket-level options. + */ + if (level == SOL_SOCKET) { + switch (optname) { + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_USELOOPBACK: + case SO_RANDOMPORT: + case SO_TRAFFIC_CLASS: + case SO_RECV_TRAFFIC_CLASS: + case SO_PRIVILEGED_TRAFFIC_CLASS: + case SO_RECV_ANYIF: + case SO_RESTRICTIONS: + /* record it */ + break; + case SO_FLUSH: + /* don't record it */ + rec = 0; + break; + default: + /* nothing to do; just return success */ + goto out; + } + } else { + switch (optname) { + case TCP_NODELAY: + case TCP_RXT_FINDROP: + case TCP_KEEPALIVE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_CONNECTIONTIMEOUT: + case TCP_RXT_CONNDROPTIME: + case PERSIST_TIMEOUT: + /* eligible; record it */ + break; + default: + /* not eligible */ + error = ENOPROTOOPT; + goto out; + } + } + + if ((error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval))) != 0) + goto out; + + if (rec) { + /* search for an existing one; if not found, allocate */ + if ((mpo = mptcp_sopt_find(mpte, sopt)) == NULL) + mpo = mptcp_sopt_alloc(M_WAITOK); + + if (mpo == NULL) { + error = ENOBUFS; + } else { + mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s " + "val 
%d %s\n", __func__, + (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), + mptcp_sopt2str(level, optname, buf, + sizeof (buf)), optval, + (mpo->mpo_flags & MPOF_ATTACHED) ? + "updated" : "recorded")); + + /* initialize or update, as needed */ + mpo->mpo_intval = optval; + if (!(mpo->mpo_flags & MPOF_ATTACHED)) { + mpo->mpo_level = level; + mpo->mpo_name = optname; + mptcp_sopt_insert(mpte, mpo); + } + VERIFY(mpo->mpo_flags & MPOF_ATTACHED); + /* this can be issued on the subflow socket */ + mpo->mpo_flags |= MPOF_SUBFLOW_OK; + } + } else { + bzero(&smpo, sizeof (smpo)); + mpo = &smpo; + mpo->mpo_flags |= MPOF_SUBFLOW_OK; + mpo->mpo_level = level; + mpo->mpo_name = optname; + mpo->mpo_intval = optval; + } + VERIFY(mpo == NULL || error == 0); + + /* issue this socket option on existing subflows */ + if (error == 0) { + error = mptcp_setopt_apply(mpte, mpo); + if (error != 0 && (mpo->mpo_flags & MPOF_ATTACHED)) { + VERIFY(mpo != &smpo); + mptcp_sopt_remove(mpte, mpo); + mptcp_sopt_free(mpo); + } + if (mpo == &smpo) + mpo->mpo_flags &= ~MPOF_INTERIM; + } +out: + if (error == 0 && mpo != NULL) { + mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s val %d set %s\n", + __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), + mptcp_sopt2str(level, optname, buf, + sizeof (buf)), optval, (mpo->mpo_flags & MPOF_INTERIM) ? + "pending" : "successful")); + } else if (error != 0) { + mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s can't be issued " + "error %d\n", __func__, + (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mptcp_sopt2str(level, + optname, buf, sizeof (buf)), error)); + } + return (error); +} + +/* + * Handle SOPT_GET for socket options issued on MP socket. 
+ */ +static int +mptcp_getopt(struct mptses *mpte, struct sockopt *sopt) +{ + int error = 0, optval; + + VERIFY(sopt->sopt_dir == SOPT_GET); + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + + /* + * We only handle SOPT_GET for TCP level socket options; we should + * not get here for socket level options since they are already + * handled at the socket layer. + */ + if (sopt->sopt_level != IPPROTO_TCP) { + error = ENOPROTOOPT; + goto out; + } + + switch (sopt->sopt_name) { + case TCP_NODELAY: + case TCP_RXT_FINDROP: + case TCP_KEEPALIVE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_CONNECTIONTIMEOUT: + case TCP_RXT_CONNDROPTIME: + case PERSIST_TIMEOUT: + /* eligible; get the default value just in case */ + error = mptcp_default_tcp_optval(mpte, sopt, &optval); + break; + default: + /* not eligible */ + error = ENOPROTOOPT; + break; + } + + /* + * Search for a previously-issued TCP level socket option and + * return the recorded option value. This assumes that the + * value did not get modified by the lower layer after it was + * issued at setsockopt(2) time. If not found, we'll return + * the default value obtained ealier. + */ + if (error == 0) { + struct mptopt *mpo; + + if ((mpo = mptcp_sopt_find(mpte, sopt)) != NULL) + optval = mpo->mpo_intval; + + error = sooptcopyout(sopt, &optval, sizeof (int)); + } +out: + return (error); +} + +/* + * Return default values for TCP socket options. Ideally we would query the + * subflow TCP socket, but that requires creating a subflow socket before + * connectx(2) time. To simplify things, just return the default values + * that we know of. 
+ */ +static int +mptcp_default_tcp_optval(struct mptses *mpte, struct sockopt *sopt, int *optval) +{ + int error = 0; + + VERIFY(sopt->sopt_level == IPPROTO_TCP); + VERIFY(sopt->sopt_dir == SOPT_GET); + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + + /* try to do what tcp_newtcpcb() does */ + switch (sopt->sopt_name) { + case TCP_NODELAY: + case TCP_RXT_FINDROP: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_CONNECTIONTIMEOUT: + case TCP_RXT_CONNDROPTIME: + *optval = 0; + break; + + case TCP_KEEPALIVE: + *optval = mptcp_subflow_keeptime; + break; + + case PERSIST_TIMEOUT: + *optval = tcp_max_persist_timeout; + break; + + default: + error = ENOPROTOOPT; + break; + } + return (error); +} + +/* + * MPTCP SOPT_{SET,GET} socket option handler, for options issued on the MP + * socket, at SOL_SOCKET and IPPROTO_TCP levels. The former is restricted + * to those that are allowed by mptcp_usr_socheckopt(). + */ +int +mptcp_ctloutput(struct socket *mp_so, struct sockopt *sopt) +{ + struct mppcb *mpp = sotomppcb(mp_so); + struct mptses *mpte; + int error = 0; + + if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) { + error = EINVAL; + goto out; + } + mpte = mptompte(mpp); + MPTE_LOCK_ASSERT_HELD(mpte); /* same as MP socket lock */ + + /* we only handle socket and TCP-level socket options for MPTCP */ + if (sopt->sopt_level != SOL_SOCKET && sopt->sopt_level != IPPROTO_TCP) { + char buf[32]; + mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s level not " + "handled\n", __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), + mptcp_sopt2str(sopt->sopt_level, + sopt->sopt_name, buf, sizeof (buf)))); + error = EINVAL; + goto out; + } + + switch (sopt->sopt_dir) { + case SOPT_SET: + error = mptcp_setopt(mpte, sopt); + break; + + case SOPT_GET: + error = mptcp_getopt(mpte, sopt); + break; + } +out: + return (error); +} + +/* + * Return a string representation of + */ +const char * +mptcp_sopt2str(int level, int optname, char *dst, int size) +{ + char lbuf[32], 
obuf[32]; + const char *l = lbuf, *o = obuf; + + (void) snprintf(lbuf, sizeof (lbuf), "0x%x", level); + (void) snprintf(obuf, sizeof (obuf), "0x%x", optname); + + switch (level) { + case SOL_SOCKET: + l = "SOL_SOCKET"; + switch (optname) { + case SO_LINGER: + o = "SO_LINGER"; + break; + case SO_LINGER_SEC: + o = "SO_LINGER_SEC"; + break; + case SO_DEBUG: + o = "SO_DEBUG"; + break; + case SO_KEEPALIVE: + o = "SO_KEEPALIVE"; + break; + case SO_USELOOPBACK: + o = "SO_USELOOPBACK"; + break; + case SO_TYPE: + o = "SO_TYPE"; + break; + case SO_NREAD: + o = "SO_NREAD"; + break; + case SO_NWRITE: + o = "SO_NWRITE"; + break; + case SO_ERROR: + o = "SO_ERROR"; + break; + case SO_SNDBUF: + o = "SO_SNDBUF"; + break; + case SO_RCVBUF: + o = "SO_RCVBUF"; + break; + case SO_SNDLOWAT: + o = "SO_SNDLOWAT"; + break; + case SO_RCVLOWAT: + o = "SO_RCVLOWAT"; + break; + case SO_SNDTIMEO: + o = "SO_SNDTIMEO"; + break; + case SO_RCVTIMEO: + o = "SO_RCVTIMEO"; + break; + case SO_NKE: + o = "SO_NKE"; + break; + case SO_NOSIGPIPE: + o = "SO_NOSIGPIPE"; + break; + case SO_NOADDRERR: + o = "SO_NOADDRERR"; + break; + case SO_RESTRICTIONS: + o = "SO_RESTRICTIONS"; + break; + case SO_LABEL: + o = "SO_LABEL"; + break; + case SO_PEERLABEL: + o = "SO_PEERLABEL"; + break; + case SO_RANDOMPORT: + o = "SO_RANDOMPORT"; + break; + case SO_TRAFFIC_CLASS: + o = "SO_TRAFFIC_CLASS"; + break; + case SO_RECV_TRAFFIC_CLASS: + o = "SO_RECV_TRAFFIC_CLASS"; + break; + case SO_TRAFFIC_CLASS_DBG: + o = "SO_TRAFFIC_CLASS_DBG"; + break; + case SO_PRIVILEGED_TRAFFIC_CLASS: + o = "SO_PRIVILEGED_TRAFFIC_CLASS"; + break; + case SO_DEFUNCTOK: + o = "SO_DEFUNCTOK"; + break; + case SO_ISDEFUNCT: + o = "SO_ISDEFUNCT"; + break; + case SO_OPPORTUNISTIC: + o = "SO_OPPORTUNISTIC"; + break; + case SO_FLUSH: + o = "SO_FLUSH"; + break; + case SO_RECV_ANYIF: + o = "SO_RECV_ANYIF"; + break; + } + break; + case IPPROTO_TCP: + l = "IPPROTO_TCP"; + switch (optname) { + case TCP_KEEPALIVE: + o = "TCP_KEEPALIVE"; + break; + case 
TCP_KEEPINTVL: + o = "TCP_KEEPINTVL"; + break; + case TCP_KEEPCNT: + o = "TCP_KEEPCNT"; + break; + case TCP_CONNECTIONTIMEOUT: + o = "TCP_CONNECTIONTIMEOUT"; + break; + case TCP_RXT_CONNDROPTIME: + o = "TCP_RXT_CONNDROPTIME"; + break; + case PERSIST_TIMEOUT: + o = "PERSIST_TIMEOUT"; + break; + } + break; + } + + (void) snprintf(dst, size, "<%s,%s>", l, o); + return (dst); +}