X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/593a1d5fd87cdf5b46dd5fcb84467b432cea0f91..bd504ef0e0b883cdd7917b73b3574eb9ce669905:/bsd/kern/uipc_socket.c diff --git a/bsd/kern/uipc_socket.c b/bsd/kern/uipc_socket.c index 57dff6de9..af4b4fbe1 100644 --- a/bsd/kern/uipc_socket.c +++ b/bsd/kern/uipc_socket.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 1998-2007 Apple Inc. All rights reserved. + * Copyright (c) 1998-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -92,33 +92,38 @@ #include #include #include +#include +#include #include +#include #include #include +#include +#include #include #include #include #include #include +#include +#include +#include +#include #if CONFIG_MACF #include #include #endif /* MAC */ -/* how a timeval looks to a 64-bit process */ -struct timeval64 { - int64_t tv_sec; - int32_t tv_usec; -}; int so_cache_hw = 0; int so_cache_timeouts = 0; int so_cache_max_freed = 0; int cached_sock_count = 0; +__private_extern__ int max_cached_sock_count = MAX_CACHED_SOCKETS; struct socket *socket_cache_head = 0; struct socket *socket_cache_tail = 0; -u_long so_cache_time = 0; +u_int32_t so_cache_time = 0; int so_cache_init_done = 0; struct zone *so_cache_zone; @@ -133,7 +138,8 @@ static void filt_sordetach(struct knote *kn); static int filt_soread(struct knote *kn, long hint); static void filt_sowdetach(struct knote *kn); static int filt_sowrite(struct knote *kn, long hint); -static int filt_solisten(struct knote *kn, long hint); +static void filt_sockdetach(struct knote *kn); +static int filt_sockev(struct knote *kn, long hint); static int sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p); @@ -141,12 +147,21 @@ sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p); static int sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p); -static struct filterops solisten_filtops = - { 1, NULL, filt_sordetach, filt_solisten }; -static struct filterops soread_filtops = - { 1, NULL, filt_sordetach, filt_soread }; -static struct filterops sowrite_filtops = - { 1, NULL, filt_sowdetach, filt_sowrite }; +static struct filterops soread_filtops = { + .f_isfd = 1, + .f_detach = filt_sordetach, + .f_event = filt_soread, +}; +static struct filterops sowrite_filtops = { + .f_isfd = 1, + .f_detach = filt_sowdetach, + .f_event = filt_sowrite, +}; +static struct filterops sock_filtops = { + .f_isfd = 1, + .f_detach = filt_sockdetach, + .f_event = filt_sockev, +}; #define EVEN_MORE_LOCKING_DEBUG 0 int socket_debug = 0; @@ -170,15 +185,15 @@ MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); SYSCTL_DECL(_kern_ipc); int somaxconn = SOMAXCONN; -SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, ""); /* Should we get a maximum also ??? */ static int sosendmaxchain = 65536; static int sosendminchain = 16384; static int sorecvmincopy = 16384; -SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain, +SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, ""); -SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy, +SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, ""); /* @@ -186,7 +201,7 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy, * the socket is marked with SOF_MULTIPAGES; see below. 
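 *
 * A hedged userspace sketch (illustrative, not part of this change) of
 * reading the knob exported by the SYSCTL_INT() below; the OID name
 * "kern.ipc.sosendjcl" follows from the _kern_ipc parent, and
 * sysctlbyname(3) is the assumed interface:
 *
 *	int val = 0;
 *	size_t len = sizeof (val);
 *	if (sysctlbyname("kern.ipc.sosendjcl", &val, &len, NULL, 0) == 0)
 *		printf("sosendjcl=%d\n", val);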
*/ int sosendjcl = 1; -SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, ""); /* * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large @@ -200,9 +215,17 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, ""); * capable. Set this to 1 only for testing/debugging purposes. */ int sosendjcl_ignore_capab = 0; -SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW, +SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, ""); +int sodefunctlog = 0; +SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED, + &sodefunctlog, 0, ""); + +int sothrottlelog = 0; +SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED, + &sothrottlelog, 0, ""); + /* * Socket operation routines. * These routines are called by the routines in @@ -214,6 +237,7 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW, /* sys_generic.c */ extern void postevent(struct socket *, struct sockbuf *, int); extern void evsofree(struct socket *); +extern int tcp_notsent_lowat_check(struct socket *so); /* TODO: these should be in header file */ extern int get_inpcb_str_size(void); @@ -233,7 +257,16 @@ static void cached_sock_free(struct socket *); static void so_cache_timer(void *); void soclose_wait_locked(struct socket *so); +int so_isdstlocal(struct socket *so); +/* + * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from + * setting the DSCP code on the packet based on the service class; see + * for details. + */ +__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP; +SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED, + &sotcdb, 0, ""); void socketinit(void) @@ -272,6 +305,8 @@ socketinit(void) get_inpcb_str_size() + 4 + get_tcp_str_size()); so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone"); + zone_change(so_cache_zone, Z_CALLERACCT, FALSE); + zone_change(so_cache_zone, Z_NOENCRYPT, TRUE); #if TEMPDEBUG printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size); #endif @@ -280,13 +315,19 @@ socketinit(void) so_cache_zone_element_size = str_size; sflt_init(); + + _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX); + + socket_tclass_init(); + + socket_flowadv_init(); } static void cached_sock_alloc(struct socket **so, int waitok) { caddr_t temp; - register u_long offset; + register uintptr_t offset; lck_mtx_lock(so_cache_mtx); @@ -333,20 +374,18 @@ cached_sock_alloc(struct socket **so, int waitok) * Define offsets for extra structures into our single block of * memory. Align extra structures on longword boundaries. 
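 *
 * ALIGN() below is the usual BSD rounding macro, effectively
 * (offset + ALIGNBYTES) & ~ALIGNBYTES; e.g. with 8-byte alignment an
 * offset of 0x1009 rounds up to 0x1010 before so_saved_pcb is placed
 * there.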
*/ - offset = (u_long) *so; + + offset = (uintptr_t) *so; offset += sizeof (struct socket); - if (offset & 0x3) { - offset += 4; - offset &= 0xfffffffc; - } + + offset = ALIGN(offset); + (*so)->so_saved_pcb = (caddr_t)offset; offset += get_inpcb_str_size(); - if (offset & 0x3) { - offset += 4; - offset &= 0xfffffffc; - } - ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb = + offset = ALIGN(offset); + + ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb = (caddr_t)offset; #if TEMPDEBUG kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n", @@ -364,7 +403,7 @@ cached_sock_free(struct socket *so) lck_mtx_lock(so_cache_mtx); - if (++cached_sock_count > MAX_CACHED_SOCKETS) { + if (++cached_sock_count > max_cached_sock_count) { --cached_sock_count; lck_mtx_unlock(so_cache_mtx); #if TEMPDEBUG @@ -396,6 +435,24 @@ cached_sock_free(struct socket *so) #endif } +static void +so_update_last_owner_locked( + struct socket *so, + proc_t self) +{ + if (so->last_pid != 0) + { + if (self == NULL) + self = current_proc(); + + if (self) + { + so->last_upid = proc_uniqueid(self); + so->last_pid = proc_pid(self); + } + } +} + static void so_cache_timer(__unused void *dummy) { @@ -486,6 +543,7 @@ socreate(int dom, struct socket **aso, int type, int proto) register struct protosw *prp; register struct socket *so; register int error = 0; + #if TCPDEBUG extern int tcpconsdebug; #endif @@ -507,19 +565,20 @@ socreate(int dom, struct socket **aso, int type, int proto) } if (prp->pr_type != type) return (EPROTOTYPE); - so = soalloc(p != 0, dom, type); + so = soalloc(1, dom, type); if (so == 0) return (ENOBUFS); TAILQ_INIT(&so->so_incomp); TAILQ_INIT(&so->so_comp); so->so_type = type; + so->last_upid = proc_uniqueid(p); + so->last_pid = proc_pid(p); + + so->so_cred = kauth_cred_proc_ref(p); + if (!suser(kauth_cred_get(), NULL)) + so->so_state = SS_PRIV; - if (p != 0) { - so->so_uid = kauth_cred_getuid(kauth_cred_get()); - if (!suser(kauth_cred_get(), NULL)) - so->so_state = SS_PRIV; - } so->so_proto = prp; #ifdef __APPLE__ so->so_rcv.sb_flags |= SB_RECV; /* XXX */ @@ -562,6 +621,27 @@ socreate(int dom, struct socket **aso, int type, int proto) so->so_options |= SO_DEBUG; #endif #endif + so_set_default_traffic_class(so); + /* + * If this is a background thread/task, mark the socket as such. + */ + if (proc_get_self_isbackground() != 0) { + socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND); + so->so_background_thread = current_thread(); + } + + switch (dom) { + /* + * Don't mark Unix domain or system sockets as eligible for defunct by default. + */ + case PF_LOCAL: + case PF_SYSTEM: + so->so_flags |= SOF_NODEFUNCT; + break; + default: + break; + } + *aso = so; return (0); } @@ -593,40 +673,25 @@ sobind(struct socket *so, struct sockaddr *nam) { struct proc *p = current_proc(); int error = 0; - struct socket_filter_entry *filter; - int filtered = 0; socket_lock(so, 1); + VERIFY(so->so_usecount > 1); + so_update_last_owner_locked(so, p); /* - * If this is a bind request on a previously-accepted socket - * that has been marked as inactive, reject it now before - * we go any further. + * If this is a bind request on a socket that has been marked + * as inactive, reject it now before we go any further. 
*/ if (so->so_flags & SOF_DEFUNCT) { error = EINVAL; + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so), + error)); goto out; } /* Socket filter */ - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_bind) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_bind(filter->sfe_cookie, so, nam); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - } - /* End socket filter */ + error = sflt_bind(so, nam); if (error == 0) error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p); @@ -642,6 +707,11 @@ out: void sodealloc(struct socket *so) { + kauth_cred_unref(&so->so_cred); + + /* Remove any filters */ + sflt_termsock(so); + so->so_gencnt = ++so_gencnt; #if CONFIG_MACF_SOCKET @@ -681,10 +751,9 @@ solisten(struct socket *so, int backlog) { struct proc *p = current_proc(); int error = 0; - struct socket_filter_entry *filter; - int filtered = 0; socket_lock(so, 1); + if (so->so_proto == NULL) { error = EINVAL; goto out; @@ -696,13 +765,18 @@ solisten(struct socket *so, int backlog) /* * If the listen request is made on a socket that is not fully - * disconnected, or on a previously-accepted socket that has - * been marked as inactive, reject the request now. + * disconnected, or on a socket that has been marked as inactive, + * reject the request now. */ if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) || (so->so_flags & SOF_DEFUNCT)) { error = EINVAL; + if (so->so_flags & SOF_DEFUNCT) { + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_pid(p), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), error)); + } goto out; } @@ -711,23 +785,7 @@ solisten(struct socket *so, int backlog) goto out; } - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_listen) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_listen(filter->sfe_cookie, so); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - } + error = sflt_listen(so); if (error == 0) { error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p); @@ -771,9 +829,6 @@ sofreelastref(struct socket *so, int dealloc) /* Assume socket is locked */ - /* Remove any filters - may be called more than once */ - sflt_termsock(so); - if ((!(so->so_flags & SOF_PCBCLEARING)) || ((so->so_state & SS_NOFDREF) == 0)) { #ifdef __APPLE__ @@ -841,10 +896,10 @@ soclose_wait_locked(struct socket *so) * Double check here and return if there's no outstanding upcall; * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set. */ - if (!(so->so_flags & SOF_UPCALLINUSE) || - !(so->so_flags & SOF_UPCALLCLOSEWAIT)) + if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) return; - + so->so_rcv.sb_flags &= ~SB_UPCALL; + so->so_snd.sb_flags &= ~SB_UPCALL; so->so_flags |= SOF_CLOSEWAIT; (void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1), "soclose_wait_locked", NULL); @@ -980,6 +1035,15 @@ drop: if (so->so_usecount == 0) panic("soclose: usecount is zero so=%p\n", so); if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) { + /* + * Let NetworkStatistics know this PCB is going away + * before we detach it. 
+	 */
+	if (nstat_collect &&
+	    (so->so_proto->pr_domain->dom_family == AF_INET ||
+	    so->so_proto->pr_domain->dom_family == AF_INET6))
+		nstat_pcb_detach(so->so_pcb);
+
 		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
 		if (error == 0)
 			error = error2;
@@ -990,6 +1054,9 @@ discard:
 	if (so->so_pcb && so->so_state & SS_NOFDREF)
 		panic("soclose: NOFDREF");
 	so->so_state |= SS_NOFDREF;
+
+	if ((so->so_flags & SOF_KNOTE) != 0)
+		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
 #ifdef __APPLE__
 	so->so_proto->pr_domain->dom_refs--;
 	evsofree(so);
@@ -1005,7 +1072,7 @@ soclose(struct socket *so)
 	int error = 0;
 	socket_lock(so, 1);
-	if (so->so_flags & SOF_UPCALLINUSE)
+	if (so->so_upcallusecount)
 		soclose_wait_locked(so);
 
 	if (so->so_retaincnt == 0) {
@@ -1082,48 +1149,28 @@ int
 soacceptfilter(struct socket *so)
 {
 	struct sockaddr *local = NULL, *remote = NULL;
-	struct socket_filter_entry *filter;
-	int error = 0, filtered = 0;
+	int error = 0;
 	struct socket *head = so->so_head;
 
 	/*
-	 * There's no need to hold the lock; this socket
+	 * Hold the lock even if this socket
 	 * has not been made visible to the filter(s).
+	 * For sockets with global locks, this protects against the
+	 * head or peer going away
 	 */
-	if ((sock_getaddr(so, &remote, 1) != 0) ||
-	    sock_getaddr(so, &local, 0) != 0) {
+	socket_lock(so, 1);
+	if (sogetaddr_locked(so, &remote, 1) != 0 ||
+	    sogetaddr_locked(so, &local, 0) != 0) {
 		so->so_state &= ~(SS_NOFDREF | SS_COMP);
 		so->so_head = NULL;
+		socket_unlock(so, 1);
 		soclose(so);
 		/* Out of resources; try it again next time */
 		error = ECONNABORTED;
 		goto done;
 	}
 
-	/*
-	 * At this point, we have a reference on the listening socket
-	 * so we know it won't be going away.  Do the same for the newly
-	 * accepted socket while we invoke the accept callback routine.
-	 */
-	socket_lock(so, 1);
-	for (filter = so->so_filt; filter != NULL && error == 0;
-	    filter = filter->sfe_next_onsocket) {
-		if (filter->sfe_filter->sf_filter.sf_accept != NULL) {
-			if (!filtered) {
-				filtered = 1;
-				sflt_use(so);
-				socket_unlock(so, 0);
-			}
-			error = filter->sfe_filter->sf_filter.
-			    sf_accept(filter->sfe_cookie,
-			    head, so, local, remote);
-		}
-	}
-
-	if (filtered) {
-		socket_lock(so, 0);
-		sflt_unuse(so);
-	}
+	error = sflt_accept(head, so, local, remote);
 
 	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
@@ -1132,10 +1179,8 @@ soacceptfilter(struct socket *so)
 	 */
 	if (error == EJUSTRETURN) {
 		error = 0;
-		so->so_flags |= SOF_DEFUNCT;
-		/* Prevent data from being appended to the socket buffers */
-		so->so_snd.sb_flags |= SB_DROP;
-		so->so_rcv.sb_flags |= SB_DROP;
+		(void) sosetdefunct(current_proc(), so,
+		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
 	}
 
 	if (error != 0) {
@@ -1181,15 +1226,21 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
 
 	if (dolock)
 		socket_lock(so, 1);
-
+
 	/*
 	 * If this is a listening socket or if this is a previously-accepted
 	 * socket that has been marked as inactive, reject the connect request.
*/ if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) { + error = EOPNOTSUPP; + if (so->so_flags & SOF_DEFUNCT) { + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", + __func__, proc_pid(p), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), error)); + } if (dolock) socket_unlock(so, 1); - return (EOPNOTSUPP); + return (error); } if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) { @@ -1213,36 +1264,14 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) * Run connect filter before calling protocol: * - non-blocking connect returns before completion; */ - struct socket_filter_entry *filter; - int filtered = 0; - - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_connect_out) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. - sf_connect_out(filter->sfe_cookie, so, nam); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - } + error = sflt_connectout(so, nam); if (error) { if (error == EJUSTRETURN) error = 0; - if (dolock) - socket_unlock(so, 1); - return (error); + } else { + error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p); } - - error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p); } if (dolock) socket_unlock(so, 1); @@ -1330,11 +1359,11 @@ sodisconnect(struct socket *so) * [so_error]:??? */ static int -sosendcheck(struct socket *so, struct sockaddr *addr, long resid, long clen, - long atomic, int flags, int *sblocked) +sosendcheck(struct socket *so, struct sockaddr *addr, int32_t resid, int32_t clen, + int32_t atomic, int flags, int *sblocked) { - int error = 0; - long space; + int error = 0; + int32_t space; int assumelock = 0; restart: @@ -1352,6 +1381,8 @@ restart: } else { error = sblock(&so->so_snd, SBLOCKWAIT(flags)); if (error) { + if (so->so_flags & SOF_DEFUNCT) + goto defunct; return (error); } *sblocked = 1; @@ -1359,12 +1390,17 @@ restart: } /* - * If a send attempt is made on a previously-accepted socket - * that has been marked as inactive (disconnected), reject - * the request. + * If a send attempt is made on a socket that has been marked + * as inactive (disconnected), reject the request. 
*/ - if (so->so_flags & SOF_DEFUNCT) - return (ENOTCONN); + if (so->so_flags & SOF_DEFUNCT) { +defunct: + error = EPIPE; + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__, + proc_selfpid(), so, INP_SOCKAF(so), INP_SOCKTYPE(so), + error)); + return (error); + } if (so->so_state & SS_CANTSENDMORE) return (EPIPE); @@ -1391,15 +1427,19 @@ restart: if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) return (EMSGSIZE); - if (space < resid + clen && - (atomic || space < (long)so->so_snd.sb_lowat || space < clen)) { + if ((space < resid + clen && + (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) || + (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) { if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) || assumelock) { return (EWOULDBLOCK); } sbunlock(&so->so_snd, 1); + *sblocked = 0; error = sbwait(&so->so_snd); if (error) { + if (so->so_flags & SOF_DEFUNCT) + goto defunct; return (error); } goto restart; @@ -1474,7 +1514,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, { struct mbuf **mp; register struct mbuf *m, *freelist = NULL; - register long space, len, resid; + register int32_t space, len, resid; int clen = 0, error, dontroute, mlen, sendflags; int atomic = sosendallatonce(so) || top; int sblocked = 0; @@ -1490,6 +1530,8 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat); socket_lock(so, 1); + so_update_last_owner_locked(so, p); + if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) { error = EOPNOTSUPP; socket_unlock(so, 1); @@ -1515,8 +1557,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, dontroute = (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && (so->so_proto->pr_flags & PR_ATOMIC); - if (p) - OSIncrementAtomic(&p->p_stats->p_ru.ru_msgsnd); + OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd); if (control) clen = control->m_len; @@ -1531,10 +1572,6 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, 1024 : 0); do { - struct socket_filter_entry *filter; - int filtered; - boolean_t recursive; - if (uio == NULL) { /* * Data is prepackaged in "top". @@ -1547,7 +1584,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, int bytes_to_copy; boolean_t jumbocl; - bytes_to_copy = min(resid, space); + bytes_to_copy = imin(resid, space); if (sosendminchain > 0) { chainlength = 0; @@ -1587,7 +1624,8 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, * haven't yet consumed. 
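 *
 * Worked example (illustrative): with 16 KB jumbo clusters, a
 * 64 KB copy gives num_needed = 65536 / M16KCLBYTES = 4, so the
 * single m_getpackets_internal() call below can stock the
 * freelist for the whole copy loop instead of allocating one
 * cluster per iteration.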
*/ if (freelist == NULL && - bytes_to_copy > NBPG && jumbocl) { + bytes_to_copy > MBIGCLBYTES && + jumbocl) { num_needed = bytes_to_copy / M16KCLBYTES; @@ -1610,10 +1648,10 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, if (freelist == NULL && bytes_to_copy > MCLBYTES) { num_needed = - bytes_to_copy / NBPG; + bytes_to_copy / MBIGCLBYTES; if ((bytes_to_copy - - (num_needed * NBPG)) >= + (num_needed * MBIGCLBYTES)) >= MINCLSIZE) num_needed++; @@ -1621,7 +1659,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, m_getpackets_internal( (unsigned int *)&num_needed, hdrs_needed, M_WAIT, 0, - NBPG); + MBIGCLBYTES); /* * Fall back to cluster size * if allocation failed @@ -1684,16 +1722,15 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, MHLEN - m_leadingspace(m); else mlen = MLEN; - len = min(mlen, bytes_to_copy); + len = imin(mlen, bytes_to_copy); chainlength += len; space -= len; error = uiomove(mtod(m, caddr_t), - (int)len, uio); + len, uio); - // LP64todo - fix this! resid = uio_resid(uio); m->m_len = len; @@ -1760,65 +1797,24 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, /* * Socket filter processing */ - recursive = (so->so_send_filt_thread != NULL); - filtered = 0; - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_data_out) { - int so_flags = 0; - if (filtered == 0) { - filtered = 1; - so->so_send_filt_thread = - current_thread(); - sflt_use(so); - socket_unlock(so, 0); - so_flags = - (sendflags & MSG_OOB) ? - sock_data_filt_flag_oob : 0; - } - error = filter->sfe_filter->sf_filter. - sf_data_out(filter->sfe_cookie, so, - addr, &top, &control, so_flags); + error = sflt_data_out(so, addr, &top, &control, + (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0); + if (error) { + if (error == EJUSTRETURN) { + error = 0; + clen = 0; + control = 0; + top = 0; } - } - - if (filtered) { - /* - * At this point, we've run at least one - * filter. The socket is unlocked as is - * the socket buffer. Clear the recorded - * filter thread only when we are outside - * of a filter's context. This allows for - * a filter to issue multiple inject calls - * from its sf_data_out callback routine. - */ - socket_lock(so, 0); - sflt_unuse(so); - if (!recursive) - so->so_send_filt_thread = 0; - if (error) { - if (error == EJUSTRETURN) { - error = 0; - clen = 0; - control = 0; - top = 0; - } - goto release; - } + goto release; } /* * End Socket filter processing */ - if (error == EJUSTRETURN) { - /* A socket filter handled this data */ - error = 0; - } else { - error = (*so->so_proto->pr_usrreqs->pru_send) - (so, sendflags, top, addr, control, p); - } + error = (*so->so_proto->pr_usrreqs->pru_send) + (so, sendflags, top, addr, control, p); #ifdef __APPLE__ if (flags & MSG_SEND) so->so_temp = NULL; @@ -1900,7 +1896,6 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct protosw *pr = so->so_proto; struct mbuf *nextrecord; int moff, type = 0; - // LP64todo - fix this! 
int orig_resid = uio_resid(uio); struct mbuf *free_list; int delayed_copy_len; @@ -1913,6 +1908,7 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat); socket_lock(so, 1); + so_update_last_owner_locked(so, p); #ifdef MORE_LOCKING_DEBUG if (so->so_usecount == 1) @@ -1936,14 +1932,18 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, if (so->so_flags & SOF_DEFUNCT) { struct sockbuf *sb = &so->so_rcv; + error = ENOTCONN; + SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__, + proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so), error)); /* * This socket should have been disconnected and flushed - * prior to being returned from accept; there should be - * no data on its receive list, so panic otherwise. + * prior to being returned from sodefunct(); there should + * be no data on its receive list, so panic otherwise. */ - sb_empty_assert(sb, __func__); + if (so->so_state & SS_DEFUNCT) + sb_empty_assert(sb, __func__); socket_unlock(so, 1); - return (ENOTCONN); + return (error); } /* @@ -1967,9 +1967,8 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, goto bad; socket_unlock(so, 0); do { - // LP64todo - fix this! error = uiomove(mtod(m, caddr_t), - (int)min(uio_resid(uio), m->m_len), uio); + imin(uio_resid(uio), m->m_len), uio); m = m_free(m); } while (uio_resid(uio) && error == 0 && m); socket_lock(so, 0); @@ -2062,9 +2061,7 @@ restart: * end up with false positives during select() or poll() * which could put the application in a bad state. */ - if (m == NULL && so->so_rcv.sb_cc != 0) - panic("soreceive corrupted so_rcv: m %p cc %lu", - m, so->so_rcv.sb_cc); + SB_MB_CHECK(&so->so_rcv); if (so->so_error) { if (m) @@ -2122,19 +2119,7 @@ restart: goto restart; } dontblock: -#ifndef __APPLE__ - if (uio->uio_procp) - uio->uio_procp->p_stats->p_ru.ru_msgrcv++; -#else /* __APPLE__ */ - /* - * 2207985 - * This should be uio->uio-procp; however, some callers of this - * function use auto variables with stack garbage, and fail to - * fill out the uio structure properly. - */ - if (p) - OSIncrementAtomic(&p->p_stats->p_ru.ru_msgrcv); -#endif /* __APPLE__ */ + OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv); SBLASTRECORDCHK(&so->so_rcv, "soreceive 1"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 1"); nextrecord = m->m_nextpkt; @@ -2188,6 +2173,14 @@ dontblock: goto restart; } socket_lock(so, 0); + /* + * If the socket has been defunct'd, drop it. + */ + if (so->so_flags & SOF_DEFUNCT) { + m_freem(m); + error = ENOTCONN; + goto release; + } /* * Re-adjust the socket receive list and re-enqueue * the record in front of any packets which may have @@ -2244,6 +2237,7 @@ dontblock: struct mbuf *cm = NULL, *cmn; struct mbuf **cme = &cm; struct sockbuf *sb_rcv = &so->so_rcv; + struct mbuf **msgpcm = NULL; /* * Externalizing the control messages would require us to @@ -2256,7 +2250,23 @@ dontblock: do { if (flags & MSG_PEEK) { if (controlp != NULL) { + if (*controlp == NULL) { + msgpcm = controlp; + } *controlp = m_copy(m, 0, m->m_len); + + /* If we failed to allocate an mbuf, + * release any previously allocated + * mbufs for control data. Return + * an error. Keep the mbufs in the + * socket as this is using + * MSG_PEEK flag. 
+ */ + if (*controlp == NULL) { + m_freem(*msgpcm); + error = ENOBUFS; + goto release; + } controlp = &(*controlp)->m_next; } m = m->m_next; @@ -2324,11 +2334,16 @@ dontblock: } cm = cmn; } - orig_resid = 0; - if (sb_rcv->sb_mb != NULL) + /* + * Update the value of nextrecord in case we received new + * records when the socket was unlocked above for + * externalizing SCM_RIGHTS. + */ + if (m != NULL) nextrecord = sb_rcv->sb_mb->m_nextpkt; else - nextrecord = NULL; + nextrecord = sb_rcv->sb_mb; + orig_resid = 0; } if (m != NULL) { @@ -2353,7 +2368,6 @@ dontblock: flags |= MSG_OOB; } else { if (!(flags & MSG_PEEK)) { - so->so_rcv.sb_mb = nextrecord; SB_EMPTY_FIXUP(&so->so_rcv); } } @@ -2387,7 +2401,6 @@ dontblock: flags |= MSG_OOB; } so->so_state &= ~SS_RCVATMARK; - // LP64todo - fix this! len = uio_resid(uio) - delayed_copy_len; if (so->so_oobmark && len > so->so_oobmark - offset) len = so->so_oobmark - offset; @@ -2491,8 +2504,25 @@ dontblock: if (flags & MSG_PEEK) { moff += len; } else { - if (mp) - *mp = m_copym(m, 0, len, M_WAIT); + if (mp != NULL) { + int copy_flag; + + if (flags & MSG_DONTWAIT) + copy_flag = M_DONTWAIT; + else + copy_flag = M_WAIT; + *mp = m_copym(m, 0, len, copy_flag); + if (*mp == NULL) { + /* + * Failed to allocate an mbuf. + * Adjust uio_resid back, it was + * adjusted down by len bytes which + * we didn't copy over + */ + uio_setresid(uio, (uio_resid(uio) + len)); + break; + } + } m->m_data += len; m->m_len -= len; so->so_rcv.sb_cc -= len; @@ -2584,6 +2614,7 @@ dontblock: if (m) { nextrecord = m->m_nextpkt; } + SB_MB_CHECK(&so->so_rcv); } } #ifdef MORE_LOCKING_DEBUG @@ -2631,6 +2662,7 @@ dontblock: } else if (nextrecord->m_nextpkt == NULL) { so->so_rcv.sb_lastrecord = nextrecord; } + SB_MB_CHECK(&so->so_rcv); } SBLASTRECORDCHK(&so->so_rcv, "soreceive 4"); SBLASTMBUFCHK(&so->so_rcv, "soreceive 4"); @@ -2825,18 +2857,7 @@ sorflush(struct socket *so) if (asb.sb_flags & SB_UNIX) sb->sb_flags |= SB_UNIX; if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) { - boolean_t unp = (pr->pr_domain->dom_dispose == unp_dispose); - /* - * Currently AF_UNIX domain uses a global domain mutex; - * unp_dispose() may end up calling soclose() on another - * AF_UNIX socket and therefore the lock must not be held - * across the call. 
- */ - if (unp) - socket_unlock(so, 0); (*pr->pr_domain->dom_dispose)(asb.sb_mb); - if (unp) - socket_lock(so, 0); } sbrelease(&asb); } @@ -2868,7 +2889,7 @@ sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) if (valsize > len) sopt->sopt_valsize = valsize = len; - if (sopt->sopt_p != 0) + if (sopt->sopt_p != kernproc) return (copyin(sopt->sopt_val, buf, valsize)); bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize); @@ -2886,17 +2907,21 @@ static int sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p) { int error; - + if (proc_is64bit(sopt->sopt_p)) { - struct timeval64 tv64; + struct user64_timeval tv64; if (sopt->sopt_valsize < sizeof(tv64)) { return (EINVAL); } sopt->sopt_valsize = sizeof(tv64); - error = copyin(sopt->sopt_val, &tv64, sizeof(tv64)); - if (error != 0) { - return (error); + if (sopt->sopt_p != kernproc) { + error = copyin(sopt->sopt_val, &tv64, sizeof(tv64)); + if (error != 0) + return (error); + } else { + bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64, + sizeof(tv64)); } if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX || tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) { @@ -2905,23 +2930,29 @@ sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p) tv_p->tv_sec = tv64.tv_sec; tv_p->tv_usec = tv64.tv_usec; } else { - if (sopt->sopt_valsize < sizeof(*tv_p)) { + struct user32_timeval tv32; + + if (sopt->sopt_valsize < sizeof(tv32)) { return (EINVAL); } - sopt->sopt_valsize = sizeof(*tv_p); - if (sopt->sopt_p != 0) { - error = copyin(sopt->sopt_val, tv_p, sizeof(*tv_p)); + sopt->sopt_valsize = sizeof(tv32); + if (sopt->sopt_p != kernproc) { + error = copyin(sopt->sopt_val, &tv32, sizeof(tv32)); if (error != 0) { return (error); } } else { - bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), tv_p, - sizeof(*tv_p)); + bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32, + sizeof(tv32)); } - if (tv_p->tv_sec < 0 || tv_p->tv_sec > LONG_MAX - || tv_p->tv_usec < 0 || tv_p->tv_usec >= 1000000) { +#ifndef __LP64__ // K64todo "comparison is always false due to limited range of data type" + if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX + || tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) { return (EDOM); } +#endif + tv_p->tv_sec = tv32.tv_sec; + tv_p->tv_usec = tv32.tv_usec; } return (0); } @@ -2952,15 +2983,15 @@ sosetopt(struct socket *so, struct sockopt *sopt) int error, optval; struct linger l; struct timeval tv; - struct socket_filter_entry *filter; - int filtered = 0; #if CONFIG_MACF_SOCKET struct mac extmac; #endif /* MAC_SOCKET */ socket_lock(so, 1); + if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) - == (SS_CANTRCVMORE | SS_CANTSENDMORE)) { + == (SS_CANTRCVMORE | SS_CANTSENDMORE) && + (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) { /* the socket has been shutdown, no more sockopt's */ error = EINVAL; goto bad; @@ -2970,29 +3001,11 @@ sosetopt(struct socket *so, struct sockopt *sopt) sopt->sopt_dir = SOPT_SET; } - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_setoption) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_setoption(filter->sfe_cookie, so, sopt); - } - } - - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - - if (error) { - if (error == EJUSTRETURN) - error = 0; - goto bad; - } + error = sflt_setsockopt(so, sopt); + if (error) { + if (error == EJUSTRETURN) + error = 0; + goto bad; } error = 0; @@ -3028,6 +3041,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) case SO_REUSEPORT: case SO_OOBINLINE: case SO_TIMESTAMP: + case SO_TIMESTAMP_MONOTONIC: #ifdef __APPLE__ case SO_DONTTRUNC: case SO_WANTMORE: @@ -3064,17 +3078,18 @@ sosetopt(struct socket *so, struct sockopt *sopt) switch (sopt->sopt_name) { case SO_SNDBUF: case SO_RCVBUF: - if (sbreserve(sopt->sopt_name == SO_SNDBUF ? - &so->so_snd : &so->so_rcv, - (u_long) optval) == 0) { + { + struct sockbuf *sb = (sopt->sopt_name == SO_SNDBUF) ? + &so->so_snd : &so->so_rcv; + if (sbreserve(sb, (u_int32_t) optval) == 0) { error = ENOBUFS; goto bad; } - if (sopt->sopt_name == SO_SNDBUF) - so->so_snd.sb_flags |= SB_USRSIZE; - else - so->so_rcv.sb_flags |= SB_USRSIZE; + sb->sb_flags |= SB_USRSIZE; + sb->sb_flags &= ~SB_AUTOSIZE; + sb->sb_idealsize = (u_int32_t)optval; break; + } /* * Make sure the low-water is never greater than @@ -3118,8 +3133,7 @@ sosetopt(struct socket *so, struct sockopt *sopt) if (error) goto bad; - error = sflt_attach_private(so, NULL, - nke.nke_handle, 1); + error = sflt_attach_internal(so, nke.nke_handle); break; } @@ -3212,6 +3226,144 @@ sosetopt(struct socket *so, struct sockopt *sopt) break; #endif + case SO_RANDOMPORT: + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error) + goto bad; + if (optval) + so->so_flags |= SOF_BINDRANDOMPORT; + else + so->so_flags &= ~SOF_BINDRANDOMPORT; + break; + + case SO_NP_EXTENSIONS: { + struct so_np_extensions sonpx; + + error = sooptcopyin(sopt, &sonpx, sizeof(sonpx), sizeof(sonpx)); + if (error) + goto bad; + if (sonpx.npx_mask & ~SONPX_MASK_VALID) { + error = EINVAL; + goto bad; + } + /* + * Only one bit defined for now + */ + if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) { + if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) + so->so_flags |= SOF_NPX_SETOPTSHUT; + else + so->so_flags &= ~SOF_NPX_SETOPTSHUT; + } + break; + } + + case SO_TRAFFIC_CLASS: { + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error) + goto bad; + error = so_set_traffic_class(so, optval); + if (error) + goto bad; + break; + } + + case SO_RECV_TRAFFIC_CLASS: { + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error) + goto bad; + if (optval == 0) + so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS; + else + so->so_flags |= SOF_RECV_TRAFFIC_CLASS; + break; + } + + case SO_TRAFFIC_CLASS_DBG: { + struct so_tcdbg so_tcdbg; + + error = sooptcopyin(sopt, &so_tcdbg, + sizeof (struct so_tcdbg), sizeof (struct so_tcdbg)); + if (error) + goto bad; + error = so_set_tcdbg(so, &so_tcdbg); + if (error) + goto bad; + break; + } + + case SO_PRIVILEGED_TRAFFIC_CLASS: + error = priv_check_cred(kauth_cred_get(), + PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0); + if (error) + goto bad; + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error) + goto bad; + if (optval == 0) + so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS; + else + so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS; + break; + + case SO_DEFUNCTOK: + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error != 0 || (so->so_flags & SOF_DEFUNCT)) { + if (error == 0) + error = EBADF; + goto bad; + } + /* + * Any process can set SO_DEFUNCTOK 
(clear + * SOF_NODEFUNCT), but only root can clear + * SO_DEFUNCTOK (set SOF_NODEFUNCT). + */ + if (optval == 0 && + kauth_cred_issuser(kauth_cred_get()) == 0) { + error = EPERM; + goto bad; + } + if (optval) + so->so_flags &= ~SOF_NODEFUNCT; + else + so->so_flags |= SOF_NODEFUNCT; + + SODEFUNCTLOG(("%s[%d]: so %p [%d,%d] is now marked as " + "%seligible for defunct\n", __func__, + proc_selfpid(), so, INP_SOCKAF(so), + INP_SOCKTYPE(so), + (so->so_flags & SOF_NODEFUNCT) ? "not " : "")); + break; + + case SO_ISDEFUNCT: + /* This option is not settable */ + error = EINVAL; + break; + + case SO_OPPORTUNISTIC: + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error == 0) + error = so_set_opportunistic(so, optval); + break; + + case SO_FLUSH: + /* This option is handled by lower layer(s) */ + error = 0; + break; + + case SO_RECV_ANYIF: + error = sooptcopyin(sopt, &optval, sizeof (optval), + sizeof (optval)); + if (error == 0) + error = so_set_recv_anyif(so, optval); + break; + default: error = ENOPROTOOPT; break; @@ -3246,7 +3398,7 @@ sooptcopyout(struct sockopt *sopt, void *buf, size_t len) valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != USER_ADDR_NULL) { - if (sopt->sopt_p != 0) + if (sopt->sopt_p != kernproc) error = copyout(buf, sopt->sopt_val, valsize); else bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize); @@ -3259,24 +3411,27 @@ sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p) { int error; size_t len; - struct timeval64 tv64; + struct user64_timeval tv64; + struct user32_timeval tv32; const void * val; size_t valsize; - + error = 0; if (proc_is64bit(sopt->sopt_p)) { - len = sizeof(struct timeval64); + len = sizeof(tv64); tv64.tv_sec = tv_p->tv_sec; tv64.tv_usec = tv_p->tv_usec; val = &tv64; } else { - len = sizeof(struct timeval); - val = tv_p; + len = sizeof(tv32); + tv32.tv_sec = tv_p->tv_sec; + tv32.tv_usec = tv_p->tv_usec; + val = &tv32; } valsize = min(len, sopt->sopt_valsize); sopt->sopt_valsize = valsize; if (sopt->sopt_val != USER_ADDR_NULL) { - if (sopt->sopt_p != 0) + if (sopt->sopt_p != kernproc) error = copyout(val, sopt->sopt_val, valsize); else bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize); @@ -3297,8 +3452,6 @@ sogetopt(struct socket *so, struct sockopt *sopt) int error, optval; struct linger l; struct timeval tv; - struct socket_filter_entry *filter; - int filtered = 0; #if CONFIG_MACF_SOCKET struct mac extmac; #endif /* MAC_SOCKET */ @@ -3309,31 +3462,14 @@ sogetopt(struct socket *so, struct sockopt *sopt) socket_lock(so, 1); - error = 0; - for (filter = so->so_filt; filter && (error == 0); - filter = filter->sfe_next_onsocket) { - if (filter->sfe_filter->sf_filter.sf_getoption) { - if (filtered == 0) { - filtered = 1; - sflt_use(so); - socket_unlock(so, 0); - } - error = filter->sfe_filter->sf_filter. 
- sf_getoption(filter->sfe_cookie, so, sopt); - } - } - if (filtered != 0) { - socket_lock(so, 0); - sflt_unuse(so); - - if (error) { - if (error == EJUSTRETURN) - error = 0; - socket_unlock(so, 1); - return (error); - } + error = sflt_getsockopt(so, sopt); + if (error) { + if (error == EJUSTRETURN) + error = 0; + socket_unlock(so, 1); + return (error); } - + error = 0; if (sopt->sopt_level != SOL_SOCKET) { if (so->so_proto && so->so_proto->pr_ctloutput) { @@ -3363,6 +3499,7 @@ sogetopt(struct socket *so, struct sockopt *sopt) case SO_BROADCAST: case SO_OOBINLINE: case SO_TIMESTAMP: + case SO_TIMESTAMP_MONOTONIC: #ifdef __APPLE__ case SO_DONTTRUNC: case SO_WANTMORE: @@ -3485,6 +3622,60 @@ integer: optval = (so->so_flags & SOF_UPCALLCLOSEWAIT); goto integer; #endif + case SO_RANDOMPORT: + optval = (so->so_flags & SOF_BINDRANDOMPORT); + goto integer; + + case SO_NP_EXTENSIONS: { + struct so_np_extensions sonpx; + + sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ? SONPX_SETOPTSHUT : 0; + sonpx.npx_mask = SONPX_MASK_VALID; + + error = sooptcopyout(sopt, &sonpx, sizeof(struct so_np_extensions)); + break; + } + + case SO_TRAFFIC_CLASS: + optval = so->so_traffic_class; + goto integer; + + case SO_RECV_TRAFFIC_CLASS: + optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS); + goto integer; + + case SO_TRAFFIC_CLASS_STATS: + error = sooptcopyout(sopt, &so->so_tc_stats, sizeof(so->so_tc_stats)); + break; + + case SO_TRAFFIC_CLASS_DBG: + error = sogetopt_tcdbg(so, sopt); + break; + + case SO_PRIVILEGED_TRAFFIC_CLASS: + optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS); + goto integer; + + case SO_DEFUNCTOK: + optval = !(so->so_flags & SOF_NODEFUNCT); + goto integer; + + case SO_ISDEFUNCT: + optval = (so->so_flags & SOF_DEFUNCT); + goto integer; + + case SO_OPPORTUNISTIC: + optval = so_get_opportunistic(so); + goto integer; + + case SO_FLUSH: + /* This option is not gettable */ + error = EINVAL; + break; + + case SO_RECV_ANYIF: + optval = so_get_recv_anyif(so); + goto integer; default: error = ENOPROTOOPT; @@ -3494,22 +3685,26 @@ integer: return (error); } } - -/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */ +/* The size limits on our soopt_getm is different from that on FreeBSD. + * We limit the size of options to MCLBYTES. This will have to change + * if we need to define options that need more space than MCLBYTES. + */ int soopt_getm(struct sockopt *sopt, struct mbuf **mp) { struct mbuf *m, *m_prev; int sopt_size = sopt->sopt_valsize; + int how; - if (sopt_size > MAX_SOOPTGETM_SIZE) + if (sopt_size <= 0 || sopt_size > MCLBYTES) return (EMSGSIZE); - MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA); + how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT; + MGET(m, how, MT_DATA); if (m == 0) return (ENOBUFS); if (sopt_size > MLEN) { - MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT); + MCLGET(m, how); if ((m->m_flags & M_EXT) == 0) { m_free(m); return (ENOBUFS); @@ -3522,16 +3717,17 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp) *mp = m; m_prev = m; - while (sopt_size) { - MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA); + while (sopt_size > 0) { + MGET(m, how, MT_DATA); if (m == 0) { m_freem(*mp); return (ENOBUFS); } if (sopt_size > MLEN) { - MCLGET(m, sopt->sopt_p ? 
M_WAIT : M_DONTWAIT);
+			MCLGET(m, how);
 			if ((m->m_flags & M_EXT) == 0) {
 				m_freem(*mp);
+				m_freem(m);
 				return (ENOBUFS);
 			}
 			m->m_len = min(MCLBYTES, sopt_size);
@@ -3545,7 +3741,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp)
 	return (0);
 }
 
-/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
+/* copyin sopt data into mbuf chain */
 int
 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
 {
@@ -3554,7 +3750,7 @@ soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
 	if (sopt->sopt_val == USER_ADDR_NULL)
 		return (0);
 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
-		if (sopt->sopt_p != NULL) {
+		if (sopt->sopt_p != kernproc) {
 			int error;
 
 			error = copyin(sopt->sopt_val, mtod(m, char *),
@@ -3576,7 +3772,7 @@ soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
 	return (0);
 }
 
-/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
+/* copyout mbuf chain data into soopt */
 int
 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
 {
@@ -3586,7 +3782,7 @@ soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
 	if (sopt->sopt_val == USER_ADDR_NULL)
 		return (0);
 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
-		if (sopt->sopt_p != NULL) {
+		if (sopt->sopt_p != kernproc) {
 			int error;
 
 			error = copyout(mtod(m, char *), sopt->sopt_val,
@@ -3673,7 +3869,7 @@ soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
     __unused struct proc *p)
 {
 	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
-	struct sockbuf *sb;
+	struct klist *skl;
 
 	socket_lock(so, 1);
 
@@ -3686,23 +3882,38 @@ soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
-		if (so->so_options & SO_ACCEPTCONN)
-			kn->kn_fop = &solisten_filtops;
-		else
-			kn->kn_fop = &soread_filtops;
-		sb = &so->so_rcv;
+		kn->kn_fop = &soread_filtops;
+		skl = &so->so_rcv.sb_sel.si_note;
 		break;
 	case EVFILT_WRITE:
 		kn->kn_fop = &sowrite_filtops;
-		sb = &so->so_snd;
+		skl = &so->so_snd.sb_sel.si_note;
+		break;
+	case EVFILT_SOCK:
+		kn->kn_fop = &sock_filtops;
+		skl = &so->so_klist;
 		break;
 	default:
 		socket_unlock(so, 1);
 		return (1);
 	}
 
-	if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn))
-		sb->sb_flags |= SB_KNOTE;
+	if (KNOTE_ATTACH(skl, kn)) {
+		switch (kn->kn_filter) {
+		case EVFILT_READ:
+			so->so_rcv.sb_flags |= SB_KNOTE;
+			break;
+		case EVFILT_WRITE:
+			so->so_snd.sb_flags |= SB_KNOTE;
+			break;
+		case EVFILT_SOCK:
+			so->so_flags |= SOF_KNOTE;
+			break;
+		default:
+			socket_unlock(so, 1);
+			return (1);
+		}
+	}
 	socket_unlock(so, 1);
 	return (0);
 }
@@ -3728,6 +3939,25 @@ filt_soread(struct knote *kn, long hint)
 	if ((hint & SO_FILT_HINT_LOCKED) == 0)
 		socket_lock(so, 1);
 
+	if (so->so_options & SO_ACCEPTCONN) {
+		int isempty;
+
+		/*
+		 * Radar 6615193: handle the listen case dynamically for the
+		 * kqueue read filter.  This allows listen() to be called
+		 * after the EVFILT_READ knote has been registered.
+		 */
+
+		kn->kn_data = so->so_qlen;
+		isempty = !
TAILQ_EMPTY(&so->so_comp); + + if ((hint & SO_FILT_HINT_LOCKED) == 0) + socket_unlock(so, 1); + + return (isempty); + } + + /* socket isn't a listener */ + kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; if (so->so_oobmark) { @@ -3769,12 +3999,19 @@ filt_soread(struct knote *kn, long hint) return (1); } + int64_t lowwat = so->so_rcv.sb_lowat; + if (kn->kn_sfflags & NOTE_LOWAT) + { + if (kn->kn_sdata > so->so_rcv.sb_hiwat) + lowwat = so->so_rcv.sb_hiwat; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; + } + if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_unlock(so, 1); - - return ((kn->kn_flags & EV_OOBAND) || - kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ? - kn->kn_sdata : so->so_rcv.sb_lowat)); + + return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat); } static void @@ -3789,11 +4026,25 @@ filt_sowdetach(struct knote *kn) socket_unlock(so, 1); } +int +so_wait_for_if_feedback(struct socket *so) +{ + if ((so->so_proto->pr_domain->dom_family == AF_INET || + so->so_proto->pr_domain->dom_family == AF_INET6) && + (so->so_state & SS_ISCONNECTED)) { + struct inpcb *inp = sotoinpcb(so); + if (INP_WAIT_FOR_IF_FEEDBACK(inp)) + return (1); + } + return (0); +} + /*ARGSUSED*/ static int filt_sowrite(struct knote *kn, long hint) { struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + int ret = 0; if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_lock(so, 1); @@ -3802,51 +4053,165 @@ filt_sowrite(struct knote *kn, long hint) if (so->so_state & SS_CANTSENDMORE) { kn->kn_flags |= EV_EOF; kn->kn_fflags = so->so_error; - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); - return (1); + ret = 1; + goto out; } if (so->so_error) { /* temporary udp error */ - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); - return (1); + ret = 1; + goto out; } if (((so->so_state & SS_ISCONNECTED) == 0) && (so->so_proto->pr_flags & PR_CONNREQUIRED)) { - if ((hint & SO_FILT_HINT_LOCKED) == 0) - socket_unlock(so, 1); - return (0); + ret = 0; + goto out; + } + int64_t lowwat = so->so_snd.sb_lowat; + if (kn->kn_sfflags & NOTE_LOWAT) + { + if (kn->kn_sdata > so->so_snd.sb_hiwat) + lowwat = so->so_snd.sb_hiwat; + else if (kn->kn_sdata > lowwat) + lowwat = kn->kn_sdata; } + if (kn->kn_data >= lowwat) { + if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) { + ret = tcp_notsent_lowat_check(so); + } else { + ret = 1; + } + } + if (so_wait_for_if_feedback(so)) + ret = 0; +out: if ((hint & SO_FILT_HINT_LOCKED) == 0) socket_unlock(so, 1); - if (kn->kn_sfflags & NOTE_LOWAT) - return (kn->kn_data >= kn->kn_sdata); - return (kn->kn_data >= so->so_snd.sb_lowat); + return(ret); +} + +static void +filt_sockdetach(struct knote *kn) +{ + struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; + socket_lock(so, 1); + + if ((so->so_flags & SOF_KNOTE) != 0) + if (KNOTE_DETACH(&so->so_klist, kn)) + so->so_flags &= ~SOF_KNOTE; + socket_unlock(so, 1); } -/*ARGSUSED*/ static int -filt_solisten(struct knote *kn, long hint) +filt_sockev(struct knote *kn, long hint) { + int ret = 0, locked = 0; struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data; - int isempty; - if ((hint & SO_FILT_HINT_LOCKED) == 0) + if ((hint & SO_FILT_HINT_LOCKED) == 0) { socket_lock(so, 1); - kn->kn_data = so->so_qlen; - isempty = ! 
TAILQ_EMPTY(&so->so_comp); - if ((hint & SO_FILT_HINT_LOCKED) == 0) + locked = 1; + } + + switch (hint & SO_FILT_HINT_EV) { + case SO_FILT_HINT_CONNRESET: + if (kn->kn_sfflags & NOTE_CONNRESET) + kn->kn_fflags |= NOTE_CONNRESET; + break; + case SO_FILT_HINT_TIMEOUT: + if (kn->kn_sfflags & NOTE_TIMEOUT) + kn->kn_fflags |= NOTE_TIMEOUT; + break; + case SO_FILT_HINT_NOSRCADDR: + if (kn->kn_sfflags & NOTE_NOSRCADDR) + kn->kn_fflags |= NOTE_NOSRCADDR; + break; + case SO_FILT_HINT_IFDENIED: + if ((kn->kn_sfflags & NOTE_IFDENIED)) + kn->kn_fflags |= NOTE_IFDENIED; + break; + case SO_FILT_HINT_KEEPALIVE: + if (kn->kn_sfflags & NOTE_KEEPALIVE) + kn->kn_fflags |= NOTE_KEEPALIVE; + } + + if ((kn->kn_sfflags & NOTE_READCLOSED) && + (so->so_state & SS_CANTRCVMORE)) + kn->kn_fflags |= NOTE_READCLOSED; + + if ((kn->kn_sfflags & NOTE_WRITECLOSED) && + (so->so_state & SS_CANTSENDMORE)) + kn->kn_fflags |= NOTE_WRITECLOSED; + + if ((kn->kn_sfflags & NOTE_SUSPEND) && + ((hint & SO_FILT_HINT_SUSPEND) || + (so->so_flags & SOF_SUSPENDED))) { + kn->kn_fflags &= + ~(NOTE_SUSPEND | NOTE_RESUME); + kn->kn_fflags |= NOTE_SUSPEND; + } + + if ((kn->kn_sfflags & NOTE_RESUME) && + ((hint & SO_FILT_HINT_RESUME) || + (so->so_flags & SOF_SUSPENDED) == 0)) { + kn->kn_fflags &= + ~(NOTE_SUSPEND | NOTE_RESUME); + kn->kn_fflags |= NOTE_RESUME; + } + + if (so->so_error != 0) { + ret = 1; + kn->kn_data = so->so_error; + kn->kn_flags |= EV_EOF; + } else { + get_sockev_state(so, (u_int32_t *)&(kn->kn_data)); + } + + if (kn->kn_fflags != 0) + ret = 1; + + if (locked) socket_unlock(so, 1); - return (isempty); + + return(ret); } +void +get_sockev_state(struct socket *so, u_int32_t *statep) { + u_int32_t state = *(statep); + + if (so->so_state & SS_ISCONNECTED) + state |= SOCKEV_CONNECTED; + else + state &= ~(SOCKEV_CONNECTED); + state |= ((so->so_state & SS_ISDISCONNECTED) ? 
+ SOCKEV_DISCONNECTED : 0); + *(statep) = state; + return; +} + +#define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof(void *)) + 1) + 1) + +__private_extern__ const char * solockhistory_nr(struct socket *so) +{ + size_t n = 0; + int i; + static char lock_history_str[SO_LOCK_HISTORY_STR_LEN]; + + bzero(lock_history_str, sizeof(lock_history_str)); + for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) { + n += snprintf(lock_history_str + n, SO_LOCK_HISTORY_STR_LEN - n, "%lx:%lx ", + (uintptr_t) so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX], + (uintptr_t) so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]); + } + return lock_history_str; +} int socket_lock(struct socket *so, int refcount) { - int error = 0, lr_saved; + int error = 0; + void *lr_saved; - lr_saved = (unsigned int) __builtin_return_address(0); + lr_saved = __builtin_return_address(0); if (so->so_proto->pr_lock) { error = (*so->so_proto->pr_lock)(so, refcount, lr_saved); @@ -3858,7 +4223,7 @@ socket_lock(struct socket *so, int refcount) lck_mtx_lock(so->so_proto->pr_domain->dom_mtx); if (refcount) so->so_usecount++; - so->lock_lr[so->next_lock_lr] = (u_int32_t)lr_saved; + so->lock_lr[so->next_lock_lr] = lr_saved; so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX; } @@ -3868,10 +4233,11 @@ socket_lock(struct socket *so, int refcount) int socket_unlock(struct socket *so, int refcount) { - int error = 0, lr_saved; + int error = 0; + void *lr_saved; lck_mtx_t *mutex_held; - lr_saved = (unsigned int) __builtin_return_address(0); + lr_saved = __builtin_return_address(0); if (so->so_proto == NULL) panic("socket_unlock null so_proto so=%p\n", so); @@ -3883,13 +4249,16 @@ socket_unlock(struct socket *so, int refcount) #ifdef MORE_LOCKING_DEBUG lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED); #endif - so->unlock_lr[so->next_unlock_lr] = (u_int32_t)lr_saved; + so->unlock_lr[so->next_unlock_lr] = lr_saved; so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX; if (refcount) { if (so->so_usecount <= 0) - panic("socket_unlock: bad refcount so=%p " - "value=%d\n", so, so->so_usecount); + panic("socket_unlock: bad refcount=%d so=%p (%d, %d, %d) lrh=%s", + so->so_usecount, so, so->so_proto->pr_domain->dom_family, + so->so_type, so->so_proto->pr_protocol, + solockhistory_nr(so)); + so->so_usecount--; if (so->so_usecount == 0) { sofreelastref(so, 1); @@ -3943,3 +4312,157 @@ somultipages(struct socket *so, boolean_t set) else so->so_flags &= ~SOF_MULTIPAGES; } + +int +so_isdstlocal(struct socket *so) { + + struct inpcb *inp = (struct inpcb *)so->so_pcb; + + if (so->so_proto->pr_domain->dom_family == AF_INET) { + return inaddr_local(inp->inp_faddr); + } else if (so->so_proto->pr_domain->dom_family == AF_INET6) { + return in6addr_local(&inp->in6p_faddr); + } + return 0; +} + +int +sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) +{ + int err = 0, defunct; + + defunct = (so->so_flags & SOF_DEFUNCT); + if (defunct) { + if (!(so->so_snd.sb_flags & so->so_rcv.sb_flags & SB_DROP)) + panic("%s: SB_DROP not set", __func__); + goto done; + } + + if (so->so_flags & SOF_NODEFUNCT) { + if (noforce) { + err = EOPNOTSUPP; + SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p " + "[%d,%d] is not eligible for defunct (%d)\n", + __func__, proc_selfpid(), proc_pid(p), level, so, + INP_SOCKAF(so), INP_SOCKTYPE(so), err)); + return (err); + } + so->so_flags &= ~SOF_NODEFUNCT; + SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] " + "defunct by force\n", __func__, proc_selfpid(), proc_pid(p), + level, 
so, INP_SOCKAF(so), INP_SOCKTYPE(so)));
+	}
+
+	so->so_flags |= SOF_DEFUNCT;
+	/* Prevent further data from being appended to the socket buffers */
+	so->so_snd.sb_flags |= SB_DROP;
+	so->so_rcv.sb_flags |= SB_DROP;
+
+done:
+	SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] %s "
+	    "defunct\n", __func__, proc_selfpid(), proc_pid(p), level, so,
+	    INP_SOCKAF(so), INP_SOCKTYPE(so),
+	    defunct ? "is already" : "marked as"));
+
+	return (err);
+}
+
+int
+sodefunct(struct proc *p, struct socket *so, int level)
+{
+	struct sockbuf *rcv, *snd;
+
+	if (!(so->so_flags & SOF_DEFUNCT))
+		panic("%s improperly called", __func__);
+
+	if (so->so_state & SS_DEFUNCT)
+		goto done;
+
+	rcv = &so->so_rcv;
+	snd = &so->so_snd;
+
+	SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] is now "
+	    "defunct [rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
+	    __func__, proc_selfpid(), proc_pid(p), level, so,
+	    INP_SOCKAF(so), INP_SOCKTYPE(so),
+	    (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags,
+	    (uint16_t)rcv->sb_flags, (uint16_t)snd->sb_flags));
+
+	/*
+	 * Unwedge threads blocked on sbwait() and sb_lock().
+	 */
+	sbwakeup(rcv);
+	sbwakeup(snd);
+
+	if (rcv->sb_flags & SB_LOCK)
+		sbunlock(rcv, 1);
+	if (snd->sb_flags & SB_LOCK)
+		sbunlock(snd, 1);
+
+	/*
+	 * Flush the buffers and disconnect.  We explicitly call shutdown
+	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
+	 * states are set for the socket.  This would also flush out data
+	 * hanging off the receive list of this socket.
+	 */
+	(void) soshutdownlock(so, SHUT_RD);
+	(void) soshutdownlock(so, SHUT_WR);
+	(void) sodisconnectlocked(so);
+
+	/*
+	 * Explicitly handle connectionless-protocol disconnection
+	 * and release any remaining data in the socket buffers.
+	 */
+	if (!(so->so_state & SS_ISDISCONNECTED))
+		(void) soisdisconnected(so);
+
+	if (so->so_error == 0)
+		so->so_error = EBADF;
+
+	if (rcv->sb_cc != 0)
+		sbrelease(rcv);
+	if (snd->sb_cc != 0)
+		sbrelease(snd);
+
+	so->so_state |= SS_DEFUNCT;
+
+done:
+	return (0);
+}
+
+__private_extern__ int
+so_set_recv_anyif(struct socket *so, int optval)
+{
+	int ret = 0;
+
+#if INET6
+	if (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) {
+#else
+	if (INP_SOCKAF(so) == AF_INET) {
+#endif /* !INET6 */
+		if (optval)
+			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
+		else
+			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
+	} else {
+		ret = EPROTONOSUPPORT;
+	}
+
+	return (ret);
+}
+
+__private_extern__ int
+so_get_recv_anyif(struct socket *so)
+{
+	int ret = 0;
+
+#if INET6
+	if (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) {
+#else
+	if (INP_SOCKAF(so) == AF_INET) {
+#endif /* !INET6 */
+		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
+	}
+
+	return (ret);
+}
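
A usage sketch for the EVFILT_SOCK filter wired up above (sock_filtops/filt_sockev). This is illustrative only: EVFILT_SOCK and its NOTE_* constants are Darwin-specific and not part of the public API in every release, so treat their availability as an assumption.

#include <sys/event.h>
#include <stdio.h>

/*
 * Watch a connected socket for reset/close transitions.  kq is a
 * kqueue(2) descriptor and sockfd the socket of interest; the NOTE_*
 * bits requested here correspond to the SO_FILT_HINT_* hints that
 * filt_sockev() translates into kn_fflags.
 */
static int
watch_socket(int kq, int sockfd)
{
	struct kevent kev, out;

	EV_SET(&kev, sockfd, EVFILT_SOCK, EV_ADD | EV_CLEAR,
	    NOTE_CONNRESET | NOTE_READCLOSED | NOTE_WRITECLOSED, 0, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (-1);

	/* Block until filt_sockev() reports one of the requested events. */
	if (kevent(kq, NULL, 0, &out, 1, NULL) == 1)
		printf("fflags=0x%x data=0x%lx\n",
		    (unsigned int)out.fflags, (unsigned long)out.data);
	return (0);
}

Per filt_sockev() above, the data field carries the get_sockev_state() bits (SOCKEV_CONNECTED/SOCKEV_DISCONNECTED) unless so_error is pending, in which case the error is reported in data and EV_EOF is set.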