]> git.saurik.com Git - apple/xnu.git/blobdiff - bsd/kern/uipc_socket.c
xnu-2050.48.11.tar.gz
[apple/xnu.git] / bsd / kern / uipc_socket.c
index fa8ae828f0154c85dfb9462803e2d61d8c893712..af4b4fbe13f304cc1c7b78ec439a4f3a76e5ae4c 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1998-2008 Apple Inc. All rights reserved.
+ * Copyright (c) 1998-2012 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  * 
 #include <sys/ev.h>
 #include <sys/kdebug.h>
 #include <sys/un.h>
+#include <sys/user.h>
+#include <sys/priv.h>
 #include <net/route.h>
+#include <net/ntstat.h>
 #include <netinet/in.h>
 #include <netinet/in_pcb.h>
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
 #include <kern/zalloc.h>
 #include <kern/locks.h>
 #include <machine/limits.h>
 #include <libkern/OSAtomic.h>
 #include <pexpert/pexpert.h>
 #include <kern/assert.h>
+#include <kern/task.h>
+#include <sys/kpi_mbuf.h>
+#include <sys/mcache.h>
 
 #if CONFIG_MACF
 #include <security/mac.h>
 #include <security/mac_framework.h>
 #endif /* MAC */
 
+
 int                    so_cache_hw = 0;
 int                    so_cache_timeouts = 0;
 int                    so_cache_max_freed = 0;
@@ -129,6 +138,8 @@ static void filt_sordetach(struct knote *kn);
 static int     filt_soread(struct knote *kn, long hint);
 static void    filt_sowdetach(struct knote *kn);
 static int     filt_sowrite(struct knote *kn, long hint);
+static void    filt_sockdetach(struct knote *kn);
+static int     filt_sockev(struct knote *kn, long hint);
 
 static int
 sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p);
@@ -146,6 +157,11 @@ static struct filterops sowrite_filtops = {
         .f_detach = filt_sowdetach,
         .f_event = filt_sowrite,
 };
+static struct filterops sock_filtops = {       /* knote filter ops backed by filt_sockdetach()/filt_sockev() */
+       .f_isfd = 1,    /* NOTE(review): presumably "ident is a file descriptor" -- confirm against struct filterops */
+       .f_detach = filt_sockdetach,
+       .f_event = filt_sockev,
+};
 
 #define        EVEN_MORE_LOCKING_DEBUG 0
 int socket_debug = 0;
@@ -169,15 +185,15 @@ MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 SYSCTL_DECL(_kern_ipc);
 
 int somaxconn = SOMAXCONN;
-SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
 
 /* Should we get a maximum also ??? */
 static int sosendmaxchain = 65536;
 static int sosendminchain = 16384;
 static int sorecvmincopy  = 16384;
-SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain,
+SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain,
     0, "");
-SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy,
+SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy,
     0, "");
 
 /*
@@ -185,7 +201,7 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy,
  * the socket is marked with SOF_MULTIPAGES; see below.
  */
 int sosendjcl = 1;
-SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, "");
+SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
 
 /*
  * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
@@ -199,9 +215,17 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, "");
  * capable.  Set this to 1 only for testing/debugging purposes.
  */
 int sosendjcl_ignore_capab = 0;
-SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW,
+SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW | CTLFLAG_LOCKED,
     &sosendjcl_ignore_capab, 0, "");
 
+int sodefunctlog = 0;
+SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &sodefunctlog, 0, "");
+
+int sothrottlelog = 0;
+SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &sothrottlelog, 0, "");
+
 /*
  * Socket operation routines.
  * These routines are called by the routines in
@@ -213,6 +237,7 @@ SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW,
 /* sys_generic.c */
 extern void postevent(struct socket *, struct sockbuf *, int);
 extern void evsofree(struct socket *);
+extern int tcp_notsent_lowat_check(struct socket *so);
 
 /* TODO: these should be in header file */
 extern int get_inpcb_str_size(void);
@@ -234,6 +259,14 @@ static void so_cache_timer(void *);
 void soclose_wait_locked(struct socket *so);
 int so_isdstlocal(struct socket *so);
 
+/*
+ * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
+ * setting the DSCP code on the packet based on the service class; see
+ * <rdar://problem/11277343> for details.
+ */
+__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
+SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
+    &sotcdb, 0, "");
 
 void
 socketinit(void)
@@ -272,6 +305,8 @@ socketinit(void)
            get_inpcb_str_size() + 4 + get_tcp_str_size());
 
        so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone");
+       zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
+       zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
 #if TEMPDEBUG
        printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);
 #endif
@@ -280,6 +315,12 @@ socketinit(void)
        so_cache_zone_element_size = str_size;
 
        sflt_init();
+
+       _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX);
+
+       socket_tclass_init();
+
+       socket_flowadv_init();
 }
 
 static void
@@ -344,7 +385,7 @@ cached_sock_alloc(struct socket **so, int waitok)
 
                offset = ALIGN(offset);
 
-               ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb =
+               ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
                    (caddr_t)offset;
 #if TEMPDEBUG
                kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n",
@@ -394,6 +435,24 @@ cached_sock_free(struct socket *so)
 #endif
 }
 
+/*
+ * Record "self" (or the current process when "self" is NULL) as the
+ * last owner of "so".  Sockets whose last_pid is 0 are left untouched.
+ * Visible callers invoke this right after socket_lock().
+ */
+static void
+so_update_last_owner_locked(struct socket *so, proc_t self)
+{
+       if (so->last_pid == 0)
+               return;
+       if (self == NULL)
+               self = current_proc();
+       if (self == NULL)
+               return;
+       so->last_upid = proc_uniqueid(self);
+       so->last_pid = proc_pid(self);
+}
+
 static void
 so_cache_timer(__unused void *dummy)
 {
@@ -484,6 +543,7 @@ socreate(int dom, struct socket **aso, int type, int proto)
        register struct protosw *prp;
        register struct socket *so;
        register int error = 0;
+
 #if TCPDEBUG
        extern int tcpconsdebug;
 #endif
@@ -512,8 +572,10 @@ socreate(int dom, struct socket **aso, int type, int proto)
        TAILQ_INIT(&so->so_incomp);
        TAILQ_INIT(&so->so_comp);
        so->so_type = type;
+       so->last_upid = proc_uniqueid(p);
+       so->last_pid = proc_pid(p);
 
-       so->so_uid = kauth_cred_getuid(kauth_cred_get());
+       so->so_cred = kauth_cred_proc_ref(p);
        if (!suser(kauth_cred_get(), NULL))
                so->so_state = SS_PRIV;
 
@@ -559,6 +621,27 @@ socreate(int dom, struct socket **aso, int type, int proto)
                so->so_options |= SO_DEBUG;
 #endif
 #endif
+       so_set_default_traffic_class(so);
+       /*
+        * If this is a background thread/task, mark the socket as such.
+        */
+       if (proc_get_self_isbackground() != 0) {
+               socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
+               so->so_background_thread = current_thread();
+       }
+
+       switch (dom) {
+       /*
+        * By default, Unix domain and system sockets are not eligible to be made defunct.
+        */
+       case PF_LOCAL:
+       case PF_SYSTEM:
+               so->so_flags |= SOF_NODEFUNCT;
+               break;
+       default:
+               break;
+       }
+
        *aso = so;
        return (0);
 }
@@ -590,40 +673,25 @@ sobind(struct socket *so, struct sockaddr *nam)
 {
        struct proc *p = current_proc();
        int error = 0;
-       struct socket_filter_entry *filter;
-       int filtered = 0;
 
        socket_lock(so, 1);
+       VERIFY(so->so_usecount > 1);    
+       so_update_last_owner_locked(so, p);
 
        /*
-        * If this is a bind request on a previously-accepted socket
-        * that has been marked as inactive, reject it now before
-        * we go any further.
+        * If this is a bind request on a socket that has been marked
+        * as inactive, reject it now before we go any further.
         */
        if (so->so_flags & SOF_DEFUNCT) {
                error = EINVAL;
+               SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
+                   __func__, proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so),
+                   error));
                goto out;
        }
 
        /* Socket filter */
-       error = 0;
-       for (filter = so->so_filt; filter && (error == 0);
-           filter = filter->sfe_next_onsocket) {
-               if (filter->sfe_filter->sf_filter.sf_bind) {
-                       if (filtered == 0) {
-                               filtered = 1;
-                               sflt_use(so);
-                               socket_unlock(so, 0);
-                       }
-                       error = filter->sfe_filter->sf_filter.
-                           sf_bind(filter->sfe_cookie, so, nam);
-               }
-       }
-       if (filtered != 0) {
-               socket_lock(so, 0);
-               sflt_unuse(so);
-       }
-       /* End socket filter */
+       error = sflt_bind(so, nam);
 
        if (error == 0)
                error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
@@ -639,6 +707,11 @@ out:
 void
 sodealloc(struct socket *so)
 {
+       kauth_cred_unref(&so->so_cred);
+
+       /* Remove any filters */
+       sflt_termsock(so);
+
        so->so_gencnt = ++so_gencnt;
 
 #if CONFIG_MACF_SOCKET
@@ -678,10 +751,9 @@ solisten(struct socket *so, int backlog)
 {
        struct proc *p = current_proc();
        int error = 0;
-       struct socket_filter_entry *filter;
-       int filtered = 0;
 
        socket_lock(so, 1);
+       
        if (so->so_proto == NULL) {
                error = EINVAL;
                goto out;
@@ -693,13 +765,18 @@ solisten(struct socket *so, int backlog)
 
        /*
         * If the listen request is made on a socket that is not fully
-        * disconnected, or on a previously-accepted socket that has
-        * been marked as inactive, reject the request now.
+        * disconnected, or on a socket that has been marked as inactive,
+        * reject the request now.
         */
        if ((so->so_state &
            (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
            (so->so_flags & SOF_DEFUNCT)) {
                error = EINVAL;
+               if (so->so_flags & SOF_DEFUNCT) {
+                       SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
+                           __func__, proc_pid(p), so, INP_SOCKAF(so),
+                           INP_SOCKTYPE(so), error));
+               }
                goto out;
        }
 
@@ -708,23 +785,7 @@ solisten(struct socket *so, int backlog)
                goto out;
        }
 
-       error = 0;
-       for (filter = so->so_filt; filter && (error == 0);
-           filter = filter->sfe_next_onsocket) {
-               if (filter->sfe_filter->sf_filter.sf_listen) {
-                       if (filtered == 0) {
-                               filtered = 1;
-                               sflt_use(so);
-                               socket_unlock(so, 0);
-                       }
-                       error = filter->sfe_filter->sf_filter.
-                           sf_listen(filter->sfe_cookie, so);
-               }
-       }
-       if (filtered != 0) {
-               socket_lock(so, 0);
-               sflt_unuse(so);
-       }
+       error = sflt_listen(so);
 
        if (error == 0) {
                error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
@@ -768,9 +829,6 @@ sofreelastref(struct socket *so, int dealloc)
 
        /* Assume socket is locked */
 
-       /* Remove any filters - may be called more than once */
-       sflt_termsock(so);
-
        if ((!(so->so_flags & SOF_PCBCLEARING)) ||
            ((so->so_state & SS_NOFDREF) == 0)) {
 #ifdef __APPLE__
@@ -838,10 +896,10 @@ soclose_wait_locked(struct socket *so)
         * Double check here and return if there's no outstanding upcall;
         * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
         */
-       if (!(so->so_flags & SOF_UPCALLINUSE) ||
-           !(so->so_flags & SOF_UPCALLCLOSEWAIT))
+       if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
                return;
-
+       so->so_rcv.sb_flags &= ~SB_UPCALL;
+       so->so_snd.sb_flags &= ~SB_UPCALL;
        so->so_flags |= SOF_CLOSEWAIT;
        (void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1),
            "soclose_wait_locked", NULL);
@@ -977,6 +1035,15 @@ drop:
        if (so->so_usecount == 0)
                panic("soclose: usecount is zero so=%p\n", so);
        if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) {
+               /*
+                * Let NetworkStatistics know this PCB is going away
+                * before we detach it.
+                */
+               if (nstat_collect &&
+                   (so->so_proto->pr_domain->dom_family == AF_INET ||
+                   so->so_proto->pr_domain->dom_family == AF_INET6))
+                       nstat_pcb_detach(so->so_pcb);
+
                int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
                if (error == 0)
                        error = error2;
@@ -987,6 +1054,9 @@ discard:
        if (so->so_pcb && so->so_state & SS_NOFDREF)
                panic("soclose: NOFDREF");
        so->so_state |= SS_NOFDREF;
+       
+       if ((so->so_flags & SOF_KNOTE) != 0)
+               KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
 #ifdef __APPLE__
        so->so_proto->pr_domain->dom_refs--;
        evsofree(so);
@@ -1002,7 +1072,7 @@ soclose(struct socket *so)
        int error = 0;
        socket_lock(so, 1);
 
-       if (so->so_flags & SOF_UPCALLINUSE)
+       if (so->so_upcallusecount)
                soclose_wait_locked(so);
 
        if (so->so_retaincnt == 0) {
@@ -1079,8 +1149,7 @@ int
 soacceptfilter(struct socket *so)
 {
        struct sockaddr *local = NULL, *remote = NULL;
-       struct socket_filter_entry *filter;
-       int error = 0, filtered = 0;
+       int error = 0;
        struct socket *head = so->so_head;
 
        /*
@@ -1101,29 +1170,7 @@ soacceptfilter(struct socket *so)
                goto done;
        }
 
-       /*
-        * At this point, we have a reference on the listening socket
-        * so we know it won't be going away.  Do the same for the newly
-        * accepted socket while we invoke the accept callback routine.
-        */
-       for (filter = so->so_filt; filter != NULL && error == 0;
-           filter = filter->sfe_next_onsocket) {
-               if (filter->sfe_filter->sf_filter.sf_accept != NULL) {
-                       if (!filtered) {
-                               filtered = 1;
-                               sflt_use(so);
-                               socket_unlock(so, 0);
-                       }
-                       error = filter->sfe_filter->sf_filter.
-                           sf_accept(filter->sfe_cookie,
-                           head, so, local, remote);
-               }
-       }
-
-       if (filtered) {
-               socket_lock(so, 0);
-               sflt_unuse(so);
-       }
+       error = sflt_accept(head, so, local, remote);
 
        /*
         * If we get EJUSTRETURN from one of the filters, mark this socket
@@ -1132,10 +1179,8 @@ soacceptfilter(struct socket *so)
         */
        if (error == EJUSTRETURN) {
                error = 0;
-               so->so_flags |= SOF_DEFUNCT;
-               /* Prevent data from being appended to the socket buffers */
-               so->so_snd.sb_flags |= SB_DROP;
-               so->so_rcv.sb_flags |= SB_DROP;
+               (void) sosetdefunct(current_proc(), so,
+                   SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
        }
 
        if (error != 0) {
@@ -1181,15 +1226,21 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
 
        if (dolock)
                socket_lock(so, 1);
-
+       
        /*
         * If this is a listening socket or if this is a previously-accepted
         * socket that has been marked as inactive, reject the connect request.
         */
        if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
+               error = EOPNOTSUPP;
+               if (so->so_flags & SOF_DEFUNCT) {
+                       SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n",
+                           __func__, proc_pid(p), so, INP_SOCKAF(so),
+                           INP_SOCKTYPE(so), error));
+               }
                if (dolock)
                        socket_unlock(so, 1);
-               return (EOPNOTSUPP);
+               return (error);
        }
 
        if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) {
@@ -1213,36 +1264,14 @@ soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
                 * Run connect filter before calling protocol:
                 *  - non-blocking connect returns before completion;
                 */
-               struct socket_filter_entry *filter;
-               int filtered = 0;
-
-               error = 0;
-               for (filter = so->so_filt; filter && (error == 0);
-                   filter = filter->sfe_next_onsocket) {
-                       if (filter->sfe_filter->sf_filter.sf_connect_out) {
-                               if (filtered == 0) {
-                                       filtered = 1;
-                                       sflt_use(so);
-                                       socket_unlock(so, 0);
-                               }
-                               error = filter->sfe_filter->sf_filter.
-                                   sf_connect_out(filter->sfe_cookie, so, nam);
-                       }
-               }
-               if (filtered != 0) {
-                       socket_lock(so, 0);
-                       sflt_unuse(so);
-               }
+               error = sflt_connectout(so, nam);
 
                if (error) {
                        if (error == EJUSTRETURN)
                                error = 0;
-                       if (dolock)
-                               socket_unlock(so, 1);
-                       return (error);
+               } else {
+                       error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
                }
-
-               error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
        }
        if (dolock)
                socket_unlock(so, 1);
@@ -1352,6 +1381,8 @@ restart:
                } else {
                        error = sblock(&so->so_snd, SBLOCKWAIT(flags));
                        if (error) {
+                               if (so->so_flags & SOF_DEFUNCT)
+                                       goto defunct;
                                return (error);
                        }
                        *sblocked = 1;
@@ -1359,12 +1390,17 @@ restart:
        }
 
        /*
-        * If a send attempt is made on a previously-accepted socket
-        * that has been marked as inactive (disconnected), reject
-        * the request.
+        * If a send attempt is made on a socket that has been marked
+        * as inactive (disconnected), reject the request.
         */
-       if (so->so_flags & SOF_DEFUNCT)
-               return (ENOTCONN);
+       if (so->so_flags & SOF_DEFUNCT) {
+defunct:
+               error = EPIPE;
+               SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__,
+                   proc_selfpid(), so, INP_SOCKAF(so), INP_SOCKTYPE(so),
+                   error));
+               return (error);
+       }
 
        if (so->so_state & SS_CANTSENDMORE)
                return (EPIPE);
@@ -1391,15 +1427,19 @@ restart:
        if ((atomic && resid > so->so_snd.sb_hiwat) ||
            clen > so->so_snd.sb_hiwat)
                return (EMSGSIZE);
-       if (space < resid + clen &&
-           (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) {
+       if ((space < resid + clen &&
+           (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) ||
+           (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
                if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
                    assumelock) {
                        return (EWOULDBLOCK);
                }
                sbunlock(&so->so_snd, 1);
+               *sblocked = 0;
                error = sbwait(&so->so_snd);
                if (error) {
+                       if (so->so_flags & SOF_DEFUNCT)
+                               goto defunct;
                        return (error);
                }
                goto restart;
@@ -1490,6 +1530,8 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
            so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
 
        socket_lock(so, 1);
+       so_update_last_owner_locked(so, p);
+       
        if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
                error = EOPNOTSUPP;
                socket_unlock(so, 1);
@@ -1530,10 +1572,6 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
                    1024 : 0);
 
                do {
-                       struct socket_filter_entry *filter;
-                       int filtered;
-                       boolean_t recursive;
-
                        if (uio == NULL) {
                                /*
                                 * Data is prepackaged in "top".
@@ -1586,7 +1624,8 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
                                         * haven't yet consumed.
                                         */
                                        if (freelist == NULL &&
-                                           bytes_to_copy > NBPG && jumbocl) {
+                                           bytes_to_copy > MBIGCLBYTES &&
+                                           jumbocl) {
                                                num_needed =
                                                    bytes_to_copy / M16KCLBYTES;
 
@@ -1609,10 +1648,10 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
                                        if (freelist == NULL &&
                                            bytes_to_copy > MCLBYTES) {
                                                num_needed =
-                                                   bytes_to_copy / NBPG;
+                                                   bytes_to_copy / MBIGCLBYTES;
 
                                                if ((bytes_to_copy -
-                                                   (num_needed * NBPG)) >=
+                                                   (num_needed * MBIGCLBYTES)) >=
                                                    MINCLSIZE)
                                                        num_needed++;
 
@@ -1620,7 +1659,7 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
                                                    m_getpackets_internal(
                                                    (unsigned int *)&num_needed,
                                                    hdrs_needed, M_WAIT, 0,
-                                                   NBPG);
+                                                   MBIGCLBYTES);
                                                /*
                                                 * Fall back to cluster size
                                                 * if allocation failed
@@ -1758,65 +1797,24 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
                        /*
                         * Socket filter processing
                         */
-                       recursive = (so->so_send_filt_thread != NULL);
-                       filtered = 0;
-                       error = 0;
-                       for (filter = so->so_filt; filter && (error == 0);
-                           filter = filter->sfe_next_onsocket) {
-                               if (filter->sfe_filter->sf_filter.sf_data_out) {
-                                       int so_flags = 0;
-                                       if (filtered == 0) {
-                                               filtered = 1;
-                                               so->so_send_filt_thread =
-                                                   current_thread();
-                                               sflt_use(so);
-                                               socket_unlock(so, 0);
-                                               so_flags =
-                                                   (sendflags & MSG_OOB) ?
-                                                   sock_data_filt_flag_oob : 0;
-                                       }
-                                       error = filter->sfe_filter->sf_filter.
-                                           sf_data_out(filter->sfe_cookie, so,
-                                           addr, &top, &control, so_flags);
+                       error = sflt_data_out(so, addr, &top, &control,
+                                               (sendflags & MSG_OOB) ? sock_data_filt_flag_oob : 0);
+                       if (error) {
+                               if (error == EJUSTRETURN) {
+                                       error = 0;
+                                       clen = 0;
+                                       control = 0;
+                                       top = 0;
                                }
-                       }
 
-                       if (filtered) {
-                               /*
-                                * At this point, we've run at least one
-                                * filter.  The socket is unlocked as is
-                                * the socket buffer.  Clear the recorded
-                                * filter thread only when we are outside
-                                * of a filter's context.  This allows for
-                                * a filter to issue multiple inject calls
-                                * from its sf_data_out callback routine.
-                                */
-                               socket_lock(so, 0);
-                               sflt_unuse(so);
-                               if (!recursive)
-                                       so->so_send_filt_thread = 0;
-                               if (error) {
-                                       if (error == EJUSTRETURN) {
-                                               error = 0;
-                                               clen = 0;
-                                               control = 0;
-                                               top = 0;
-                                       }
-
-                                       goto release;
-                               }
+                               goto release;
                        }
                        /*
                         * End Socket filter processing
                         */
 
-                       if (error == EJUSTRETURN) {
-                               /* A socket filter handled this data */
-                               error = 0;
-                       } else {
-                               error = (*so->so_proto->pr_usrreqs->pru_send)
-                                   (so, sendflags, top, addr, control, p);
-                       }
+                       error = (*so->so_proto->pr_usrreqs->pru_send)
+                               (so, sendflags, top, addr, control, p);
 #ifdef __APPLE__
                        if (flags & MSG_SEND)
                                so->so_temp = NULL;
@@ -1910,6 +1908,7 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
            so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
 
        socket_lock(so, 1);
+       so_update_last_owner_locked(so, p);
 
 #ifdef MORE_LOCKING_DEBUG
        if (so->so_usecount == 1)
@@ -1933,14 +1932,18 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
        if (so->so_flags & SOF_DEFUNCT) {
                struct sockbuf *sb = &so->so_rcv;
 
+               error = ENOTCONN;
+               SODEFUNCTLOG(("%s[%d]: defunct so %p [%d,%d] (%d)\n", __func__,
+                   proc_pid(p), so, INP_SOCKAF(so), INP_SOCKTYPE(so), error));
                /*
                 * This socket should have been disconnected and flushed
-                * prior to being returned from accept; there should be
-                * no data on its receive list, so panic otherwise.
+                * prior to being returned from sodefunct(); there should
+                * be no data on its receive list, so panic otherwise.
                 */
-               sb_empty_assert(sb, __func__);
+               if (so->so_state & SS_DEFUNCT)
+                       sb_empty_assert(sb, __func__);
                socket_unlock(so, 1);
-               return (ENOTCONN);
+               return (error);
        }
 
        /*
@@ -2058,9 +2061,7 @@ restart:
                 * end up with false positives during select() or poll()
                 * which could put the application in a bad state.
                 */
-               if (m == NULL && so->so_rcv.sb_cc != 0)
-                       panic("soreceive corrupted so_rcv: m %p cc %u",
-                           m, so->so_rcv.sb_cc);
+               SB_MB_CHECK(&so->so_rcv);
 
                if (so->so_error) {
                        if (m)
@@ -2172,6 +2173,14 @@ dontblock:
                                goto restart;
                        }
                        socket_lock(so, 0);
+                       /*
+                        * If the socket has been defunct'd, drop it.
+                        */
+                       if (so->so_flags & SOF_DEFUNCT) {
+                               m_freem(m);
+                               error = ENOTCONN;
+                               goto release;
+                       }
                        /*
                         * Re-adjust the socket receive list and re-enqueue
                         * the record in front of any packets which may have
@@ -2228,6 +2237,7 @@ dontblock:
                struct mbuf *cm = NULL, *cmn;
                struct mbuf **cme = &cm;
                struct sockbuf *sb_rcv = &so->so_rcv;
+               struct mbuf **msgpcm = NULL;
 
                /*
                 * Externalizing the control messages would require us to
@@ -2240,7 +2250,23 @@ dontblock:
                do {
                        if (flags & MSG_PEEK) {
                                if (controlp != NULL) {
+                                       if (*controlp == NULL) {
+                                               msgpcm = controlp;
+                                       }
                                        *controlp = m_copy(m, 0, m->m_len);
+
+                                       /*
+                                        * If the mbuf allocation failed,
+                                        * free any control mbufs copied so
+                                        * far and return an error.  The
+                                        * socket's own mbufs are kept since
+                                        * this is a MSG_PEEK request.
+                                        */
+                                       if (*controlp == NULL) {
+                                               m_freem(*msgpcm);
+                                               error = ENOBUFS;
+                                               goto release;
+                                       }
                                        controlp = &(*controlp)->m_next;
                                }
                                m = m->m_next;
@@ -2308,11 +2334,16 @@ dontblock:
                        }
                        cm = cmn;
                }
-               orig_resid = 0;
-               if (sb_rcv->sb_mb != NULL)
+               /* 
+                * Update the value of nextrecord in case we received new
+                * records when the socket was unlocked above for 
+                * externalizing SCM_RIGHTS.
+                */
+               if (m != NULL)
                        nextrecord = sb_rcv->sb_mb->m_nextpkt;
                else
-                       nextrecord = NULL;
+                       nextrecord = sb_rcv->sb_mb;
+               orig_resid = 0;
        }
 
        if (m != NULL) {
@@ -2337,7 +2368,6 @@ dontblock:
                        flags |= MSG_OOB;
        } else {
                if (!(flags & MSG_PEEK)) {
-                       so->so_rcv.sb_mb = nextrecord;
                        SB_EMPTY_FIXUP(&so->so_rcv);
                }
        }
@@ -2474,8 +2504,25 @@ dontblock:
                        if (flags & MSG_PEEK) {
                                moff += len;
                        } else {
-                               if (mp)
-                                       *mp = m_copym(m, 0, len, M_WAIT);
+                               if (mp != NULL) {
+                                       int copy_flag;
+
+                                       if (flags & MSG_DONTWAIT)
+                                               copy_flag = M_DONTWAIT;
+                                       else
+                                               copy_flag = M_WAIT;
+                                       *mp = m_copym(m, 0, len, copy_flag);
+                                       if (*mp == NULL) {
+                                               /*
+                                                * Failed to allocate an mbuf.
+                                                * Adjust uio_resid back, it was
+                                                * adjusted down by len bytes which
+                                                * we didn't copy over
+                                                */
+                                               uio_setresid(uio, (uio_resid(uio) + len));
+                                               break;
+                                       }
+                               }
                                m->m_data += len;
                                m->m_len -= len;
                                so->so_rcv.sb_cc -= len;
@@ -2567,6 +2614,7 @@ dontblock:
                        if (m) {
                                nextrecord = m->m_nextpkt;
                        }
+                       SB_MB_CHECK(&so->so_rcv);
                }
        }
 #ifdef MORE_LOCKING_DEBUG
@@ -2614,6 +2662,7 @@ dontblock:
                        } else if (nextrecord->m_nextpkt == NULL) {
                                so->so_rcv.sb_lastrecord = nextrecord;
                        }
+                       SB_MB_CHECK(&so->so_rcv);
                }
                SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
                SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
@@ -2934,13 +2983,12 @@ sosetopt(struct socket *so, struct sockopt *sopt)
        int     error, optval;
        struct  linger l;
        struct  timeval tv;
-       struct socket_filter_entry *filter;
-       int filtered = 0;
 #if CONFIG_MACF_SOCKET
        struct mac extmac;
 #endif /* MAC_SOCKET */
 
        socket_lock(so, 1);
+       
        if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE))
            == (SS_CANTRCVMORE | SS_CANTSENDMORE) && 
            (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
@@ -2953,29 +3001,11 @@ sosetopt(struct socket *so, struct sockopt *sopt)
                sopt->sopt_dir = SOPT_SET;
        }
 
-       error = 0;
-       for (filter = so->so_filt; filter && (error == 0);
-           filter = filter->sfe_next_onsocket) {
-               if (filter->sfe_filter->sf_filter.sf_setoption) {
-                       if (filtered == 0) {
-                               filtered = 1;
-                               sflt_use(so);
-                               socket_unlock(so, 0);
-                       }
-                       error = filter->sfe_filter->sf_filter.
-                           sf_setoption(filter->sfe_cookie, so, sopt);
-               }
-       }
-
-       if (filtered != 0) {
-               socket_lock(so, 0);
-               sflt_unuse(so);
-
-               if (error) {
-                       if (error == EJUSTRETURN)
-                               error = 0;
-                       goto bad;
-               }
+       error = sflt_setsockopt(so, sopt);
+       if (error) {
+               if (error == EJUSTRETURN)
+                       error = 0;
+               goto bad;
        }
 
        error = 0;
@@ -3011,6 +3041,7 @@ sosetopt(struct socket *so, struct sockopt *sopt)
                case SO_REUSEPORT:
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
+               case SO_TIMESTAMP_MONOTONIC:
 #ifdef __APPLE__
                case SO_DONTTRUNC:
                case SO_WANTMORE:
@@ -3047,17 +3078,18 @@ sosetopt(struct socket *so, struct sockopt *sopt)
                        switch (sopt->sopt_name) {
                        case SO_SNDBUF:
                        case SO_RCVBUF:
-                               if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
-                                   &so->so_snd : &so->so_rcv,
-                                   (u_int32_t) optval) == 0) {
+                       {
+                               struct sockbuf *sb = (sopt->sopt_name == SO_SNDBUF) ?
+                                       &so->so_snd : &so->so_rcv;
+                               if (sbreserve(sb, (u_int32_t) optval) == 0) {
                                        error = ENOBUFS;
                                        goto bad;
                                }
-                               if (sopt->sopt_name == SO_SNDBUF)
-                                       so->so_snd.sb_flags |= SB_USRSIZE;
-                               else
-                                       so->so_rcv.sb_flags |= SB_USRSIZE;
+                               sb->sb_flags |= SB_USRSIZE;
+                               sb->sb_flags &= ~SB_AUTOSIZE;
+                               sb->sb_idealsize = (u_int32_t)optval;
                                break;
+                       }
 
                        /*
                         * Make sure the low-water is never greater than
@@ -3101,8 +3133,7 @@ sosetopt(struct socket *so, struct sockopt *sopt)
                        if (error)
                                goto bad;
 
-                       error = sflt_attach_private(so, NULL,
-                           nke.nke_handle, 1);
+                       error = sflt_attach_internal(so, nke.nke_handle);
                        break;
                }
 
@@ -3228,6 +3259,111 @@ sosetopt(struct socket *so, struct sockopt *sopt)
                        break;
                }
 
+               case SO_TRAFFIC_CLASS: {
+                       error = sooptcopyin(sopt, &optval, sizeof (optval),
+                               sizeof (optval));
+                       if (error)
+                               goto bad;
+                       error = so_set_traffic_class(so, optval);
+                       if (error)
+                               goto bad;
+                       break;
+               }
+
+               case SO_RECV_TRAFFIC_CLASS: {
+                       error = sooptcopyin(sopt, &optval, sizeof (optval),
+                               sizeof (optval));
+                       if (error)
+                               goto bad;
+                       if (optval == 0)
+                               so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
+                       else
+                               so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
+                       break;
+               }
+
+               case SO_TRAFFIC_CLASS_DBG: {
+                       struct so_tcdbg so_tcdbg;
+
+                       error = sooptcopyin(sopt, &so_tcdbg,
+                           sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
+                       if (error)
+                               goto bad;
+                       error = so_set_tcdbg(so, &so_tcdbg);
+                       if (error)
+                               goto bad;
+                       break;
+               }
+
+               case SO_PRIVILEGED_TRAFFIC_CLASS:
+                       error = priv_check_cred(kauth_cred_get(),
+                           PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
+                       if (error)
+                               goto bad;
+                       error = sooptcopyin(sopt, &optval, sizeof (optval),
+                               sizeof (optval));
+                       if (error)
+                               goto bad;
+                       if (optval == 0)
+                               so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
+                       else
+                               so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
+                       break;
+
+               case SO_DEFUNCTOK:
+                       error = sooptcopyin(sopt, &optval, sizeof (optval),
+                           sizeof (optval));
+                       if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
+                               if (error == 0)
+                                       error = EBADF;
+                               goto bad;
+                       }
+                       /*
+                        * Any process can set SO_DEFUNCTOK (clear
+                        * SOF_NODEFUNCT), but only root can clear
+                        * SO_DEFUNCTOK (set SOF_NODEFUNCT).
+                        */
+                       if (optval == 0 &&
+                           kauth_cred_issuser(kauth_cred_get()) == 0) {
+                               error = EPERM;
+                               goto bad;
+                       }
+                       if (optval)
+                               so->so_flags &= ~SOF_NODEFUNCT;
+                       else
+                               so->so_flags |= SOF_NODEFUNCT;
+
+                       SODEFUNCTLOG(("%s[%d]: so %p [%d,%d] is now marked as "
+                           "%seligible for defunct\n", __func__,
+                           proc_selfpid(), so, INP_SOCKAF(so),
+                           INP_SOCKTYPE(so),
+                           (so->so_flags & SOF_NODEFUNCT) ? "not " : ""));
+                       break;
+
+               case SO_ISDEFUNCT:
+                       /* This option is not settable */
+                       error = EINVAL;
+                       break;
+
+               case SO_OPPORTUNISTIC:
+                       error = sooptcopyin(sopt, &optval, sizeof (optval),
+                           sizeof (optval));
+                       if (error == 0)
+                               error = so_set_opportunistic(so, optval);
+                       break;
+
+               case SO_FLUSH:
+                       /* This option is handled by lower layer(s) */
+                       error = 0;
+                       break;
+
+               case SO_RECV_ANYIF:
+                       error = sooptcopyin(sopt, &optval, sizeof (optval),
+                           sizeof (optval));
+                       if (error == 0)
+                               error = so_set_recv_anyif(so, optval);
+                       break;
+
                default:
                        error = ENOPROTOOPT;
                        break;
@@ -3316,8 +3452,6 @@ sogetopt(struct socket *so, struct sockopt *sopt)
        int     error, optval;
        struct  linger l;
        struct  timeval tv;
-       struct  socket_filter_entry *filter;
-       int     filtered = 0;
 #if CONFIG_MACF_SOCKET
        struct mac extmac;
 #endif /* MAC_SOCKET */
@@ -3328,31 +3462,14 @@ sogetopt(struct socket *so, struct sockopt *sopt)
 
        socket_lock(so, 1);
 
-       error = 0;
-       for (filter = so->so_filt; filter && (error == 0);
-           filter = filter->sfe_next_onsocket) {
-               if (filter->sfe_filter->sf_filter.sf_getoption) {
-                       if (filtered == 0) {
-                               filtered = 1;
-                               sflt_use(so);
-                               socket_unlock(so, 0);
-                       }
-                       error = filter->sfe_filter->sf_filter.
-                           sf_getoption(filter->sfe_cookie, so, sopt);
-               }
-       }
-       if (filtered != 0) {
-               socket_lock(so, 0);
-               sflt_unuse(so);
-
-               if (error) {
-                       if (error == EJUSTRETURN)
-                               error = 0;
-                       socket_unlock(so, 1);
-                       return (error);
-               }
+       error = sflt_getsockopt(so, sopt);
+       if (error) {
+               if (error == EJUSTRETURN)
+                       error = 0;
+               socket_unlock(so, 1);
+               return (error);
        }
-
+       
        error = 0;
        if (sopt->sopt_level != SOL_SOCKET) {
                if (so->so_proto && so->so_proto->pr_ctloutput) {
@@ -3382,6 +3499,7 @@ sogetopt(struct socket *so, struct sockopt *sopt)
                case SO_BROADCAST:
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
+               case SO_TIMESTAMP_MONOTONIC:
 #ifdef __APPLE__
                case SO_DONTTRUNC:
                case SO_WANTMORE:
@@ -3517,6 +3635,48 @@ integer:
                        error = sooptcopyout(sopt, &sonpx, sizeof(struct so_np_extensions));
                        break;  
                }
+
+               case SO_TRAFFIC_CLASS:
+                       optval = so->so_traffic_class;
+                       goto integer;
+
+               case SO_RECV_TRAFFIC_CLASS:
+                       optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
+                       goto integer;
+
+               case SO_TRAFFIC_CLASS_STATS:
+                       error = sooptcopyout(sopt, &so->so_tc_stats, sizeof(so->so_tc_stats));
+                       break;
+
+               case SO_TRAFFIC_CLASS_DBG: 
+                       error = sogetopt_tcdbg(so, sopt);
+                       break;
+
+               case SO_PRIVILEGED_TRAFFIC_CLASS:
+                       optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
+                       goto integer;
+
+               case SO_DEFUNCTOK:
+                       optval = !(so->so_flags & SOF_NODEFUNCT);
+                       goto integer;
+
+               case SO_ISDEFUNCT:
+                       optval = (so->so_flags & SOF_DEFUNCT);
+                       goto integer;
+
+               case SO_OPPORTUNISTIC:
+                       optval = so_get_opportunistic(so);
+                       goto integer;
+
+               case SO_FLUSH:
+                       /* This option is not gettable */
+                       error = EINVAL;
+                       break;
+
+               case SO_RECV_ANYIF:
+                       optval = so_get_recv_anyif(so);
+                       goto integer;
+
                default:
                        error = ENOPROTOOPT;
                        break;
@@ -3525,8 +3685,10 @@ integer:
                return (error);
        }
 }
-
-/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
+/* The size limit enforced by our soopt_getm differs from FreeBSD's.
+ * We limit the size of options to MCLBYTES. This will have to change
+ * if we need to define options that need more space than MCLBYTES.
+ */
 int
 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
 {
@@ -3534,7 +3696,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp)
        int sopt_size = sopt->sopt_valsize;
        int how;
 
-       if (sopt_size > MAX_SOOPTGETM_SIZE)
+       if (sopt_size <= 0 || sopt_size > MCLBYTES)
                return (EMSGSIZE);
 
        how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
@@ -3555,7 +3717,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp)
        *mp = m;
        m_prev = m;
 
-       while (sopt_size) {
+       while (sopt_size > 0) {
                MGET(m, how, MT_DATA);
                if (m == 0) {
                        m_freem(*mp);
@@ -3565,6 +3727,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp)
                        MCLGET(m, how);
                        if ((m->m_flags & M_EXT) == 0) {
                                m_freem(*mp);
+                               m_freem(m);
                                return (ENOBUFS);
                        }
                        m->m_len = min(MCLBYTES, sopt_size);
@@ -3578,7 +3741,7 @@ soopt_getm(struct sockopt *sopt, struct mbuf **mp)
        return (0);
 }
 
-/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
+/* copyin sopt data into mbuf chain */
 int
 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
 {
@@ -3609,7 +3772,7 @@ soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
        return (0);
 }
 
-/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
+/* copyout mbuf chain data into soopt */
 int
 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
 {
@@ -3706,7 +3869,7 @@ soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
     __unused struct proc *p)
 {
        struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
-       struct sockbuf *sb;
+       struct klist *skl;
 
        socket_lock(so, 1);
 
@@ -3720,19 +3883,37 @@ soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
        switch (kn->kn_filter) {
        case EVFILT_READ:
                kn->kn_fop = &soread_filtops;
-               sb = &so->so_rcv;
+               skl = &so->so_rcv.sb_sel.si_note;
                break;
        case EVFILT_WRITE:
                kn->kn_fop = &sowrite_filtops;
-               sb = &so->so_snd;
+               skl = &so->so_snd.sb_sel.si_note;
+               break;
+       case EVFILT_SOCK:
+               kn->kn_fop = &sock_filtops;
+               skl = &so->so_klist;
                break;
        default:
                socket_unlock(so, 1);
                return (1);
        }
 
-       if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn))
-               sb->sb_flags |= SB_KNOTE;
+       if (KNOTE_ATTACH(skl, kn)) {
+               switch(kn->kn_filter) {
+               case EVFILT_READ:
+                       so->so_rcv.sb_flags |= SB_KNOTE;
+                       break;
+               case EVFILT_WRITE:
+                       so->so_snd.sb_flags |= SB_KNOTE;
+                       break;
+               case EVFILT_SOCK:
+                       so->so_flags |= SOF_KNOTE;
+                       break;
+               default:
+                       socket_unlock(so, 1);
+                       return (1);
+               }
+       }
        socket_unlock(so, 1);
        return (0);
 }
@@ -3818,12 +3999,19 @@ filt_soread(struct knote *kn, long hint)
                return (1);
        }
 
+       int64_t lowwat = so->so_rcv.sb_lowat;
+       if (kn->kn_sfflags & NOTE_LOWAT)
+       {
+               if (kn->kn_sdata > so->so_rcv.sb_hiwat)
+                       lowwat = so->so_rcv.sb_hiwat;
+               else if (kn->kn_sdata > lowwat)
+                       lowwat = kn->kn_sdata;
+       }
+       
        if ((hint & SO_FILT_HINT_LOCKED) == 0)
                socket_unlock(so, 1);
-
-       return ((kn->kn_flags & EV_OOBAND) ||
-           kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
-           kn->kn_sdata : so->so_rcv.sb_lowat));
+       
+       return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
 }
 
 static void
@@ -3838,11 +4026,25 @@ filt_sowdetach(struct knote *kn)
        socket_unlock(so, 1);
 }
 
+/*
+ * Report whether a connected Internet-domain socket is waiting for
+ * interface feedback.
+ *
+ * Returns 1 when the socket's protocol domain family is AF_INET or
+ * AF_INET6, the socket is in the SS_ISCONNECTED state, and
+ * INP_WAIT_FOR_IF_FEEDBACK() evaluates true for its inpcb.
+ * Returns 0 in every other case.
+ */
+int
+so_wait_for_if_feedback(struct socket *so)
+{
+       int family = so->so_proto->pr_domain->dom_family;
+       struct inpcb *inp;
+
+       /* Only IPv4/IPv6 sockets are inspected */
+       if (family != AF_INET && family != AF_INET6)
+               return (0);
+
+       /* ... and only while they are connected */
+       if ((so->so_state & SS_ISCONNECTED) == 0)
+               return (0);
+
+       inp = sotoinpcb(so);
+       return (INP_WAIT_FOR_IF_FEEDBACK(inp) ? 1 : 0);
+}
+
 /*ARGSUSED*/
 static int
 filt_sowrite(struct knote *kn, long hint)
 {
        struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
+       int ret = 0;
 
        if ((hint & SO_FILT_HINT_LOCKED) == 0)
                socket_lock(so, 1);
@@ -3851,29 +4053,142 @@ filt_sowrite(struct knote *kn, long hint)
        if (so->so_state & SS_CANTSENDMORE) {
                kn->kn_flags |= EV_EOF;
                kn->kn_fflags = so->so_error;
-               if ((hint & SO_FILT_HINT_LOCKED) == 0)
-                       socket_unlock(so, 1);
-               return (1);
+               ret = 1;
+               goto out;
        }
        if (so->so_error) {     /* temporary udp error */
-               if ((hint & SO_FILT_HINT_LOCKED) == 0)
-                       socket_unlock(so, 1);
-               return (1);
+               ret = 1;
+               goto out;
        }
        if (((so->so_state & SS_ISCONNECTED) == 0) &&
            (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
-               if ((hint & SO_FILT_HINT_LOCKED) == 0)
-                       socket_unlock(so, 1);
-               return (0);
+               ret = 0;
+               goto out;
+       }
+       int64_t lowwat = so->so_snd.sb_lowat;
+       if (kn->kn_sfflags & NOTE_LOWAT)
+       {
+               if (kn->kn_sdata > so->so_snd.sb_hiwat)
+                       lowwat = so->so_snd.sb_hiwat;
+               else if (kn->kn_sdata > lowwat)
+                       lowwat = kn->kn_sdata;
        }
+       if (kn->kn_data >= lowwat) {
+               if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
+                       ret = tcp_notsent_lowat_check(so);
+               } else {
+                       ret = 1;
+               }
+       }
+       if (so_wait_for_if_feedback(so))
+               ret = 0;
+out:
        if ((hint & SO_FILT_HINT_LOCKED) == 0)
                socket_unlock(so, 1);
-       if (kn->kn_sfflags & NOTE_LOWAT)
-               return (kn->kn_data >= kn->kn_sdata);
-       return (kn->kn_data >= so->so_snd.sb_lowat);
+       return(ret);
 }
 
-#define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + sizeof(void *) + 1) + 1)
+/*
+ * f_detach handler of sock_filtops (the EVFILT_SOCK filter).
+ *
+ * Takes the socket lock, and if the socket is flagged SOF_KNOTE, removes
+ * the knote from so->so_klist.  SOF_KNOTE is cleared when KNOTE_DETACH()
+ * returns nonzero (presumably meaning the list became empty -- confirm
+ * against the KNOTE_DETACH definition).
+ */
+static void
+filt_sockdetach(struct knote *kn)
+{
+       struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
+
+       socket_lock(so, 1);
+       if ((so->so_flags & SOF_KNOTE) != 0 &&
+           KNOTE_DETACH(&so->so_klist, kn))
+               so->so_flags &= ~SOF_KNOTE;
+       socket_unlock(so, 1);
+}
+
+/*
+ * f_event handler of sock_filtops (the EVFILT_SOCK filter).
+ *
+ * Translates the socket event carried in 'hint', together with the
+ * current socket state, into NOTE_* bits in kn->kn_fflags.  Only bits
+ * the caller subscribed to (present in kn->kn_sfflags) are ever set.
+ *
+ * The socket lock is taken here unless the caller passes
+ * SO_FILT_HINT_LOCKED, and is dropped again before returning if it was
+ * taken here.
+ *
+ * Returns 1 when the knote should fire (so_error is set, or at least one
+ * bit is set in kn->kn_fflags), 0 otherwise.
+ */
+static int
+filt_sockev(struct knote *kn, long hint)
+{
+       int ret = 0, locked = 0;
+       struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
+
+       if ((hint & SO_FILT_HINT_LOCKED) == 0) {
+               socket_lock(so, 1);
+               locked = 1;
+       }
+
+       /*
+        * NOTE(review): the switch matches only when exactly one of these
+        * event values remains after masking with SO_FILT_HINT_EV; a hint
+        * combining several of them would match no case.  Confirm callers
+        * never combine them.
+        */
+       switch (hint & SO_FILT_HINT_EV) {
+       case SO_FILT_HINT_CONNRESET:
+               if (kn->kn_sfflags & NOTE_CONNRESET)
+                       kn->kn_fflags |= NOTE_CONNRESET;
+               break;
+       case SO_FILT_HINT_TIMEOUT:
+               if (kn->kn_sfflags & NOTE_TIMEOUT)
+                       kn->kn_fflags |= NOTE_TIMEOUT;
+               break;
+       case SO_FILT_HINT_NOSRCADDR:
+               if (kn->kn_sfflags & NOTE_NOSRCADDR)
+                       kn->kn_fflags |= NOTE_NOSRCADDR;
+               break;
+       case SO_FILT_HINT_IFDENIED:
+               if ((kn->kn_sfflags & NOTE_IFDENIED))
+                       kn->kn_fflags |= NOTE_IFDENIED;
+               break;
+       case SO_FILT_HINT_KEEPALIVE:
+               if (kn->kn_sfflags & NOTE_KEEPALIVE)
+                       kn->kn_fflags |= NOTE_KEEPALIVE;
+               break;
+       default:
+               /* No one-shot event in this hint; state checks follow */
+               break;
+       }
+
+       /* Half-close notifications are derived from the socket state */
+       if ((kn->kn_sfflags & NOTE_READCLOSED) &&
+               (so->so_state & SS_CANTRCVMORE))
+               kn->kn_fflags |= NOTE_READCLOSED;
+
+       if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
+               (so->so_state & SS_CANTSENDMORE))
+               kn->kn_fflags |= NOTE_WRITECLOSED;
+
+       /*
+        * NOTE_SUSPEND and NOTE_RESUME are mutually exclusive: whichever
+        * applies clears both bits before setting its own.
+        */
+       if ((kn->kn_sfflags & NOTE_SUSPEND) &&
+           ((hint & SO_FILT_HINT_SUSPEND) ||
+           (so->so_flags & SOF_SUSPENDED))) {
+               kn->kn_fflags &=
+                       ~(NOTE_SUSPEND | NOTE_RESUME);
+               kn->kn_fflags |= NOTE_SUSPEND;
+       }
+
+       /*
+        * NOTE(review): this also reports NOTE_RESUME whenever the socket
+        * is simply not suspended, not only on a RESUME hint.
+        */
+       if ((kn->kn_sfflags & NOTE_RESUME) &&
+           ((hint & SO_FILT_HINT_RESUME) ||
+           (so->so_flags & SOF_SUSPENDED) == 0)) {
+               kn->kn_fflags &=
+                       ~(NOTE_SUSPEND | NOTE_RESUME);
+               kn->kn_fflags |= NOTE_RESUME;
+       }
+
+       if (so->so_error != 0) {
+               /* A pending socket error always fires, reported as EOF */
+               ret = 1;
+               kn->kn_data = so->so_error;
+               kn->kn_flags |= EV_EOF;
+       } else {
+               /* Otherwise kn_data carries the SOCKEV_* connection state */
+               get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
+       }
+
+       if (kn->kn_fflags != 0)
+               ret = 1;
+
+       if (locked)
+               socket_unlock(so, 1);
+
+       return(ret);
+}
+
+/*
+ * Fold the socket's connection state into the SOCKEV_* bitmask at *statep.
+ *
+ * SOCKEV_CONNECTED is set or cleared to mirror SS_ISCONNECTED.
+ * SOCKEV_DISCONNECTED is set when SS_ISDISCONNECTED is set and is never
+ * cleared here.  All other bits already in *statep are preserved.
+ */
+void
+get_sockev_state(struct socket *so, u_int32_t *statep)
+{
+       u_int32_t state = *statep;
+
+       /* Start from "not connected", then re-assert if applicable */
+       state &= ~(SOCKEV_CONNECTED);
+       if (so->so_state & SS_ISCONNECTED)
+               state |= SOCKEV_CONNECTED;
+
+       if (so->so_state & SS_ISDISCONNECTED)
+               state |= SOCKEV_DISCONNECTED;
+
+       *statep = state;
+}
+
+#define SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof(void *)) + 1) + 1)
 
 __private_extern__ const char * solockhistory_nr(struct socket *so)
 {
@@ -3881,6 +4196,7 @@ __private_extern__ const char * solockhistory_nr(struct socket *so)
         int i;
         static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
 
+       bzero(lock_history_str, sizeof(lock_history_str));
         for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
                 n += snprintf(lock_history_str + n, SO_LOCK_HISTORY_STR_LEN - n, "%lx:%lx ",
                         (uintptr_t) so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
@@ -4009,3 +4325,144 @@ so_isdstlocal(struct socket *so) {
        } 
        return 0;
 }
+
+/*
+ * Mark a socket as defunct (SOF_DEFUNCT) and set SB_DROP on both of its
+ * socket buffers so that no further data is appended to them.
+ *
+ * p       - target process; used only in the SODEFUNCTLOG messages
+ * so      - socket to mark
+ * level   - used only in the SODEFUNCTLOG messages
+ * noforce - when true, a socket flagged SOF_NODEFUNCT is left untouched
+ *           and EOPNOTSUPP is returned; when false, SOF_NODEFUNCT is
+ *           cleared and the socket is marked defunct anyway
+ *
+ * Returns 0 on success (including when the socket was already marked),
+ * or EOPNOTSUPP as described above.  Panics if the socket is already
+ * SOF_DEFUNCT but SB_DROP is not set on both buffers.
+ */
+int
+sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
+{
+       int defunct = (so->so_flags & SOF_DEFUNCT);
+
+       if (defunct) {
+               /* Already marked: both buffers must be dropping data */
+               if ((so->so_snd.sb_flags & so->so_rcv.sb_flags &
+                   SB_DROP) == 0)
+                       panic("%s: SB_DROP not set", __func__);
+       } else {
+               if (so->so_flags & SOF_NODEFUNCT) {
+                       if (noforce) {
+                               int err = EOPNOTSUPP;
+
+                               SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p "
+                                   "[%d,%d] is not eligible for defunct (%d)\n",
+                                   __func__, proc_selfpid(), proc_pid(p),
+                                   level, so, INP_SOCKAF(so),
+                                   INP_SOCKTYPE(so), err));
+                               return (err);
+                       }
+                       /* Forced: override the socket's opt-out */
+                       so->so_flags &= ~SOF_NODEFUNCT;
+                       SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] "
+                           "defunct by force\n", __func__, proc_selfpid(),
+                           proc_pid(p), level, so, INP_SOCKAF(so),
+                           INP_SOCKTYPE(so)));
+               }
+
+               so->so_flags |= SOF_DEFUNCT;
+               /* Prevent further data from being appended to the buffers */
+               so->so_snd.sb_flags |= SB_DROP;
+               so->so_rcv.sb_flags |= SB_DROP;
+       }
+
+       SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] %s "
+           "defunct\n", __func__, proc_selfpid(), proc_pid(p), level, so,
+           INP_SOCKAF(so), INP_SOCKTYPE(so),
+           defunct ? "is already" : "marked as"));
+
+       return (0);
+}
+
+/*
+ * Tear down a socket that has already been marked SOF_DEFUNCT: wake any
+ * waiters, shut down both directions, disconnect, record an error and
+ * release buffered data.
+ *
+ * p     - target process; used only in the SODEFUNCTLOG message
+ * so    - socket to defunct; must already have SOF_DEFUNCT set,
+ *         otherwise this function panics
+ * level - used only in the SODEFUNCTLOG message
+ *
+ * Does nothing when SS_DEFUNCT is already set in so_state.
+ * Always returns 0.
+ */
+int
+sodefunct(struct proc *p, struct socket *so, int level)
+{
+       struct sockbuf *rcv, *snd;
+
+       if (!(so->so_flags & SOF_DEFUNCT))
+               panic("%s improperly called", __func__);
+
+       /* Teardown already performed on an earlier call */
+       if (so->so_state & SS_DEFUNCT)
+               goto done;
+
+       rcv = &so->so_rcv;
+       snd = &so->so_snd;
+
+       SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so %p [%d,%d] is now "
+           "defunct [rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
+           __func__, proc_selfpid(), proc_pid(p), level, so,
+           INP_SOCKAF(so), INP_SOCKTYPE(so),
+           (uint32_t)rcv->sb_sel.si_flags, (uint32_t)snd->sb_sel.si_flags,
+           (uint16_t)rcv->sb_flags, (uint16_t)snd->sb_flags));
+
+       /*
+        * Unwedge threads blocked on sbwait() and sb_lock().
+        */
+       sbwakeup(rcv);
+       sbwakeup(snd);
+
+       if (rcv->sb_flags & SB_LOCK)
+               sbunlock(rcv, 1);
+       if (snd->sb_flags & SB_LOCK)
+               sbunlock(snd, 1);
+
+       /*
+        * Flush the buffers and disconnect.  We explicitly call shutdown
+        * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
+        * states are set for the socket.  This would also flush out data
+        * hanging off the receive list of this socket.
+        */
+       (void) soshutdownlock(so, SHUT_RD);
+       (void) soshutdownlock(so, SHUT_WR);
+       (void) sodisconnectlocked(so);
+
+       /*
+        * Explicitly handle connectionless-protocol disconnection
+        * and release any remaining data in the socket buffers.
+        *
+        * SS_ISDISCONNECTED is a so_state bit (SS_* flags live in
+        * so_state, SOF_* flags in so_flags), so it must be tested
+        * against so_state rather than so_flags.
+        */
+       if (!(so->so_state & SS_ISDISCONNECTED))
+               (void) soisdisconnected(so);
+
+       /* Keep an error the socket already carries; else report EBADF */
+       if (so->so_error == 0)
+               so->so_error = EBADF;
+
+       if (rcv->sb_cc != 0)
+               sbrelease(rcv);
+       if (snd->sb_cc != 0)
+               sbrelease(snd);
+
+       /* Remember that teardown has been done so it is not repeated */
+       so->so_state |= SS_DEFUNCT;
+
+done:
+       return (0);
+}
+
+/*
+ * Set or clear INP_RECV_ANYIF on the socket's inpcb.
+ *
+ * Applies to AF_INET sockets, and also to AF_INET6 sockets when the
+ * kernel is built with INET6.  A nonzero optval sets the flag, zero
+ * clears it.
+ *
+ * Returns 0 on success, or EPROTONOSUPPORT for any other address family.
+ */
+__private_extern__ int
+so_set_recv_anyif(struct socket *so, int optval)
+{
+       int inet_family = (INP_SOCKAF(so) == AF_INET);
+
+#if INET6
+       if (!inet_family)
+               inet_family = (INP_SOCKAF(so) == AF_INET6);
+#endif /* INET6 */
+
+       if (!inet_family)
+               return (EPROTONOSUPPORT);
+
+       if (optval)
+               sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
+       else
+               sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
+
+       return (0);
+}
+
+/*
+ * Report whether INP_RECV_ANYIF is set on the socket's inpcb.
+ *
+ * Returns 1 when the socket is AF_INET (or AF_INET6 when the kernel is
+ * built with INET6) and the flag is set; returns 0 when the flag is
+ * clear or the socket belongs to any other address family.
+ */
+__private_extern__ int
+so_get_recv_anyif(struct socket *so)
+{
+       int inet_family = (INP_SOCKAF(so) == AF_INET);
+
+#if INET6
+       if (!inet_family)
+               inet_family = (INP_SOCKAF(so) == AF_INET6);
+#endif /* INET6 */
+
+       if (!inet_family)
+               return (0);
+
+       return ((sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0);
+}