git.saurik.com Git - apple/xnu.git/blobdiff - bsd/netinet/in_tclass.c
xnu-2050.7.9.tar.gz
[apple/xnu.git] / bsd / netinet / in_tclass.c
index 54b5fcc1df7d9a9fd90dd7ef87f65e513f4d268c..02d9ccc86bf13b7bbcd999f613a4871825224756 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2009-2011 Apple Inc. All rights reserved.
+ * Copyright (c) 2009-2012 Apple Inc. All rights reserved.
  *
  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
  *
@@ -60,32 +60,32 @@ extern char *proc_name_address(void *p);
 
 static int tfp_count = 0;
 
-static TAILQ_HEAD(, tclass_for_proc) tfp_head = TAILQ_HEAD_INITIALIZER(tfp_head);
+static TAILQ_HEAD(, tclass_for_proc) tfp_head =
+    TAILQ_HEAD_INITIALIZER(tfp_head);
 
 struct tclass_for_proc {
        TAILQ_ENTRY(tclass_for_proc)    tfp_link;
-       int                                                     tfp_class;
-       pid_t                                                   tfp_pid;
-       char                                                    tfp_pname[MAXCOMLEN + 1];
+       int     tfp_class;
+       pid_t   tfp_pid;
+       char    tfp_pname[MAXCOMLEN + 1];
 };
 
-extern void tcp_set_background_cc(struct socket *);
-extern void tcp_set_foreground_cc(struct socket *);
-
-int dscp_code_from_mbuf_tclass(int );
-
-static int get_pid_tclass(pid_t , int *);
-static int get_pname_tclass(const char * , int *);
-static int set_pid_tclass(pid_t , int );
-static int set_pname_tclass(const char * , int );
+static int dscp_code_from_mbuf_tclass(mbuf_traffic_class_t);
+static int get_pid_tclass(struct so_tcdbg *);
+static int get_pname_tclass(struct so_tcdbg *);
+static int set_pid_tclass(struct so_tcdbg *);
+static int set_pname_tclass(struct so_tcdbg *);
+static int flush_pid_tclass(struct so_tcdbg *);
 static int purge_tclass_for_proc(void);
 static int flush_tclass_for_proc(void);
+static void so_set_lro(struct socket*, int);
+int get_tclass_for_curr_proc(int *);
 
-
-static lck_grp_attr_t *tclass_lck_grp_attr = NULL;  /* mutex group attributes */
-static lck_grp_t *tclass_lck_grp = NULL;            /* mutex group definition */
-static lck_attr_t *tclass_lck_attr = NULL;          /* mutex attributes */
-static lck_mtx_t *tclass_lock = NULL;
+static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */
+static lck_grp_t *tclass_lck_grp = NULL;       /* mutex group definition */
+static lck_attr_t *tclass_lck_attr = NULL;     /* mutex attributes */
+decl_lck_mtx_data(static, tclass_lock_data);
+static lck_mtx_t *tclass_lock = &tclass_lock_data;
 
 /*
  * Must be called with tclass_lock held
@@ -94,12 +94,12 @@ static struct tclass_for_proc *
 find_tfp_by_pid(pid_t pid)
 {
        struct tclass_for_proc *tfp;
-       
+
        TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
                if (tfp->tfp_pid == pid)
                        break;
        }
-       return tfp;
+       return (tfp);
 }
 
 /*
@@ -109,36 +109,39 @@ static struct tclass_for_proc *
 find_tfp_by_pname(const char *pname)
 {
        struct tclass_for_proc *tfp;
-       
+
        TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
-               if (strncmp(pname, tfp->tfp_pname, sizeof(tfp->tfp_pname)) == 0)
+               if (strncmp(pname, tfp->tfp_pname,
+                   sizeof (tfp->tfp_pname)) == 0)
                        break;
        }
-       return tfp;
+       return (tfp);
 }
 
-static int
-get_tclass_for_curr_proc(void)
+__private_extern__ int
+get_tclass_for_curr_proc(int *sotc)
 {
-       struct tclass_for_proc *tfp;
-       int sotc = SO_TC_BE;
+       struct tclass_for_proc *tfp = NULL;
        proc_t p = current_proc();      /* Not ref counted */
        pid_t pid = proc_pid(p);
        char *pname = proc_name_address(p);
-       
+
+       *sotc = -1;
+
        lck_mtx_lock(tclass_lock);
-       
+
        TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
-               if ((tfp->tfp_pid == pid) ||
-                       (tfp->tfp_pid == -1 && strncmp(pname, tfp->tfp_pname, sizeof(tfp->tfp_pname)) == 0)) {
-                       sotc = tfp->tfp_class;
+               if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 &&
+                   strncmp(pname, tfp->tfp_pname,
+                   sizeof (tfp->tfp_pname)) == 0)) {
+                       *sotc = tfp->tfp_class;
                        break;
-               } 
+               }
        }
 
        lck_mtx_unlock(tclass_lock);
 
-       return sotc;
+       return ((tfp == NULL) ? 0 : 1);
 }
 
 /*
@@ -154,13 +157,13 @@ purge_tclass_for_proc(void)
 
        TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
                proc_t p;
-               
+
                if (tfp->tfp_pid == -1)
                        continue;
                if ((p = proc_find(tfp->tfp_pid)) == NULL) {
                        tfp_count--;
                        TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
-                                       
+
                        _FREE(tfp, M_TEMP);
                } else {
                        proc_rele(p);
@@ -168,8 +171,8 @@ purge_tclass_for_proc(void)
        }
 
        lck_mtx_unlock(tclass_lock);
-       
-       return error;
+
+       return (error);
 }
 
 /*
@@ -200,10 +203,10 @@ flush_tclass_for_proc(void)
        TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
                free_tclass_for_proc(tfp);
        }
-       
+
        lck_mtx_unlock(tclass_lock);
-               
-       return error;
+
+       return (error);
 
 }
 
@@ -211,40 +214,39 @@ flush_tclass_for_proc(void)
  * Must be called with tclass_lock held
  */
 static struct tclass_for_proc *
-alloc_tclass_for_proc(pid_t pid, const char *pname, int tclass)
+alloc_tclass_for_proc(pid_t pid, const char *pname)
 {
        struct tclass_for_proc *tfp;
-       
+
        if (pid == -1 && pname == NULL)
-               return NULL;
+               return (NULL);
 
-       tfp = _MALLOC(sizeof(struct tclass_for_proc), M_TEMP, M_NOWAIT | M_ZERO);
+       tfp = _MALLOC(sizeof (struct tclass_for_proc), M_TEMP, M_NOWAIT|M_ZERO);
        if (tfp == NULL)
-               return NULL;
-       
+               return (NULL);
+
        tfp->tfp_pid = pid;
-       tfp->tfp_class = tclass;
        /*
-        * Add per pid entries before per proc name so we can find 
+        * Add per pid entries before per proc name so we can find
         * a specific instance of a process before the general name base entry.
         */
        if (pid != -1) {
                TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link);
        } else {
-               strlcpy(tfp->tfp_pname, pname, sizeof(tfp->tfp_pname));
+               strlcpy(tfp->tfp_pname, pname, sizeof (tfp->tfp_pname));
                TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link);
        }
-       
+
        tfp_count++;
 
-       return tfp;
+       return (tfp);
 }
 
 /*
  * -1 for tclass means to remove the entry
  */
-int 
-set_pid_tclass(pid_t pid, int tclass)
+int
+set_pid_tclass(struct so_tcdbg *so_tcdbg)
 {
        int error = EINVAL;
        proc_t p = NULL;
@@ -252,205 +254,279 @@ set_pid_tclass(pid_t pid, int tclass)
        struct fileproc *fp;
        struct tclass_for_proc *tfp;
        int i;
+       pid_t pid = so_tcdbg->so_tcdbg_pid;
+       int tclass = so_tcdbg->so_tcdbg_tclass;
 
        p = proc_find(pid);
        if (p == NULL) {
-               printf("set_pid_tclass proc_find(%d) \n", pid);
+               printf("%s proc_find(%d) failed\n", __func__, pid);
                goto done;
        }
-       
+
        /* Need a tfp */
        lck_mtx_lock(tclass_lock);
-       
+
        tfp = find_tfp_by_pid(pid);
-       if (tclass == -1) {
-               if (tfp != NULL) {
-                       free_tclass_for_proc(tfp);
-                       error = 0;
-               }
-               lck_mtx_unlock(tclass_lock);
-               goto done;
-       } else {
+       if (tfp == NULL) {
+               tfp = alloc_tclass_for_proc(pid, NULL);
                if (tfp == NULL) {
-                       tfp = alloc_tclass_for_proc(pid, NULL, tclass);
-                       if (tfp == NULL) {
-                               lck_mtx_unlock(tclass_lock);
-                               error = ENOBUFS;
-                               goto done;
-                       }
-               } else {
-                       tfp->tfp_class = tclass;
+                       lck_mtx_unlock(tclass_lock);
+                       error = ENOBUFS;
+                       goto done;
                }
        }
+       tfp->tfp_class = tclass;
+
        lck_mtx_unlock(tclass_lock);
 
        if (tfp != NULL) {
                proc_fdlock(p);
-               
+
                fdp = p->p_fd;
                for (i = 0; i < fdp->fd_nfiles; i++) {
                        struct socket *so;
-                       
+
                        fp = fdp->fd_ofiles[i];
-                       if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
-                               fp->f_fglob->fg_type != DTYPE_SOCKET)
+                       if (fp == NULL ||
+                           (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
+                           fp->f_fglob->fg_type != DTYPE_SOCKET)
                                continue;
-                       
+
                        so = (struct socket *)fp->f_fglob->fg_data;
-                       if (so->so_proto->pr_domain->dom_family != AF_INET && 
-                               so->so_proto->pr_domain->dom_family != AF_INET6)
+                       if (so->so_proto->pr_domain->dom_family != AF_INET &&
+                           so->so_proto->pr_domain->dom_family != AF_INET6)
                                continue;
                        socket_lock(so, 1);
-                       error = so_set_traffic_class(so, tclass != -1 ? tclass : SO_TC_BE);
-                       socket_unlock(so, 1);
-                       if (error != 0) {
-                               printf("set_pid_tclass so_set_traffic_class(%p, %d) failed %d\n", so, tclass, error);
-                               error = 0;
+                       if (tclass != -1) {
+                               error = so_set_traffic_class(so, tclass);
+                               if (error != 0) {
+                                       printf("%s: so_set_traffic_class"
+                                           "(so=%p, fd=%d, tclass=%d) "
+                                           "failed %d\n", __func__,
+                                           so, i, tclass, error);
+                                       error = 0;
+                               }
                        }
+                       socket_unlock(so, 1);
                }
-               
+
                proc_fdunlock(p);
        }
-       
-       error = 0;      
+
+       error = 0;
 done:
        if (p != NULL)
                proc_rele(p);
-       
-       return error;
+
+       return (error);
 }
 
-int 
-set_pname_tclass(const char *pname, int tclass)
+int
+set_pname_tclass(struct so_tcdbg *so_tcdbg)
 {
        int error = EINVAL;
        struct tclass_for_proc *tfp;
 
        lck_mtx_lock(tclass_lock);
-       
-       tfp = find_tfp_by_pname(pname);
-       if (tclass == -1) {
-               if (tfp != NULL)
-                       free_tclass_for_proc(tfp);
-       } else {
+
+       tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
+       if (tfp == NULL) {
+               tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname);
                if (tfp == NULL) {
-                       tfp = alloc_tclass_for_proc(-1, pname, tclass);
-                       if (tfp == NULL) {
-                               lck_mtx_unlock(tclass_lock);
-                               error = ENOBUFS;
-                               goto done;
-                       }
-               } else {
-                       tfp->tfp_class = tclass;
+                       lck_mtx_unlock(tclass_lock);
+                       error = ENOBUFS;
+                       goto done;
                }
        }
+       tfp->tfp_class = so_tcdbg->so_tcdbg_tclass;
+
        lck_mtx_unlock(tclass_lock);
-       
-       error = 0;      
+
+       error = 0;
 done:
-       
-       return error;
+
+       return (error);
 }
 
-int 
-get_pid_tclass(pid_t pid, int *tclass)
+static int
+flush_pid_tclass(struct so_tcdbg *so_tcdbg)
+{
+       pid_t pid = so_tcdbg->so_tcdbg_pid;
+       int tclass = so_tcdbg->so_tcdbg_tclass;
+       struct filedesc *fdp;
+       int error = EINVAL;
+       proc_t p;
+       int i;
+
+       p = proc_find(pid);
+       if (p == PROC_NULL) {
+               printf("%s proc_find(%d) failed\n", __func__, pid);
+               goto done;
+       }
+
+       proc_fdlock(p);
+       fdp = p->p_fd;
+       for (i = 0; i < fdp->fd_nfiles; i++) {
+               struct socket *so;
+               struct fileproc *fp;
+
+               fp = fdp->fd_ofiles[i];
+               if (fp == NULL ||
+                   (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
+                   fp->f_fglob->fg_type != DTYPE_SOCKET)
+                       continue;
+
+               so = (struct socket *)fp->f_fglob->fg_data;
+               error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass,
+                   sizeof (tclass));
+               if (error != 0) {
+                       printf("%s: setsockopt(SO_FLUSH) (so=%p, fd=%d, "
+                           "tclass=%d) failed %d\n", __func__, so, i, tclass,
+                           error);
+                       error = 0;
+               }
+       }
+       proc_fdunlock(p);
+
+       error = 0;
+done:
+       if (p != PROC_NULL)
+               proc_rele(p);
+
+       return (error);
+}
+
+int
+get_pid_tclass(struct so_tcdbg *so_tcdbg)
 {
        int error = EINVAL;
        proc_t p = NULL;
        struct tclass_for_proc *tfp;
-       
-       *tclass = -1; /* Means not set */
+       pid_t pid = so_tcdbg->so_tcdbg_pid;
+
+       so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */
+       so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */
 
        p = proc_find(pid);
        if (p == NULL) {
-               printf("get_pid_tclass proc_find(%d) \n", pid);
+               printf("%s proc_find(%d) failed\n", __func__, pid);
                goto done;
        }
-       
+
        /* Need a tfp */
        lck_mtx_lock(tclass_lock);
-       
+
        tfp = find_tfp_by_pid(pid);
        if (tfp != NULL) {
-               *tclass = tfp->tfp_class ;
+               so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
                error = 0;
        }
        lck_mtx_unlock(tclass_lock);
 done:
        if (p != NULL)
                proc_rele(p);
-       
-       return error;
+
+       return (error);
 }
 
-int 
-get_pname_tclass(const char *pname, int *tclass)
+int
+get_pname_tclass(struct so_tcdbg *so_tcdbg)
 {
        int error = EINVAL;
        struct tclass_for_proc *tfp;
-       
-       *tclass = -1; /* Means not set */
+
+       so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */
+       so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */
 
        /* Need a tfp */
        lck_mtx_lock(tclass_lock);
-       
-       tfp = find_tfp_by_pname(pname);
+
+       tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
        if (tfp != NULL) {
-               *tclass = tfp->tfp_class ;
+               so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
                error = 0;
        }
        lck_mtx_unlock(tclass_lock);
-       
-       return error;
+
+       return (error);
 }
 
+static int
+delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg)
+{
+       int error = EINVAL;
+       pid_t pid = so_tcdbg->so_tcdbg_pid;
+       struct tclass_for_proc *tfp = NULL;
+
+       lck_mtx_lock(tclass_lock);
 
+       if (pid != -1)
+               tfp = find_tfp_by_pid(pid);
+       else
+               tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
+
+       if (tfp != NULL) {
+               free_tclass_for_proc(tfp);
+               error = 0;
+       }
+
+       lck_mtx_unlock(tclass_lock);
+
+       return (error);
+}
 
 /*
  * Setting options requires privileges
  */
-__private_extern__ int 
+__private_extern__ int
 so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg)
 {
        int error = 0;
-       
+
        if ((so->so_state & SS_PRIV) == 0)
-               return EPERM;
+               return (EPERM);
 
        socket_unlock(so, 0);
 
        switch (so_tcdbg->so_tcdbg_cmd) {
                case SO_TCDBG_PID:
-                       error = set_pid_tclass(so_tcdbg->so_tcdbg_pid, so_tcdbg->so_tcdbg_tclass);
+                       error = set_pid_tclass(so_tcdbg);
                        break;
-               
+
                case SO_TCDBG_PNAME:
-                       error = set_pname_tclass(so_tcdbg->so_tcdbg_pname, so_tcdbg->so_tcdbg_tclass);
+                       error = set_pname_tclass(so_tcdbg);
                        break;
-               
+
                case SO_TCDBG_PURGE:
                        error = purge_tclass_for_proc();
                        break;
-               
+
                case SO_TCDBG_FLUSH:
                        error = flush_tclass_for_proc();
                        break;
-               
+
+               case SO_TCDBG_DELETE:
+                       error = delete_tclass_for_pid_pname(so_tcdbg);
+                       break;
+
+               case SO_TCDBG_TCFLUSH_PID:
+                       error = flush_pid_tclass(so_tcdbg);
+                       break;
+
                default:
                        error = EINVAL;
                        break;
-               
        }
 
        socket_lock(so, 0);
 
-       return error;
+       return (error);
 }
 
 /*
  * Not required to be privileged to get
  */
-__private_extern__ int 
+__private_extern__ int
 sogetopt_tcdbg(struct socket *so, struct sockopt *sopt)
 {
        int error = 0;
@@ -458,23 +534,24 @@ sogetopt_tcdbg(struct socket *so, struct sockopt *sopt)
        void *buf = NULL;
        size_t len = sopt->sopt_valsize;
 
-       error = sooptcopyin(sopt, &so_tcdbg, sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
+       error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg),
+           sizeof (struct so_tcdbg));
        if (error != 0)
-               return error;
-       
+               return (error);
+
        sopt->sopt_valsize = len;
-       
+
        socket_unlock(so, 0);
 
        switch (so_tcdbg.so_tcdbg_cmd) {
                case SO_TCDBG_PID:
-                       error = get_pid_tclass(so_tcdbg.so_tcdbg_pid, &so_tcdbg.so_tcdbg_tclass);
+                       error = get_pid_tclass(&so_tcdbg);
                        break;
-               
+
                case SO_TCDBG_PNAME:
-                       error = get_pname_tclass(so_tcdbg.so_tcdbg_pname, &so_tcdbg.so_tcdbg_tclass);
+                       error = get_pname_tclass(&so_tcdbg);
                        break;
-               
+
                case SO_TCDBG_COUNT:
                        lck_mtx_lock(tclass_lock);
                        so_tcdbg.so_tcdbg_count = tfp_count;
@@ -492,7 +569,7 @@ sogetopt_tcdbg(struct socket *so, struct sockopt *sopt)
                                error = EINVAL;
                                break;
                        }
-                       len = alloc_count * sizeof(struct so_tcdbg);
+                       len = alloc_count * sizeof (struct so_tcdbg);
                        lck_mtx_unlock(tclass_lock);
 
                        buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
@@ -513,33 +590,35 @@ sogetopt_tcdbg(struct socket *so, struct sockopt *sopt)
                                } else {
                                        ptr->so_tcdbg_cmd = SO_TCDBG_PNAME;
                                        ptr->so_tcdbg_pid = -1;
-                                       strlcpy(ptr->so_tcdbg_pname, tfp->tfp_pname, sizeof(ptr->so_tcdbg_pname));
+                                       strlcpy(ptr->so_tcdbg_pname,
+                                           tfp->tfp_pname,
+                                           sizeof (ptr->so_tcdbg_pname));
                                }
                                ptr->so_tcdbg_tclass = tfp->tfp_class;
                                ptr++;
                        }
-                       
+
                        lck_mtx_unlock(tclass_lock);
                        }
                        break;
-               
+
                default:
                        error = EINVAL;
                        break;
-               
        }
 
        socket_lock(so, 0);
 
        if (error == 0) {
                if (buf == NULL) {
-                       error = sooptcopyout(sopt, &so_tcdbg, sizeof(struct so_tcdbg));
+                       error = sooptcopyout(sopt, &so_tcdbg,
+                           sizeof (struct so_tcdbg));
                } else {
                        error = sooptcopyout(sopt, buf, len);
                        _FREE(buf, M_TEMP);
                }
        }
-       return error;
+       return (error);
 }
 
 
@@ -547,78 +626,121 @@ __private_extern__ int
 so_set_traffic_class(struct socket *so, int optval)
 {
        int error = 0;
-       
-       if (optval < SO_TC_BE || optval > SO_TC_VO) {
+
+       if (optval < SO_TC_BE || optval > SO_TC_CTL) {
                error = EINVAL;
        } else {
-               so->so_traffic_class = optval;
-       
-               if ((INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) && 
-                       INP_SOCKTYPE(so) == SOCK_STREAM) {
-                       set_tcp_stream_priority(so);
+               switch (optval) {
+               case _SO_TC_BK:
+                       optval = SO_TC_BK;
+                       break;
+               case _SO_TC_VI:
+                       optval = SO_TC_VI;
+                       break;
+               case _SO_TC_VO:
+                       optval = SO_TC_VO;
+                       break;
+               default:
+                       if (!SO_VALID_TC(optval))
+                               error = EINVAL;
+                       break;
+               }
+
+               if (error == 0) {
+                       int oldval = so->so_traffic_class;
+
+                       VERIFY(SO_VALID_TC(optval));
+                       so->so_traffic_class = optval;
+
+                       if ((INP_SOCKAF(so) == AF_INET ||
+                           INP_SOCKAF(so) == AF_INET6) &&
+                           INP_SOCKTYPE(so) == SOCK_STREAM) {
+                               set_tcp_stream_priority(so);
+
+                               /* Set/unset use of Large Receive Offload */
+                               so_set_lro(so, optval);
+                       }
+
+                       if ((INP_SOCKAF(so) == AF_INET ||
+                           INP_SOCKAF(so) == AF_INET6) &&
+                           optval != oldval && (optval == SO_TC_BK_SYS ||
+                           oldval == SO_TC_BK_SYS)) {
+                               /*
+                                * If the app switches from BK_SYS to something
+                                * else, resume the socket if it was suspended.
+                                */
+                               if (oldval == SO_TC_BK_SYS)
+                                       inp_reset_fc_state(so->so_pcb);
+
+                               SOTHROTTLELOG(("throttle[%d]: so %p [%d,%d] "
+                                   "opportunistic %s\n", so->last_pid,
+                                   so, INP_SOCKAF(so), INP_SOCKTYPE(so),
+                                   (optval == SO_TC_BK_SYS) ? "ON" : "OFF"));
+                       }
                }
        }
-       return error;
+       return (error);
 }
 
 __private_extern__ void
 so_set_default_traffic_class(struct socket *so)
 {
-       int sotc = SO_TC_BE;
+       int sotc = -1;
 
-       if (tfp_count > 0 && (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6)) {
-               sotc = get_tclass_for_curr_proc();
+       if (tfp_count > 0 &&
+           (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6)) {
+               get_tclass_for_curr_proc(&sotc);
        }
-       
-       so->so_traffic_class = sotc;
-       
-       return;
+
+       so->so_traffic_class = (sotc != -1) ? sotc : SO_TC_BE;
 }
 
+__private_extern__ int
+so_set_opportunistic(struct socket *so, int optval)
+{
+       return (so_set_traffic_class(so, (optval == 0) ?
+           SO_TC_BE : SO_TC_BK_SYS));
+}
 
 __private_extern__ int
-mbuf_traffic_class_from_control(struct mbuf *control)
+so_get_opportunistic(struct socket *so)
+{
+       return (so->so_traffic_class == SO_TC_BK_SYS);
+}
+
+__private_extern__ mbuf_svc_class_t
+mbuf_service_class_from_control(struct mbuf *control)
 {
        struct cmsghdr *cm;
-       
-       for (cm = M_FIRST_CMSGHDR(control); 
-                cm != NULL; 
-                cm = M_NXT_CMSGHDR(control, cm)) {
+       mbuf_svc_class_t msc = MBUF_SC_UNSPEC;
+
+       for (cm = M_FIRST_CMSGHDR(control); cm != NULL;
+           cm = M_NXT_CMSGHDR(control, cm)) {
                int tc;
 
-               if (cm->cmsg_len < sizeof(struct cmsghdr))
+               if (cm->cmsg_len < sizeof (struct cmsghdr))
                        break;
-               
+
                if (cm->cmsg_level != SOL_SOCKET ||
-                       cm->cmsg_type != SO_TRAFFIC_CLASS)
+                   cm->cmsg_type != SO_TRAFFIC_CLASS)
                        continue;
-               if (cm->cmsg_len != CMSG_LEN(sizeof(int)))
+               if (cm->cmsg_len != CMSG_LEN(sizeof (int)))
                        continue;
-               
-               tc = *(int *)CMSG_DATA(cm);
-               
-               switch (tc) {
-                       case SO_TC_BE:
-                               return MBUF_TC_BE;
-                       case SO_TC_BK:
-                               return MBUF_TC_BK;
-                       case SO_TC_VI:
-                               return MBUF_TC_VI;
-                       case SO_TC_VO:
-                               return MBUF_TC_VO;
-                       default:
-                               break;
-               }
+
+               tc = *(int *)(void *)CMSG_DATA(cm);
+               msc = so_tc2msc(tc);
+               if (MBUF_VALID_SC(msc))
+                       break;
        }
-       
-       return MBUF_TC_UNSPEC;
+
+       return (msc);
 }
 
 __private_extern__  int
-dscp_code_from_mbuf_tclass(int mtc)
+dscp_code_from_mbuf_tclass(mbuf_traffic_class_t mtc)
 {
        int dscp_code;
-       
+
        switch (mtc) {
                default:
                case MBUF_TC_BE:
@@ -634,56 +756,65 @@ dscp_code_from_mbuf_tclass(int mtc)
                        dscp_code = 0x30;
                        break;
        }
-       
-       return dscp_code;
+
+       return (dscp_code);
 }
 
 __private_extern__ void
 so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off)
 {
-       uint32_t sotc = m->m_pkthdr.prio;
+       uint32_t sotc = m_get_traffic_class(m);
 
        if (sotc >= SO_TC_STATS_MAX)
                sotc = SO_TC_BE;
-       
-       so->so_tc_stats[sotc].rxpackets += 1;
-       so->so_tc_stats[sotc].rxbytes += ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off;
 
-       return;
+       so->so_tc_stats[sotc].rxpackets += 1;
+       so->so_tc_stats[sotc].rxbytes +=
+           ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off;
 }
 
 __private_extern__ void
 set_tcp_stream_priority(struct socket *so)
 {
        struct tcpcb *tp = intotcpcb(sotoinpcb(so));
+       int old_cc = tp->tcp_cc_index;
+       int recvbg = IS_TCP_RECV_BG(so);
 
-       /* If the socket was marked as a background socket or if the
-        * traffic class is set to background with traffic class socket 
-        * option then make both send and recv side of the stream to be 
-        * background. The variable sotcdb which can be set with sysctl 
+       /*
+        * If the socket was marked as a background socket or if the
+        * traffic class is set to background with traffic class socket
+        * option then make both send and recv side of the stream to be
+        * background. The variable sotcdb which can be set with sysctl
         * is used to disable these settings for testing.
         */
-       if (soisbackground(so) || so->so_traffic_class == SO_TC_BK) {
+       if (soisthrottled(so) || IS_SO_TC_BACKGROUND(so->so_traffic_class)) {
                if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0) {
-                       if (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX)
+                       if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
                                tcp_set_foreground_cc(so);
                } else {
-                       if (tp->tcp_cc_index != TCP_CC_ALGO_BACKGROUND_INDEX)
+                       if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX)
                                tcp_set_background_cc(so);
                }
-               
+
                /* Set receive side background flags */
-               if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0) {
-                       so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG);
-               } else {
-                       so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG;
-               }
+               if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0)
+                       tcp_clear_recv_bg(so);
+               else
+                       tcp_set_recv_bg(so);
        } else {
-               so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG);
-               if (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX)
+               tcp_clear_recv_bg(so);
+               if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
                        tcp_set_foreground_cc(so);
        }
-       return;
+
+       if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) {
+               SOTHROTTLELOG(("throttle[%d]: so %p [%d,%d] TCP %s send; "
+                  "%s recv\n", so->last_pid, so, INP_SOCKAF(so),
+                  INP_SOCKTYPE(so),
+                  (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ?
+                  "background" : "foreground",
+                  IS_TCP_RECV_BG(so) ? "background" : "foreground"));
+       }
 }
 
 /*
@@ -692,119 +823,126 @@ set_tcp_stream_priority(struct socket *so)
  * - set the DSCP code following the WMM mapping
  */
 __private_extern__ void
-set_packet_tclass(struct mbuf *m, struct socket *so, int in_mtc, int isipv6)
+set_packet_service_class(struct mbuf *m, struct socket *so,
+    mbuf_svc_class_t in_msc, u_int32_t flags)
 {
-       int mtc = MBUF_TC_BE; /* Best effort by default */
-       struct inpcb *inp = sotoinpcb(so);       /* in6pcb and inpcb are the same */
+       mbuf_svc_class_t msc = MBUF_SC_BE;         /* Best effort by default */
+       struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */
        struct ip *ip = mtod(m, struct ip *);
 #if INET6
        struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
 #endif /* INET6 */
-       
+       int isipv6 = ((flags & PKT_SCF_IPV6) != 0) ? 1 : 0; 
+
        if (!(m->m_flags & M_PKTHDR))
                return;
-       
-       /* 
+
+       /*
         * Here is the precedence:
         * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all
         * 2) Traffic class passed via ancillary data to sendmsdg(2)
         * 3) Traffic class socket option last
         */
-       if (soisbackground(so)) {
-               mtc = MBUF_TC_BK;
-       } else if (in_mtc != MBUF_TC_UNSPEC) {
-               if (in_mtc >= MBUF_TC_BE && in_mtc <= MBUF_TC_VO)
-                       mtc = in_mtc;
+       if (in_msc != MBUF_SC_UNSPEC) {
+               if (in_msc >= MBUF_SC_BE && in_msc <= MBUF_SC_CTL)
+                       msc = in_msc;
        } else {
-               switch (so->so_traffic_class) {
-                       case SO_TC_BE:
-                               mtc = MBUF_TC_BE;
-                               break;
-                       case SO_TC_BK:
-                               mtc = MBUF_TC_BK;
-                               break;
-                       case SO_TC_VI:
-                               mtc = MBUF_TC_VI;
-                               break;
-                       case SO_TC_VO:
-                               mtc = MBUF_TC_VO;
-                               break;
-                       default:
-                               break;
-               }
+               VERIFY(SO_VALID_TC(so->so_traffic_class));
+               msc = so_tc2msc(so->so_traffic_class);
+               /* Assert because tc must have been valid */
+               VERIFY(MBUF_VALID_SC(msc));
        }
-       
+
+       /*
+        * If TRAFFIC_MGT_SO_BACKGROUND is set, depress the priority.
+        */
+       if (soisthrottled(so) && !IS_MBUF_SC_BACKGROUND(msc))
+               msc = MBUF_SC_BK;
+
        /*
-        * Set the traffic class in the mbuf packet header prio field
+        * Set the traffic class in the mbuf packet header svc field
         */
-       if ((sotcdb & SOTCDB_NO_MTC))
+       if (sotcdb & SOTCDB_NO_MTC)
                goto no_mbtc;
-       m->m_pkthdr.prio = mtc;
-       
+
+       /*
+        * Elevate service class if the packet is a pure TCP ACK.  We can
+        * do this only when the flow is not a background flow and the
+        * outgoing interface supports the transmit-start model.
+        */
+       if (!IS_MBUF_SC_BACKGROUND(msc) && (flags & PKT_SCF_TCP_ACK))
+               msc = MBUF_SC_CTL;
+
+       (void) m_set_service_class(m, msc);
+
+       /*
+        * Set the privileged traffic auxiliary flag if applicable, or clear it.
+        */
+       if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) &&
+           msc != MBUF_SC_UNSPEC)
+               m->m_pkthdr.aux_flags |= MAUXF_PRIO_PRIVILEGED;
+       else
+               m->m_pkthdr.aux_flags &= ~MAUXF_PRIO_PRIVILEGED;
+
 no_mbtc:
        /*
-         * Quick exit when best effort
+        * Quick exit when best effort
         */
-       if (mtc == MBUF_TC_BE)
+       if (msc == MBUF_SC_BE)
                goto no_dscp;
+
        /*
-        * Now let set the DSCP code in IPv4 or IPv6 header
-        * By default do this only for local traffic if a code is not already set
+        * The default behavior is for the networking stack to not set the
+        * DSCP code, based on SOTCDB_NO_DSCP being set.  If the flag is
+        * cleared, set the DSCP code in IPv4 or IPv6 header only for local
+        * traffic, if it is not already set.  <rdar://problem/11277343>
         */
-       if ((sotcdb & SOTCDB_NO_DSCP))
+       if (sotcdb & SOTCDB_NO_DSCP)
                goto no_dscp;
-               
+
        /*
-        * Test if a IP TOS or IPV6 TCLASS has already been set on the socket or the raw packet
+        * Test if a IP TOS or IPV6 TCLASS has already been set
+        * on the socket or the raw packet.
         */
-       if ((sotcdb & SOTCDB_NO_DSCPTST) == 0) {
+       if (!(sotcdb & SOTCDB_NO_DSCPTST)) {
 #if INET6
-               if (isipv6) 
-               {
-                       if ((so->so_type == SOCK_RAW && (ip6->ip6_flow & htonl(0xff << 20)) != 0) ||
-                           (inp->in6p_outputopts && inp->in6p_outputopts->ip6po_tclass != -1))
+               if (isipv6) {
+                       if ((so->so_type == SOCK_RAW &&
+                           (ip6->ip6_flow & htonl(0xff << 20)) != 0) ||
+                           (inp->in6p_outputopts &&
+                           inp->in6p_outputopts->ip6po_tclass != -1))
                                goto no_dscp;
-               } 
-               else 
+               } else
 #endif /* INET6 */
-               {
-                       if ((so->so_type == SOCK_RAW && (inp->inp_flags & INP_HDRINCL)) ||
-                               inp->inp_ip_tos != 0)
-                               goto no_dscp;
-               }
+               if ((so->so_type == SOCK_RAW &&
+                   (inp->inp_flags & INP_HDRINCL)) ||
+                   inp->inp_ip_tos != 0)
+                       goto no_dscp;
        }
-       
+
        /*
         * Test if destination is local
         */
-       if ((sotcdb & SOTCDB_NO_LCLTST) == 0) {
+       if (!(sotcdb & SOTCDB_NO_LCLTST)) {
                int islocal = 0;
-               struct route *ro = &inp->inp_route;
+               struct rtentry *rt = inp->inp_route.ro_rt;
 
                if (so->so_type == SOCK_STREAM) {
-                       struct tcpcb *tp = intotcpcb(inp);
-                       
-                       if ((tp->t_flags & TF_LOCAL))
+                       if (intotcpcb(inp)->t_flags & TF_LOCAL)
                                islocal = 1;
-               }
-               else
-#if INET6
-               if (isipv6) 
-               {
-                       if ((ro != NULL && ro->ro_rt != NULL &&
-                                (ro->ro_rt->rt_gateway->sa_family == AF_LINK ||
-                                 (ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))) ||
-                                in6addr_local(&ip6->ip6_dst))
+               } else if (rt != NULL &&
+                   (rt->rt_gateway->sa_family == AF_LINK ||
+                   (rt->rt_ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)))) {
+                       if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT))
                                islocal = 1;
-               } 
-               else
+               } else
+#if INET6
+               if (isipv6 && in6addr_local(&ip6->ip6_dst)) {
+                       islocal = 1;
+               } else
 #endif /* INET6 */
-               {
-                       if ((ro != NULL && ro->ro_rt != NULL && 
-                                (ro->ro_rt->rt_gateway->sa_family == AF_LINK ||
-                                 (ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))) ||
-                                inaddr_local(ip->ip_dst))
-                               islocal = 1;
+               if (inaddr_local(ip->ip_dst)) {
+                       islocal = 1;
                }
                if (islocal == 0)
                        goto no_dscp;
@@ -812,28 +950,38 @@ no_mbtc:
 
 #if INET6
        if (isipv6)
-               ip6->ip6_flow |=
-                       htonl(dscp_code_from_mbuf_tclass(m->m_pkthdr.prio) << 20);
+               ip6->ip6_flow |= htonl(dscp_code_from_mbuf_tclass(
+                   m_get_traffic_class(m)) << 20);
        else
 #endif /* INET6 */
-               ip->ip_tos |= dscp_code_from_mbuf_tclass(m->m_pkthdr.prio) << 2;
-       
+               ip->ip_tos |= dscp_code_from_mbuf_tclass(
+                   m_get_traffic_class(m)) << 2;
+
 no_dscp:
        /*
         * For TCP with background traffic class switch CC algo based on sysctl
         */
-       if (so->so_type == SOCK_STREAM) {
+       if (so->so_type == SOCK_STREAM)
                set_tcp_stream_priority(so);
-       }
-       
+
+       so_tc_update_stats(m, so, msc);
+}
+
+__private_extern__ void
+so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc)
+{
+       mbuf_traffic_class_t mtc;
+
        /*
         * Assume socket and mbuf traffic class values are the same
-        * Also assume the socket lock is held
+        * Also assume the socket lock is held.  Note that the stats
+        * at the socket layer are reduced down to the legacy traffic
+        * classes; we could/should potentially expand so_tc_stats[].
         */
+       mtc = MBUF_SC2TC(msc);
+       VERIFY(mtc < SO_TC_STATS_MAX);
        so->so_tc_stats[mtc].txpackets += 1;
        so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len;
-       
-       return;
 }
 
 __private_extern__ void
@@ -842,9 +990,100 @@ socket_tclass_init(void)
        tclass_lck_grp_attr = lck_grp_attr_alloc_init();
        tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr);
        tclass_lck_attr = lck_attr_alloc_init();
-       if ((tclass_lock = lck_mtx_alloc_init(tclass_lck_grp, tclass_lck_attr)) == NULL) {
-                       panic("failed to allocate memory for tclass\n");
+       lck_mtx_init(tclass_lock, tclass_lck_grp, tclass_lck_attr);
+}
+
+__private_extern__ mbuf_svc_class_t
+so_tc2msc(int tc)
+{
+       mbuf_svc_class_t msc;
+
+       switch (tc) {
+       case SO_TC_BK_SYS:
+               msc = MBUF_SC_BK_SYS;
+               break;
+       case SO_TC_BK:
+       case _SO_TC_BK:
+               msc = MBUF_SC_BK;
+               break;
+       case SO_TC_BE:
+               msc = MBUF_SC_BE;
+               break;
+       case SO_TC_RD:
+               msc = MBUF_SC_RD;
+               break;
+       case SO_TC_OAM:
+               msc = MBUF_SC_OAM;
+               break;
+       case SO_TC_AV:
+               msc = MBUF_SC_AV;
+               break;
+       case SO_TC_RV:
+               msc = MBUF_SC_RV;
+               break;
+       case SO_TC_VI:
+       case _SO_TC_VI:
+               msc = MBUF_SC_VI;
+               break;
+       case SO_TC_VO:
+       case _SO_TC_VO:
+               msc = MBUF_SC_VO;
+               break;
+       case SO_TC_CTL:
+               msc = MBUF_SC_CTL;
+               break;
+       case SO_TC_ALL:
+       default:
+               msc = MBUF_SC_UNSPEC;
+               break;
        }
+
+       return (msc);
 }
 
+__private_extern__ int
+so_svc2tc(mbuf_svc_class_t svc)
+{
+       switch (svc) {
+       case MBUF_SC_UNSPEC:
+               return SO_TC_BE;
+       case MBUF_SC_BK_SYS:
+               return SO_TC_BK_SYS;
+       case MBUF_SC_BK:
+               return SO_TC_BK;
+       case MBUF_SC_BE:
+               return SO_TC_BE;
+       case MBUF_SC_RD:
+               return SO_TC_RD;
+       case MBUF_SC_OAM:
+               return SO_TC_OAM;
+       case MBUF_SC_AV:
+               return SO_TC_AV;
+       case MBUF_SC_RV:
+               return SO_TC_RV;
+       case MBUF_SC_VI:
+               return SO_TC_VI;
+       case MBUF_SC_VO:
+               return SO_TC_VO;
+       case MBUF_SC_CTL:
+               return SO_TC_CTL;
+       default:
+               return SO_TC_BE;
+       }
+}
+
+/*
+ * LRO is turned on for AV streaming and background classes.
+ */
+static void
+so_set_lro(struct socket *so, int optval)
+{
+       if ((optval == SO_TC_BK) ||
+            (optval == SO_TC_BK_SYS) ||
+           (optval == SO_TC_AV)) {
+               so->so_flags |= SOF_USELRO;
+       } else {
+               so->so_flags &= ~SOF_USELRO;
+       }
+}