X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/13f56ec4e58bf8687e2a68032c093c0213dd519b..316670eb35587141e969394ae8537d66b9211e80:/bsd/netinet/in_tclass.c diff --git a/bsd/netinet/in_tclass.c b/bsd/netinet/in_tclass.c index 54b5fcc1d..02d9ccc86 100644 --- a/bsd/netinet/in_tclass.c +++ b/bsd/netinet/in_tclass.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2009-2011 Apple Inc. All rights reserved. + * Copyright (c) 2009-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * @@ -60,32 +60,32 @@ extern char *proc_name_address(void *p); static int tfp_count = 0; -static TAILQ_HEAD(, tclass_for_proc) tfp_head = TAILQ_HEAD_INITIALIZER(tfp_head); +static TAILQ_HEAD(, tclass_for_proc) tfp_head = + TAILQ_HEAD_INITIALIZER(tfp_head); struct tclass_for_proc { TAILQ_ENTRY(tclass_for_proc) tfp_link; - int tfp_class; - pid_t tfp_pid; - char tfp_pname[MAXCOMLEN + 1]; + int tfp_class; + pid_t tfp_pid; + char tfp_pname[MAXCOMLEN + 1]; }; -extern void tcp_set_background_cc(struct socket *); -extern void tcp_set_foreground_cc(struct socket *); - -int dscp_code_from_mbuf_tclass(int ); - -static int get_pid_tclass(pid_t , int *); -static int get_pname_tclass(const char * , int *); -static int set_pid_tclass(pid_t , int ); -static int set_pname_tclass(const char * , int ); +static int dscp_code_from_mbuf_tclass(mbuf_traffic_class_t); +static int get_pid_tclass(struct so_tcdbg *); +static int get_pname_tclass(struct so_tcdbg *); +static int set_pid_tclass(struct so_tcdbg *); +static int set_pname_tclass(struct so_tcdbg *); +static int flush_pid_tclass(struct so_tcdbg *); static int purge_tclass_for_proc(void); static int flush_tclass_for_proc(void); +static void so_set_lro(struct socket*, int); +int get_tclass_for_curr_proc(int *); - -static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */ -static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */ -static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */ -static lck_mtx_t *tclass_lock = NULL; +static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */ +static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */ +static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */ +decl_lck_mtx_data(static, tclass_lock_data); +static lck_mtx_t *tclass_lock = &tclass_lock_data; /* * Must be called with tclass_lock held @@ -94,12 +94,12 @@ static struct tclass_for_proc * find_tfp_by_pid(pid_t pid) { struct tclass_for_proc *tfp; - + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { if (tfp->tfp_pid == pid) break; } - return tfp; + return (tfp); } /* @@ -109,36 +109,39 @@ static struct tclass_for_proc * find_tfp_by_pname(const char *pname) { struct tclass_for_proc *tfp; - + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { - if (strncmp(pname, tfp->tfp_pname, sizeof(tfp->tfp_pname)) == 0) + if (strncmp(pname, tfp->tfp_pname, + sizeof (tfp->tfp_pname)) == 0) break; } - return tfp; + return (tfp); } -static int -get_tclass_for_curr_proc(void) +__private_extern__ int +get_tclass_for_curr_proc(int *sotc) { - struct tclass_for_proc *tfp; - int sotc = SO_TC_BE; + struct tclass_for_proc *tfp = NULL; proc_t p = current_proc(); /* Not ref counted */ pid_t pid = proc_pid(p); char *pname = proc_name_address(p); - + + *sotc = -1; + lck_mtx_lock(tclass_lock); - + TAILQ_FOREACH(tfp, &tfp_head, tfp_link) { - if ((tfp->tfp_pid == pid) || - (tfp->tfp_pid == -1 && strncmp(pname, tfp->tfp_pname, sizeof(tfp->tfp_pname)) == 0)) { - sotc = tfp->tfp_class; + if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 && + strncmp(pname, tfp->tfp_pname, + sizeof (tfp->tfp_pname)) == 0)) { + *sotc = tfp->tfp_class; break; - } + } } lck_mtx_unlock(tclass_lock); - return sotc; + return ((tfp == NULL) ? 0 : 1); } /* @@ -154,13 +157,13 @@ purge_tclass_for_proc(void) TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { proc_t p; - + if (tfp->tfp_pid == -1) continue; if ((p = proc_find(tfp->tfp_pid)) == NULL) { tfp_count--; TAILQ_REMOVE(&tfp_head, tfp, tfp_link); - + _FREE(tfp, M_TEMP); } else { proc_rele(p); @@ -168,8 +171,8 @@ purge_tclass_for_proc(void) } lck_mtx_unlock(tclass_lock); - - return error; + + return (error); } /* @@ -200,10 +203,10 @@ flush_tclass_for_proc(void) TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) { free_tclass_for_proc(tfp); } - + lck_mtx_unlock(tclass_lock); - - return error; + + return (error); } @@ -211,40 +214,39 @@ flush_tclass_for_proc(void) * Must be called with tclass_lock held */ static struct tclass_for_proc * -alloc_tclass_for_proc(pid_t pid, const char *pname, int tclass) +alloc_tclass_for_proc(pid_t pid, const char *pname) { struct tclass_for_proc *tfp; - + if (pid == -1 && pname == NULL) - return NULL; + return (NULL); - tfp = _MALLOC(sizeof(struct tclass_for_proc), M_TEMP, M_NOWAIT | M_ZERO); + tfp = _MALLOC(sizeof (struct tclass_for_proc), M_TEMP, M_NOWAIT|M_ZERO); if (tfp == NULL) - return NULL; - + return (NULL); + tfp->tfp_pid = pid; - tfp->tfp_class = tclass; /* - * Add per pid entries before per proc name so we can find + * Add per pid entries before per proc name so we can find * a specific instance of a process before the general name base entry. */ if (pid != -1) { TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link); } else { - strlcpy(tfp->tfp_pname, pname, sizeof(tfp->tfp_pname)); + strlcpy(tfp->tfp_pname, pname, sizeof (tfp->tfp_pname)); TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link); } - + tfp_count++; - return tfp; + return (tfp); } /* * -1 for tclass means to remove the entry */ -int -set_pid_tclass(pid_t pid, int tclass) +int +set_pid_tclass(struct so_tcdbg *so_tcdbg) { int error = EINVAL; proc_t p = NULL; @@ -252,205 +254,279 @@ set_pid_tclass(pid_t pid, int tclass) struct fileproc *fp; struct tclass_for_proc *tfp; int i; + pid_t pid = so_tcdbg->so_tcdbg_pid; + int tclass = so_tcdbg->so_tcdbg_tclass; p = proc_find(pid); if (p == NULL) { - printf("set_pid_tclass proc_find(%d) \n", pid); + printf("%s proc_find(%d) failed\n", __func__, pid); goto done; } - + /* Need a tfp */ lck_mtx_lock(tclass_lock); - + tfp = find_tfp_by_pid(pid); - if (tclass == -1) { - if (tfp != NULL) { - free_tclass_for_proc(tfp); - error = 0; - } - lck_mtx_unlock(tclass_lock); - goto done; - } else { + if (tfp == NULL) { + tfp = alloc_tclass_for_proc(pid, NULL); if (tfp == NULL) { - tfp = alloc_tclass_for_proc(pid, NULL, tclass); - if (tfp == NULL) { - lck_mtx_unlock(tclass_lock); - error = ENOBUFS; - goto done; - } - } else { - tfp->tfp_class = tclass; + lck_mtx_unlock(tclass_lock); + error = ENOBUFS; + goto done; } } + tfp->tfp_class = tclass; + lck_mtx_unlock(tclass_lock); if (tfp != NULL) { proc_fdlock(p); - + fdp = p->p_fd; for (i = 0; i < fdp->fd_nfiles; i++) { struct socket *so; - + fp = fdp->fd_ofiles[i]; - if (fp == NULL || (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || - fp->f_fglob->fg_type != DTYPE_SOCKET) + if (fp == NULL || + (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || + fp->f_fglob->fg_type != DTYPE_SOCKET) continue; - + so = (struct socket *)fp->f_fglob->fg_data; - if (so->so_proto->pr_domain->dom_family != AF_INET && - so->so_proto->pr_domain->dom_family != AF_INET6) + if (so->so_proto->pr_domain->dom_family != AF_INET && + so->so_proto->pr_domain->dom_family != AF_INET6) continue; socket_lock(so, 1); - error = so_set_traffic_class(so, tclass != -1 ? tclass : SO_TC_BE); - socket_unlock(so, 1); - if (error != 0) { - printf("set_pid_tclass so_set_traffic_class(%p, %d) failed %d\n", so, tclass, error); - error = 0; + if (tclass != -1) { + error = so_set_traffic_class(so, tclass); + if (error != 0) { + printf("%s: so_set_traffic_class" + "(so=%p, fd=%d, tclass=%d) " + "failed %d\n", __func__, + so, i, tclass, error); + error = 0; + } } + socket_unlock(so, 1); } - + proc_fdunlock(p); } - - error = 0; + + error = 0; done: if (p != NULL) proc_rele(p); - - return error; + + return (error); } -int -set_pname_tclass(const char *pname, int tclass) +int +set_pname_tclass(struct so_tcdbg *so_tcdbg) { int error = EINVAL; struct tclass_for_proc *tfp; lck_mtx_lock(tclass_lock); - - tfp = find_tfp_by_pname(pname); - if (tclass == -1) { - if (tfp != NULL) - free_tclass_for_proc(tfp); - } else { + + tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); + if (tfp == NULL) { + tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname); if (tfp == NULL) { - tfp = alloc_tclass_for_proc(-1, pname, tclass); - if (tfp == NULL) { - lck_mtx_unlock(tclass_lock); - error = ENOBUFS; - goto done; - } - } else { - tfp->tfp_class = tclass; + lck_mtx_unlock(tclass_lock); + error = ENOBUFS; + goto done; } } + tfp->tfp_class = so_tcdbg->so_tcdbg_tclass; + lck_mtx_unlock(tclass_lock); - - error = 0; + + error = 0; done: - - return error; + + return (error); } -int -get_pid_tclass(pid_t pid, int *tclass) +static int +flush_pid_tclass(struct so_tcdbg *so_tcdbg) +{ + pid_t pid = so_tcdbg->so_tcdbg_pid; + int tclass = so_tcdbg->so_tcdbg_tclass; + struct filedesc *fdp; + int error = EINVAL; + proc_t p; + int i; + + p = proc_find(pid); + if (p == PROC_NULL) { + printf("%s proc_find(%d) failed\n", __func__, pid); + goto done; + } + + proc_fdlock(p); + fdp = p->p_fd; + for (i = 0; i < fdp->fd_nfiles; i++) { + struct socket *so; + struct fileproc *fp; + + fp = fdp->fd_ofiles[i]; + if (fp == NULL || + (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 || + fp->f_fglob->fg_type != DTYPE_SOCKET) + continue; + + so = (struct socket *)fp->f_fglob->fg_data; + error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass, + sizeof (tclass)); + if (error != 0) { + printf("%s: setsockopt(SO_FLUSH) (so=%p, fd=%d, " + "tclass=%d) failed %d\n", __func__, so, i, tclass, + error); + error = 0; + } + } + proc_fdunlock(p); + + error = 0; +done: + if (p != PROC_NULL) + proc_rele(p); + + return (error); +} + +int +get_pid_tclass(struct so_tcdbg *so_tcdbg) { int error = EINVAL; proc_t p = NULL; struct tclass_for_proc *tfp; - - *tclass = -1; /* Means not set */ + pid_t pid = so_tcdbg->so_tcdbg_pid; + + so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */ + so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */ p = proc_find(pid); if (p == NULL) { - printf("get_pid_tclass proc_find(%d) \n", pid); + printf("%s proc_find(%d) failed\n", __func__, pid); goto done; } - + /* Need a tfp */ lck_mtx_lock(tclass_lock); - + tfp = find_tfp_by_pid(pid); if (tfp != NULL) { - *tclass = tfp->tfp_class ; + so_tcdbg->so_tcdbg_tclass = tfp->tfp_class; error = 0; } lck_mtx_unlock(tclass_lock); done: if (p != NULL) proc_rele(p); - - return error; + + return (error); } -int -get_pname_tclass(const char *pname, int *tclass) +int +get_pname_tclass(struct so_tcdbg *so_tcdbg) { int error = EINVAL; struct tclass_for_proc *tfp; - - *tclass = -1; /* Means not set */ + + so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */ + so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */ /* Need a tfp */ lck_mtx_lock(tclass_lock); - - tfp = find_tfp_by_pname(pname); + + tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); if (tfp != NULL) { - *tclass = tfp->tfp_class ; + so_tcdbg->so_tcdbg_tclass = tfp->tfp_class; error = 0; } lck_mtx_unlock(tclass_lock); - - return error; + + return (error); } +static int +delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg) +{ + int error = EINVAL; + pid_t pid = so_tcdbg->so_tcdbg_pid; + struct tclass_for_proc *tfp = NULL; + + lck_mtx_lock(tclass_lock); + if (pid != -1) + tfp = find_tfp_by_pid(pid); + else + tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname); + + if (tfp != NULL) { + free_tclass_for_proc(tfp); + error = 0; + } + + lck_mtx_unlock(tclass_lock); + + return (error); +} /* * Setting options requires privileges */ -__private_extern__ int +__private_extern__ int so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg) { int error = 0; - + if ((so->so_state & SS_PRIV) == 0) - return EPERM; + return (EPERM); socket_unlock(so, 0); switch (so_tcdbg->so_tcdbg_cmd) { case SO_TCDBG_PID: - error = set_pid_tclass(so_tcdbg->so_tcdbg_pid, so_tcdbg->so_tcdbg_tclass); + error = set_pid_tclass(so_tcdbg); break; - + case SO_TCDBG_PNAME: - error = set_pname_tclass(so_tcdbg->so_tcdbg_pname, so_tcdbg->so_tcdbg_tclass); + error = set_pname_tclass(so_tcdbg); break; - + case SO_TCDBG_PURGE: error = purge_tclass_for_proc(); break; - + case SO_TCDBG_FLUSH: error = flush_tclass_for_proc(); break; - + + case SO_TCDBG_DELETE: + error = delete_tclass_for_pid_pname(so_tcdbg); + break; + + case SO_TCDBG_TCFLUSH_PID: + error = flush_pid_tclass(so_tcdbg); + break; + default: error = EINVAL; break; - } socket_lock(so, 0); - return error; + return (error); } /* * Not required to be privileged to get */ -__private_extern__ int +__private_extern__ int sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) { int error = 0; @@ -458,23 +534,24 @@ sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) void *buf = NULL; size_t len = sopt->sopt_valsize; - error = sooptcopyin(sopt, &so_tcdbg, sizeof(struct so_tcdbg), sizeof(struct so_tcdbg)); + error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg), + sizeof (struct so_tcdbg)); if (error != 0) - return error; - + return (error); + sopt->sopt_valsize = len; - + socket_unlock(so, 0); switch (so_tcdbg.so_tcdbg_cmd) { case SO_TCDBG_PID: - error = get_pid_tclass(so_tcdbg.so_tcdbg_pid, &so_tcdbg.so_tcdbg_tclass); + error = get_pid_tclass(&so_tcdbg); break; - + case SO_TCDBG_PNAME: - error = get_pname_tclass(so_tcdbg.so_tcdbg_pname, &so_tcdbg.so_tcdbg_tclass); + error = get_pname_tclass(&so_tcdbg); break; - + case SO_TCDBG_COUNT: lck_mtx_lock(tclass_lock); so_tcdbg.so_tcdbg_count = tfp_count; @@ -492,7 +569,7 @@ sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) error = EINVAL; break; } - len = alloc_count * sizeof(struct so_tcdbg); + len = alloc_count * sizeof (struct so_tcdbg); lck_mtx_unlock(tclass_lock); buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO); @@ -513,33 +590,35 @@ sogetopt_tcdbg(struct socket *so, struct sockopt *sopt) } else { ptr->so_tcdbg_cmd = SO_TCDBG_PNAME; ptr->so_tcdbg_pid = -1; - strlcpy(ptr->so_tcdbg_pname, tfp->tfp_pname, sizeof(ptr->so_tcdbg_pname)); + strlcpy(ptr->so_tcdbg_pname, + tfp->tfp_pname, + sizeof (ptr->so_tcdbg_pname)); } ptr->so_tcdbg_tclass = tfp->tfp_class; ptr++; } - + lck_mtx_unlock(tclass_lock); } break; - + default: error = EINVAL; break; - } socket_lock(so, 0); if (error == 0) { if (buf == NULL) { - error = sooptcopyout(sopt, &so_tcdbg, sizeof(struct so_tcdbg)); + error = sooptcopyout(sopt, &so_tcdbg, + sizeof (struct so_tcdbg)); } else { error = sooptcopyout(sopt, buf, len); _FREE(buf, M_TEMP); } } - return error; + return (error); } @@ -547,78 +626,121 @@ __private_extern__ int so_set_traffic_class(struct socket *so, int optval) { int error = 0; - - if (optval < SO_TC_BE || optval > SO_TC_VO) { + + if (optval < SO_TC_BE || optval > SO_TC_CTL) { error = EINVAL; } else { - so->so_traffic_class = optval; - - if ((INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6) && - INP_SOCKTYPE(so) == SOCK_STREAM) { - set_tcp_stream_priority(so); + switch (optval) { + case _SO_TC_BK: + optval = SO_TC_BK; + break; + case _SO_TC_VI: + optval = SO_TC_VI; + break; + case _SO_TC_VO: + optval = SO_TC_VO; + break; + default: + if (!SO_VALID_TC(optval)) + error = EINVAL; + break; + } + + if (error == 0) { + int oldval = so->so_traffic_class; + + VERIFY(SO_VALID_TC(optval)); + so->so_traffic_class = optval; + + if ((INP_SOCKAF(so) == AF_INET || + INP_SOCKAF(so) == AF_INET6) && + INP_SOCKTYPE(so) == SOCK_STREAM) { + set_tcp_stream_priority(so); + + /* Set/unset use of Large Receive Offload */ + so_set_lro(so, optval); + } + + if ((INP_SOCKAF(so) == AF_INET || + INP_SOCKAF(so) == AF_INET6) && + optval != oldval && (optval == SO_TC_BK_SYS || + oldval == SO_TC_BK_SYS)) { + /* + * If the app switches from BK_SYS to something + * else, resume the socket if it was suspended. + */ + if (oldval == SO_TC_BK_SYS) + inp_reset_fc_state(so->so_pcb); + + SOTHROTTLELOG(("throttle[%d]: so %p [%d,%d] " + "opportunistic %s\n", so->last_pid, + so, INP_SOCKAF(so), INP_SOCKTYPE(so), + (optval == SO_TC_BK_SYS) ? "ON" : "OFF")); + } } } - return error; + return (error); } __private_extern__ void so_set_default_traffic_class(struct socket *so) { - int sotc = SO_TC_BE; + int sotc = -1; - if (tfp_count > 0 && (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6)) { - sotc = get_tclass_for_curr_proc(); + if (tfp_count > 0 && + (INP_SOCKAF(so) == AF_INET || INP_SOCKAF(so) == AF_INET6)) { + get_tclass_for_curr_proc(&sotc); } - - so->so_traffic_class = sotc; - - return; + + so->so_traffic_class = (sotc != -1) ? sotc : SO_TC_BE; } +__private_extern__ int +so_set_opportunistic(struct socket *so, int optval) +{ + return (so_set_traffic_class(so, (optval == 0) ? + SO_TC_BE : SO_TC_BK_SYS)); +} __private_extern__ int -mbuf_traffic_class_from_control(struct mbuf *control) +so_get_opportunistic(struct socket *so) +{ + return (so->so_traffic_class == SO_TC_BK_SYS); +} + +__private_extern__ mbuf_svc_class_t +mbuf_service_class_from_control(struct mbuf *control) { struct cmsghdr *cm; - - for (cm = M_FIRST_CMSGHDR(control); - cm != NULL; - cm = M_NXT_CMSGHDR(control, cm)) { + mbuf_svc_class_t msc = MBUF_SC_UNSPEC; + + for (cm = M_FIRST_CMSGHDR(control); cm != NULL; + cm = M_NXT_CMSGHDR(control, cm)) { int tc; - if (cm->cmsg_len < sizeof(struct cmsghdr)) + if (cm->cmsg_len < sizeof (struct cmsghdr)) break; - + if (cm->cmsg_level != SOL_SOCKET || - cm->cmsg_type != SO_TRAFFIC_CLASS) + cm->cmsg_type != SO_TRAFFIC_CLASS) continue; - if (cm->cmsg_len != CMSG_LEN(sizeof(int))) + if (cm->cmsg_len != CMSG_LEN(sizeof (int))) continue; - - tc = *(int *)CMSG_DATA(cm); - - switch (tc) { - case SO_TC_BE: - return MBUF_TC_BE; - case SO_TC_BK: - return MBUF_TC_BK; - case SO_TC_VI: - return MBUF_TC_VI; - case SO_TC_VO: - return MBUF_TC_VO; - default: - break; - } + + tc = *(int *)(void *)CMSG_DATA(cm); + msc = so_tc2msc(tc); + if (MBUF_VALID_SC(msc)) + break; } - - return MBUF_TC_UNSPEC; + + return (msc); } __private_extern__ int -dscp_code_from_mbuf_tclass(int mtc) +dscp_code_from_mbuf_tclass(mbuf_traffic_class_t mtc) { int dscp_code; - + switch (mtc) { default: case MBUF_TC_BE: @@ -634,56 +756,65 @@ dscp_code_from_mbuf_tclass(int mtc) dscp_code = 0x30; break; } - - return dscp_code; + + return (dscp_code); } __private_extern__ void so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off) { - uint32_t sotc = m->m_pkthdr.prio; + uint32_t sotc = m_get_traffic_class(m); if (sotc >= SO_TC_STATS_MAX) sotc = SO_TC_BE; - - so->so_tc_stats[sotc].rxpackets += 1; - so->so_tc_stats[sotc].rxbytes += ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off; - return; + so->so_tc_stats[sotc].rxpackets += 1; + so->so_tc_stats[sotc].rxbytes += + ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off; } __private_extern__ void set_tcp_stream_priority(struct socket *so) { struct tcpcb *tp = intotcpcb(sotoinpcb(so)); + int old_cc = tp->tcp_cc_index; + int recvbg = IS_TCP_RECV_BG(so); - /* If the socket was marked as a background socket or if the - * traffic class is set to background with traffic class socket - * option then make both send and recv side of the stream to be - * background. The variable sotcdb which can be set with sysctl + /* + * If the socket was marked as a background socket or if the + * traffic class is set to background with traffic class socket + * option then make both send and recv side of the stream to be + * background. The variable sotcdb which can be set with sysctl * is used to disable these settings for testing. */ - if (soisbackground(so) || so->so_traffic_class == SO_TC_BK) { + if (soisthrottled(so) || IS_SO_TC_BACKGROUND(so->so_traffic_class)) { if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0) { - if (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) + if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX) tcp_set_foreground_cc(so); } else { - if (tp->tcp_cc_index != TCP_CC_ALGO_BACKGROUND_INDEX) + if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX) tcp_set_background_cc(so); } - + /* Set receive side background flags */ - if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0) { - so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG); - } else { - so->so_traffic_mgt_flags |= TRAFFIC_MGT_TCP_RECVBG; - } + if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0) + tcp_clear_recv_bg(so); + else + tcp_set_recv_bg(so); } else { - so->so_traffic_mgt_flags &= ~(TRAFFIC_MGT_TCP_RECVBG); - if (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) + tcp_clear_recv_bg(so); + if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX) tcp_set_foreground_cc(so); } - return; + + if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) { + SOTHROTTLELOG(("throttle[%d]: so %p [%d,%d] TCP %s send; " + "%s recv\n", so->last_pid, so, INP_SOCKAF(so), + INP_SOCKTYPE(so), + (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ? + "background" : "foreground", + IS_TCP_RECV_BG(so) ? "background" : "foreground")); + } } /* @@ -692,119 +823,126 @@ set_tcp_stream_priority(struct socket *so) * - set the DSCP code following the WMM mapping */ __private_extern__ void -set_packet_tclass(struct mbuf *m, struct socket *so, int in_mtc, int isipv6) +set_packet_service_class(struct mbuf *m, struct socket *so, + mbuf_svc_class_t in_msc, u_int32_t flags) { - int mtc = MBUF_TC_BE; /* Best effort by default */ - struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */ + mbuf_svc_class_t msc = MBUF_SC_BE; /* Best effort by default */ + struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */ struct ip *ip = mtod(m, struct ip *); #if INET6 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); #endif /* INET6 */ - + int isipv6 = ((flags & PKT_SCF_IPV6) != 0) ? 1 : 0; + if (!(m->m_flags & M_PKTHDR)) return; - - /* + + /* * Here is the precedence: * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all * 2) Traffic class passed via ancillary data to sendmsdg(2) * 3) Traffic class socket option last */ - if (soisbackground(so)) { - mtc = MBUF_TC_BK; - } else if (in_mtc != MBUF_TC_UNSPEC) { - if (in_mtc >= MBUF_TC_BE && in_mtc <= MBUF_TC_VO) - mtc = in_mtc; + if (in_msc != MBUF_SC_UNSPEC) { + if (in_msc >= MBUF_SC_BE && in_msc <= MBUF_SC_CTL) + msc = in_msc; } else { - switch (so->so_traffic_class) { - case SO_TC_BE: - mtc = MBUF_TC_BE; - break; - case SO_TC_BK: - mtc = MBUF_TC_BK; - break; - case SO_TC_VI: - mtc = MBUF_TC_VI; - break; - case SO_TC_VO: - mtc = MBUF_TC_VO; - break; - default: - break; - } + VERIFY(SO_VALID_TC(so->so_traffic_class)); + msc = so_tc2msc(so->so_traffic_class); + /* Assert because tc must have been valid */ + VERIFY(MBUF_VALID_SC(msc)); } - + + /* + * If TRAFFIC_MGT_SO_BACKGROUND is set, depress the priority. + */ + if (soisthrottled(so) && !IS_MBUF_SC_BACKGROUND(msc)) + msc = MBUF_SC_BK; + /* - * Set the traffic class in the mbuf packet header prio field + * Set the traffic class in the mbuf packet header svc field */ - if ((sotcdb & SOTCDB_NO_MTC)) + if (sotcdb & SOTCDB_NO_MTC) goto no_mbtc; - m->m_pkthdr.prio = mtc; - + + /* Elevate service class if the packet is a pure TCP ACK. + * We can do this only when the flow is not a background + * flow and the outgoing interface supports + * transmit-start model. + */ + if (!IS_MBUF_SC_BACKGROUND(msc) && (flags & PKT_SCF_TCP_ACK)) + msc = MBUF_SC_CTL; + + (void) m_set_service_class(m, msc); + + /* + * Set the privileged traffic auxiliary flag if applicable, or clear it. + */ + if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) && + msc != MBUF_SC_UNSPEC) + m->m_pkthdr.aux_flags |= MAUXF_PRIO_PRIVILEGED; + else + m->m_pkthdr.aux_flags &= ~MAUXF_PRIO_PRIVILEGED; + no_mbtc: /* - * Quick exit when best effort + * Quick exit when best effort */ - if (mtc == MBUF_TC_BE) + if (msc == MBUF_SC_BE) goto no_dscp; + /* - * Now let set the DSCP code in IPv4 or IPv6 header - * By default do this only for local traffic if a code is not already set + * The default behavior is for the networking stack to not set the + * DSCP code, based on SOTCDB_NO_DSCP being set. If the flag is + * cleared, set the DSCP code in IPv4 or IPv6 header only for local + * traffic, if it is not already set. */ - if ((sotcdb & SOTCDB_NO_DSCP)) + if (sotcdb & SOTCDB_NO_DSCP) goto no_dscp; - + /* - * Test if a IP TOS or IPV6 TCLASS has already been set on the socket or the raw packet + * Test if a IP TOS or IPV6 TCLASS has already been set + * on the socket or the raw packet. */ - if ((sotcdb & SOTCDB_NO_DSCPTST) == 0) { + if (!(sotcdb & SOTCDB_NO_DSCPTST)) { #if INET6 - if (isipv6) - { - if ((so->so_type == SOCK_RAW && (ip6->ip6_flow & htonl(0xff << 20)) != 0) || - (inp->in6p_outputopts && inp->in6p_outputopts->ip6po_tclass != -1)) + if (isipv6) { + if ((so->so_type == SOCK_RAW && + (ip6->ip6_flow & htonl(0xff << 20)) != 0) || + (inp->in6p_outputopts && + inp->in6p_outputopts->ip6po_tclass != -1)) goto no_dscp; - } - else + } else #endif /* INET6 */ - { - if ((so->so_type == SOCK_RAW && (inp->inp_flags & INP_HDRINCL)) || - inp->inp_ip_tos != 0) - goto no_dscp; - } + if ((so->so_type == SOCK_RAW && + (inp->inp_flags & INP_HDRINCL)) || + inp->inp_ip_tos != 0) + goto no_dscp; } - + /* * Test if destination is local */ - if ((sotcdb & SOTCDB_NO_LCLTST) == 0) { + if (!(sotcdb & SOTCDB_NO_LCLTST)) { int islocal = 0; - struct route *ro = &inp->inp_route; + struct rtentry *rt = inp->inp_route.ro_rt; if (so->so_type == SOCK_STREAM) { - struct tcpcb *tp = intotcpcb(inp); - - if ((tp->t_flags & TF_LOCAL)) + if (intotcpcb(inp)->t_flags & TF_LOCAL) islocal = 1; - } - else -#if INET6 - if (isipv6) - { - if ((ro != NULL && ro->ro_rt != NULL && - (ro->ro_rt->rt_gateway->sa_family == AF_LINK || - (ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))) || - in6addr_local(&ip6->ip6_dst)) + } else if (rt != NULL && + (rt->rt_gateway->sa_family == AF_LINK || + (rt->rt_ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)))) { + if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT)) islocal = 1; - } - else + } else +#if INET6 + if (isipv6 && in6addr_local(&ip6->ip6_dst)) { + islocal = 1; + } else #endif /* INET6 */ - { - if ((ro != NULL && ro->ro_rt != NULL && - (ro->ro_rt->rt_gateway->sa_family == AF_LINK || - (ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))) || - inaddr_local(ip->ip_dst)) - islocal = 1; + if (inaddr_local(ip->ip_dst)) { + islocal = 1; } if (islocal == 0) goto no_dscp; @@ -812,28 +950,38 @@ no_mbtc: #if INET6 if (isipv6) - ip6->ip6_flow |= - htonl(dscp_code_from_mbuf_tclass(m->m_pkthdr.prio) << 20); + ip6->ip6_flow |= htonl(dscp_code_from_mbuf_tclass( + m_get_traffic_class(m)) << 20); else #endif /* INET6 */ - ip->ip_tos |= dscp_code_from_mbuf_tclass(m->m_pkthdr.prio) << 2; - + ip->ip_tos |= dscp_code_from_mbuf_tclass( + m_get_traffic_class(m)) << 2; + no_dscp: /* * For TCP with background traffic class switch CC algo based on sysctl */ - if (so->so_type == SOCK_STREAM) { + if (so->so_type == SOCK_STREAM) set_tcp_stream_priority(so); - } - + + so_tc_update_stats(m, so, msc); +} + +__private_extern__ void +so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc) +{ + mbuf_traffic_class_t mtc; + /* * Assume socket and mbuf traffic class values are the same - * Also assume the socket lock is held + * Also assume the socket lock is held. Note that the stats + * at the socket layer are reduced down to the legacy traffic + * classes; we could/should potentially expand so_tc_stats[]. */ + mtc = MBUF_SC2TC(msc); + VERIFY(mtc < SO_TC_STATS_MAX); so->so_tc_stats[mtc].txpackets += 1; so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len; - - return; } __private_extern__ void @@ -842,9 +990,100 @@ socket_tclass_init(void) tclass_lck_grp_attr = lck_grp_attr_alloc_init(); tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr); tclass_lck_attr = lck_attr_alloc_init(); - if ((tclass_lock = lck_mtx_alloc_init(tclass_lck_grp, tclass_lck_attr)) == NULL) { - panic("failed to allocate memory for tclass\n"); + lck_mtx_init(tclass_lock, tclass_lck_grp, tclass_lck_attr); +} + +__private_extern__ mbuf_svc_class_t +so_tc2msc(int tc) +{ + mbuf_svc_class_t msc; + + switch (tc) { + case SO_TC_BK_SYS: + msc = MBUF_SC_BK_SYS; + break; + case SO_TC_BK: + case _SO_TC_BK: + msc = MBUF_SC_BK; + break; + case SO_TC_BE: + msc = MBUF_SC_BE; + break; + case SO_TC_RD: + msc = MBUF_SC_RD; + break; + case SO_TC_OAM: + msc = MBUF_SC_OAM; + break; + case SO_TC_AV: + msc = MBUF_SC_AV; + break; + case SO_TC_RV: + msc = MBUF_SC_RV; + break; + case SO_TC_VI: + case _SO_TC_VI: + msc = MBUF_SC_VI; + break; + case SO_TC_VO: + case _SO_TC_VO: + msc = MBUF_SC_VO; + break; + case SO_TC_CTL: + msc = MBUF_SC_CTL; + break; + case SO_TC_ALL: + default: + msc = MBUF_SC_UNSPEC; + break; } + + return (msc); } +__private_extern__ int +so_svc2tc(mbuf_svc_class_t svc) +{ + switch (svc) { + case MBUF_SC_UNSPEC: + return SO_TC_BE; + case MBUF_SC_BK_SYS: + return SO_TC_BK_SYS; + case MBUF_SC_BK: + return SO_TC_BK; + case MBUF_SC_BE: + return SO_TC_BE; + case MBUF_SC_RD: + return SO_TC_RD; + case MBUF_SC_OAM: + return SO_TC_OAM; + case MBUF_SC_AV: + return SO_TC_AV; + case MBUF_SC_RV: + return SO_TC_RV; + case MBUF_SC_VI: + return SO_TC_VI; + case MBUF_SC_VO: + return SO_TC_VO; + case MBUF_SC_CTL: + return SO_TC_CTL; + default: + return SO_TC_BE; + } +} + +/* + * LRO is turned on for AV streaming and background classes. + */ +static void +so_set_lro(struct socket *so, int optval) +{ + if ((optval == SO_TC_BK) || + (optval == SO_TC_BK_SYS) || + (optval == SO_TC_AV)) { + so->so_flags |= SOF_USELRO; + } else { + so->so_flags &= ~SOF_USELRO; + } +}