/*
 * Copyright (c) 2009-2011 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/types.h>
#include <sys/filedesc.h>
#include <sys/file_internal.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
#include <sys/queue.h>

#include <net/if.h>
#include <net/route.h>

#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_pcb.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_cc.h>
59 extern char *proc_name_address(void *p
);
61 static int tfp_count
= 0;
63 static TAILQ_HEAD(, tclass_for_proc
) tfp_head
= TAILQ_HEAD_INITIALIZER(tfp_head
);
65 struct tclass_for_proc
{
66 TAILQ_ENTRY(tclass_for_proc
) tfp_link
;
69 char tfp_pname
[MAXCOMLEN
+ 1];
72 extern void tcp_set_background_cc(struct socket
*);
73 extern void tcp_set_foreground_cc(struct socket
*);
75 int dscp_code_from_mbuf_tclass(int );
77 static int get_pid_tclass(pid_t
, int *);
78 static int get_pname_tclass(const char * , int *);
79 static int set_pid_tclass(pid_t
, int );
80 static int set_pname_tclass(const char * , int );
81 static int purge_tclass_for_proc(void);
82 static int flush_tclass_for_proc(void);
85 static lck_grp_attr_t
*tclass_lck_grp_attr
= NULL
; /* mutex group attributes */
86 static lck_grp_t
*tclass_lck_grp
= NULL
; /* mutex group definition */
87 static lck_attr_t
*tclass_lck_attr
= NULL
; /* mutex attributes */
88 static lck_mtx_t
*tclass_lock
= NULL
;
91 * Must be called with tclass_lock held
93 static struct tclass_for_proc
*
94 find_tfp_by_pid(pid_t pid
)
96 struct tclass_for_proc
*tfp
;
98 TAILQ_FOREACH(tfp
, &tfp_head
, tfp_link
) {
99 if (tfp
->tfp_pid
== pid
)
106 * Must be called with tclass_lock held
108 static struct tclass_for_proc
*
109 find_tfp_by_pname(const char *pname
)
111 struct tclass_for_proc
*tfp
;
113 TAILQ_FOREACH(tfp
, &tfp_head
, tfp_link
) {
114 if (strncmp(pname
, tfp
->tfp_pname
, sizeof(tfp
->tfp_pname
)) == 0)
121 get_tclass_for_curr_proc(void)
123 struct tclass_for_proc
*tfp
;
125 proc_t p
= current_proc(); /* Not ref counted */
126 pid_t pid
= proc_pid(p
);
127 char *pname
= proc_name_address(p
);
129 lck_mtx_lock(tclass_lock
);
131 TAILQ_FOREACH(tfp
, &tfp_head
, tfp_link
) {
132 if ((tfp
->tfp_pid
== pid
) ||
133 (tfp
->tfp_pid
== -1 && strncmp(pname
, tfp
->tfp_pname
, sizeof(tfp
->tfp_pname
)) == 0)) {
134 sotc
= tfp
->tfp_class
;
139 lck_mtx_unlock(tclass_lock
);
145 * Purge entries with PIDs of exited processes
148 purge_tclass_for_proc(void)
151 struct tclass_for_proc
*tfp
, *tvar
;
153 lck_mtx_lock(tclass_lock
);
155 TAILQ_FOREACH_SAFE(tfp
, &tfp_head
, tfp_link
, tvar
) {
158 if (tfp
->tfp_pid
== -1)
160 if ((p
= proc_find(tfp
->tfp_pid
)) == NULL
) {
162 TAILQ_REMOVE(&tfp_head
, tfp
, tfp_link
);
170 lck_mtx_unlock(tclass_lock
);
177 * Must be called with tclass_lock held
180 free_tclass_for_proc(struct tclass_for_proc
*tfp
)
185 TAILQ_REMOVE(&tfp_head
, tfp
, tfp_link
);
193 flush_tclass_for_proc(void)
196 struct tclass_for_proc
*tfp
, *tvar
;
198 lck_mtx_lock(tclass_lock
);
200 TAILQ_FOREACH_SAFE(tfp
, &tfp_head
, tfp_link
, tvar
) {
201 free_tclass_for_proc(tfp
);
204 lck_mtx_unlock(tclass_lock
);
211 * Must be called with tclass_lock held
213 static struct tclass_for_proc
*
214 alloc_tclass_for_proc(pid_t pid
, const char *pname
, int tclass
)
216 struct tclass_for_proc
*tfp
;
218 if (pid
== -1 && pname
== NULL
)
221 tfp
= _MALLOC(sizeof(struct tclass_for_proc
), M_TEMP
, M_NOWAIT
| M_ZERO
);
226 tfp
->tfp_class
= tclass
;
228 * Add per pid entries before per proc name so we can find
229 * a specific instance of a process before the general name base entry.
232 TAILQ_INSERT_HEAD(&tfp_head
, tfp
, tfp_link
);
234 strlcpy(tfp
->tfp_pname
, pname
, sizeof(tfp
->tfp_pname
));
235 TAILQ_INSERT_TAIL(&tfp_head
, tfp
, tfp_link
);
244 * -1 for tclass means to remove the entry
247 set_pid_tclass(pid_t pid
, int tclass
)
251 struct filedesc
*fdp
;
253 struct tclass_for_proc
*tfp
;
258 printf("set_pid_tclass proc_find(%d) \n", pid
);
263 lck_mtx_lock(tclass_lock
);
265 tfp
= find_tfp_by_pid(pid
);
268 free_tclass_for_proc(tfp
);
271 lck_mtx_unlock(tclass_lock
);
275 tfp
= alloc_tclass_for_proc(pid
, NULL
, tclass
);
277 lck_mtx_unlock(tclass_lock
);
282 tfp
->tfp_class
= tclass
;
285 lck_mtx_unlock(tclass_lock
);
291 for (i
= 0; i
< fdp
->fd_nfiles
; i
++) {
294 fp
= fdp
->fd_ofiles
[i
];
295 if (fp
== NULL
|| (fdp
->fd_ofileflags
[i
] & UF_RESERVED
) != 0 ||
296 fp
->f_fglob
->fg_type
!= DTYPE_SOCKET
)
299 so
= (struct socket
*)fp
->f_fglob
->fg_data
;
300 if (so
->so_proto
->pr_domain
->dom_family
!= AF_INET
&&
301 so
->so_proto
->pr_domain
->dom_family
!= AF_INET6
)
304 error
= so_set_traffic_class(so
, tclass
!= -1 ? tclass
: SO_TC_BE
);
305 socket_unlock(so
, 1);
307 printf("set_pid_tclass so_set_traffic_class(%p, %d) failed %d\n", so
, tclass
, error
);
324 set_pname_tclass(const char *pname
, int tclass
)
327 struct tclass_for_proc
*tfp
;
329 lck_mtx_lock(tclass_lock
);
331 tfp
= find_tfp_by_pname(pname
);
334 free_tclass_for_proc(tfp
);
337 tfp
= alloc_tclass_for_proc(-1, pname
, tclass
);
339 lck_mtx_unlock(tclass_lock
);
344 tfp
->tfp_class
= tclass
;
347 lck_mtx_unlock(tclass_lock
);
356 get_pid_tclass(pid_t pid
, int *tclass
)
360 struct tclass_for_proc
*tfp
;
362 *tclass
= -1; /* Means not set */
366 printf("get_pid_tclass proc_find(%d) \n", pid
);
371 lck_mtx_lock(tclass_lock
);
373 tfp
= find_tfp_by_pid(pid
);
375 *tclass
= tfp
->tfp_class
;
378 lck_mtx_unlock(tclass_lock
);
387 get_pname_tclass(const char *pname
, int *tclass
)
390 struct tclass_for_proc
*tfp
;
392 *tclass
= -1; /* Means not set */
395 lck_mtx_lock(tclass_lock
);
397 tfp
= find_tfp_by_pname(pname
);
399 *tclass
= tfp
->tfp_class
;
402 lck_mtx_unlock(tclass_lock
);
410 * Setting options requires privileges
412 __private_extern__
int
413 so_set_tcdbg(struct socket
*so
, struct so_tcdbg
*so_tcdbg
)
417 if ((so
->so_state
& SS_PRIV
) == 0)
420 socket_unlock(so
, 0);
422 switch (so_tcdbg
->so_tcdbg_cmd
) {
424 error
= set_pid_tclass(so_tcdbg
->so_tcdbg_pid
, so_tcdbg
->so_tcdbg_tclass
);
428 error
= set_pname_tclass(so_tcdbg
->so_tcdbg_pname
, so_tcdbg
->so_tcdbg_tclass
);
432 error
= purge_tclass_for_proc();
436 error
= flush_tclass_for_proc();
451 * Not required to be privileged to get
453 __private_extern__
int
454 sogetopt_tcdbg(struct socket
*so
, struct sockopt
*sopt
)
457 struct so_tcdbg so_tcdbg
;
459 size_t len
= sopt
->sopt_valsize
;
461 error
= sooptcopyin(sopt
, &so_tcdbg
, sizeof(struct so_tcdbg
), sizeof(struct so_tcdbg
));
465 sopt
->sopt_valsize
= len
;
467 socket_unlock(so
, 0);
469 switch (so_tcdbg
.so_tcdbg_cmd
) {
471 error
= get_pid_tclass(so_tcdbg
.so_tcdbg_pid
, &so_tcdbg
.so_tcdbg_tclass
);
475 error
= get_pname_tclass(so_tcdbg
.so_tcdbg_pname
, &so_tcdbg
.so_tcdbg_tclass
);
479 lck_mtx_lock(tclass_lock
);
480 so_tcdbg
.so_tcdbg_count
= tfp_count
;
481 lck_mtx_unlock(tclass_lock
);
484 case SO_TCDBG_LIST
: {
485 struct tclass_for_proc
*tfp
;
487 struct so_tcdbg
*ptr
;
489 lck_mtx_lock(tclass_lock
);
490 if ((alloc_count
= tfp_count
) == 0) {
491 lck_mtx_unlock(tclass_lock
);
495 len
= alloc_count
* sizeof(struct so_tcdbg
);
496 lck_mtx_unlock(tclass_lock
);
498 buf
= _MALLOC(len
, M_TEMP
, M_WAITOK
| M_ZERO
);
504 lck_mtx_lock(tclass_lock
);
506 ptr
= (struct so_tcdbg
*)buf
;
507 TAILQ_FOREACH(tfp
, &tfp_head
, tfp_link
) {
508 if (++n
> alloc_count
)
510 if (tfp
->tfp_pid
!= -1) {
511 ptr
->so_tcdbg_cmd
= SO_TCDBG_PID
;
512 ptr
->so_tcdbg_pid
= tfp
->tfp_pid
;
514 ptr
->so_tcdbg_cmd
= SO_TCDBG_PNAME
;
515 ptr
->so_tcdbg_pid
= -1;
516 strlcpy(ptr
->so_tcdbg_pname
, tfp
->tfp_pname
, sizeof(ptr
->so_tcdbg_pname
));
518 ptr
->so_tcdbg_tclass
= tfp
->tfp_class
;
522 lck_mtx_unlock(tclass_lock
);
536 error
= sooptcopyout(sopt
, &so_tcdbg
, sizeof(struct so_tcdbg
));
538 error
= sooptcopyout(sopt
, buf
, len
);
546 __private_extern__
int
547 so_set_traffic_class(struct socket
*so
, int optval
)
551 if (optval
< SO_TC_BE
|| optval
> SO_TC_VO
) {
554 so
->so_traffic_class
= optval
;
556 if ((INP_SOCKAF(so
) == AF_INET
|| INP_SOCKAF(so
) == AF_INET6
) &&
557 INP_SOCKTYPE(so
) == SOCK_STREAM
) {
558 set_tcp_stream_priority(so
);
564 __private_extern__
void
565 so_set_default_traffic_class(struct socket
*so
)
569 if (tfp_count
> 0 && (INP_SOCKAF(so
) == AF_INET
|| INP_SOCKAF(so
) == AF_INET6
)) {
570 sotc
= get_tclass_for_curr_proc();
573 so
->so_traffic_class
= sotc
;
579 __private_extern__
int
580 mbuf_traffic_class_from_control(struct mbuf
*control
)
584 for (cm
= M_FIRST_CMSGHDR(control
);
586 cm
= M_NXT_CMSGHDR(control
, cm
)) {
589 if (cm
->cmsg_len
< sizeof(struct cmsghdr
))
592 if (cm
->cmsg_level
!= SOL_SOCKET
||
593 cm
->cmsg_type
!= SO_TRAFFIC_CLASS
)
595 if (cm
->cmsg_len
!= CMSG_LEN(sizeof(int)))
598 tc
= *(int *)CMSG_DATA(cm
);
614 return MBUF_TC_UNSPEC
;
/*
 * dscp_code_from_mbuf_tclass: map an mbuf traffic class (`mtc') to a code
 * that set_packet_tclass() shifts into the IPv6 flow word (<< 20) or the
 * IPv4 TOS byte (<< 2).
 *
 * NOTE(review): only the signature of this function survives in this
 * extract; the body is absent, so the actual class-to-code mapping cannot
 * be documented from here.  Restore the body from the upstream file.
 */
617 __private_extern__
int
618 dscp_code_from_mbuf_tclass(int mtc
)
641 __private_extern__
void
642 so_recv_data_stat(struct socket
*so
, struct mbuf
*m
, size_t off
)
644 uint32_t sotc
= m
->m_pkthdr
.prio
;
646 if (sotc
>= SO_TC_STATS_MAX
)
649 so
->so_tc_stats
[sotc
].rxpackets
+= 1;
650 so
->so_tc_stats
[sotc
].rxbytes
+= ((m
->m_flags
& M_PKTHDR
) ? m
->m_pkthdr
.len
: 0) + off
;
655 __private_extern__
void
656 set_tcp_stream_priority(struct socket
*so
)
658 struct tcpcb
*tp
= intotcpcb(sotoinpcb(so
));
660 /* If the socket was marked as a background socket or if the
661 * traffic class is set to background with traffic class socket
662 * option then make both send and recv side of the stream to be
663 * background. The variable sotcdb which can be set with sysctl
664 * is used to disable these settings for testing.
666 if (soisbackground(so
) || so
->so_traffic_class
== SO_TC_BK
) {
667 if ((sotcdb
& SOTCDB_NO_SENDTCPBG
) != 0) {
668 if (tp
->tcp_cc_index
== TCP_CC_ALGO_BACKGROUND_INDEX
)
669 tcp_set_foreground_cc(so
);
671 if (tp
->tcp_cc_index
!= TCP_CC_ALGO_BACKGROUND_INDEX
)
672 tcp_set_background_cc(so
);
675 /* Set receive side background flags */
676 if ((sotcdb
& SOTCDB_NO_RECVTCPBG
) != 0) {
677 so
->so_traffic_mgt_flags
&= ~(TRAFFIC_MGT_TCP_RECVBG
);
679 so
->so_traffic_mgt_flags
|= TRAFFIC_MGT_TCP_RECVBG
;
682 so
->so_traffic_mgt_flags
&= ~(TRAFFIC_MGT_TCP_RECVBG
);
683 if (tp
->tcp_cc_index
== TCP_CC_ALGO_BACKGROUND_INDEX
)
684 tcp_set_foreground_cc(so
);
690 * Set traffic class to an IPv4 or IPv6 packet
692 * - set the DSCP code following the WMM mapping
694 __private_extern__
void
695 set_packet_tclass(struct mbuf
*m
, struct socket
*so
, int in_mtc
, int isipv6
)
697 int mtc
= MBUF_TC_BE
; /* Best effort by default */
698 struct inpcb
*inp
= sotoinpcb(so
); /* in6pcb and inpcb are the same */
699 struct ip
*ip
= mtod(m
, struct ip
*);
701 struct ip6_hdr
*ip6
= mtod(m
, struct ip6_hdr
*);
704 if (!(m
->m_flags
& M_PKTHDR
))
708 * Here is the precedence:
709 * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all
710 * 2) Traffic class passed via ancillary data to sendmsdg(2)
711 * 3) Traffic class socket option last
713 if (soisbackground(so
)) {
715 } else if (in_mtc
!= MBUF_TC_UNSPEC
) {
716 if (in_mtc
>= MBUF_TC_BE
&& in_mtc
<= MBUF_TC_VO
)
719 switch (so
->so_traffic_class
) {
738 * Set the traffic class in the mbuf packet header prio field
740 if ((sotcdb
& SOTCDB_NO_MTC
))
742 m
->m_pkthdr
.prio
= mtc
;
746 * Quick exit when best effort
748 if (mtc
== MBUF_TC_BE
)
751 * Now let set the DSCP code in IPv4 or IPv6 header
752 * By default do this only for local traffic if a code is not already set
754 if ((sotcdb
& SOTCDB_NO_DSCP
))
758 * Test if a IP TOS or IPV6 TCLASS has already been set on the socket or the raw packet
760 if ((sotcdb
& SOTCDB_NO_DSCPTST
) == 0) {
764 if ((so
->so_type
== SOCK_RAW
&& (ip6
->ip6_flow
& htonl(0xff << 20)) != 0) ||
765 (inp
->in6p_outputopts
&& inp
->in6p_outputopts
->ip6po_tclass
!= -1))
771 if ((so
->so_type
== SOCK_RAW
&& (inp
->inp_flags
& INP_HDRINCL
)) ||
772 inp
->inp_ip_tos
!= 0)
778 * Test if destination is local
780 if ((sotcdb
& SOTCDB_NO_LCLTST
) == 0) {
782 struct route
*ro
= &inp
->inp_route
;
784 if (so
->so_type
== SOCK_STREAM
) {
785 struct tcpcb
*tp
= intotcpcb(inp
);
787 if ((tp
->t_flags
& TF_LOCAL
))
794 if ((ro
!= NULL
&& ro
->ro_rt
!= NULL
&&
795 (ro
->ro_rt
->rt_gateway
->sa_family
== AF_LINK
||
796 (ro
->ro_rt
->rt_ifp
->if_flags
& IFF_LOOPBACK
))) ||
797 in6addr_local(&ip6
->ip6_dst
))
803 if ((ro
!= NULL
&& ro
->ro_rt
!= NULL
&&
804 (ro
->ro_rt
->rt_gateway
->sa_family
== AF_LINK
||
805 (ro
->ro_rt
->rt_ifp
->if_flags
& IFF_LOOPBACK
))) ||
806 inaddr_local(ip
->ip_dst
))
816 htonl(dscp_code_from_mbuf_tclass(m
->m_pkthdr
.prio
) << 20);
819 ip
->ip_tos
|= dscp_code_from_mbuf_tclass(m
->m_pkthdr
.prio
) << 2;
823 * For TCP with background traffic class switch CC algo based on sysctl
825 if (so
->so_type
== SOCK_STREAM
) {
826 set_tcp_stream_priority(so
);
830 * Assume socket and mbuf traffic class values are the same
831 * Also assume the socket lock is held
833 so
->so_tc_stats
[mtc
].txpackets
+= 1;
834 so
->so_tc_stats
[mtc
].txbytes
+= m
->m_pkthdr
.len
;
839 __private_extern__
void
840 socket_tclass_init(void)
842 tclass_lck_grp_attr
= lck_grp_attr_alloc_init();
843 tclass_lck_grp
= lck_grp_alloc_init("tclass", tclass_lck_grp_attr
);
844 tclass_lck_attr
= lck_attr_alloc_init();
845 if ((tclass_lock
= lck_mtx_alloc_init(tclass_lck_grp
, tclass_lck_attr
)) == NULL
) {
846 panic("failed to allocate memory for tclass\n");