2 * Copyright (c) 2009-2014 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
29 #include <sys/systm.h>
30 #include <sys/kernel.h>
31 #include <sys/types.h>
32 #include <sys/filedesc.h>
33 #include <sys/file_internal.h>
35 #include <sys/socket.h>
36 #include <sys/socketvar.h>
37 #include <sys/errno.h>
38 #include <sys/protosw.h>
39 #include <sys/domain.h>
41 #include <sys/queue.h>
44 #include <net/route.h>
46 #include <netinet/in.h>
47 #include <netinet/in_var.h>
48 #include <netinet/in_pcb.h>
49 #include <netinet/ip.h>
50 #include <netinet/ip_var.h>
51 #include <netinet/ip6.h>
52 #include <netinet6/ip6_var.h>
53 #include <netinet/udp.h>
54 #include <netinet/udp_var.h>
55 #include <netinet/tcp.h>
56 #include <netinet/tcp_var.h>
57 #include <netinet/tcp_cc.h>
58 #include <netinet/lro_ext.h>
60 extern char *proc_name_address(void *p
);
62 static int tfp_count
= 0;
64 static TAILQ_HEAD(, tclass_for_proc
) tfp_head
=
65 TAILQ_HEAD_INITIALIZER(tfp_head
);
67 struct tclass_for_proc
{
68 TAILQ_ENTRY(tclass_for_proc
) tfp_link
;
71 char tfp_pname
[MAXCOMLEN
+ 1];
74 static int dscp_code_from_mbuf_tclass(mbuf_traffic_class_t
);
75 static int get_pid_tclass(struct so_tcdbg
*);
76 static int get_pname_tclass(struct so_tcdbg
*);
77 static int set_pid_tclass(struct so_tcdbg
*);
78 static int set_pname_tclass(struct so_tcdbg
*);
79 static int flush_pid_tclass(struct so_tcdbg
*);
80 static int purge_tclass_for_proc(void);
81 static int flush_tclass_for_proc(void);
82 int get_tclass_for_curr_proc(int *);
83 static inline int so_throttle_best_effort(struct socket
* ,struct ifnet
*);
85 static lck_grp_attr_t
*tclass_lck_grp_attr
= NULL
; /* mutex group attributes */
86 static lck_grp_t
*tclass_lck_grp
= NULL
; /* mutex group definition */
87 static lck_attr_t
*tclass_lck_attr
= NULL
; /* mutex attributes */
88 decl_lck_mtx_data(static, tclass_lock_data
);
89 static lck_mtx_t
*tclass_lock
= &tclass_lock_data
;
92 * If there is no foreground activity on the interface for bg_switch_time
93 * seconds, the background connections can switch to foreground TCP
96 #define TCP_BG_SWITCH_TIME 2 /* seconds */
99 * Must be called with tclass_lock held
101 static struct tclass_for_proc
*
102 find_tfp_by_pid(pid_t pid
)
104 struct tclass_for_proc
*tfp
;
106 TAILQ_FOREACH(tfp
, &tfp_head
, tfp_link
) {
107 if (tfp
->tfp_pid
== pid
)
114 * Must be called with tclass_lock held
116 static struct tclass_for_proc
*
117 find_tfp_by_pname(const char *pname
)
119 struct tclass_for_proc
*tfp
;
121 TAILQ_FOREACH(tfp
, &tfp_head
, tfp_link
) {
122 if (strncmp(pname
, tfp
->tfp_pname
,
123 sizeof (tfp
->tfp_pname
)) == 0)
129 __private_extern__
int
130 get_tclass_for_curr_proc(int *sotc
)
132 struct tclass_for_proc
*tfp
= NULL
;
133 proc_t p
= current_proc(); /* Not ref counted */
134 pid_t pid
= proc_pid(p
);
135 char *pname
= proc_name_address(p
);
139 lck_mtx_lock(tclass_lock
);
141 TAILQ_FOREACH(tfp
, &tfp_head
, tfp_link
) {
142 if ((tfp
->tfp_pid
== pid
) || (tfp
->tfp_pid
== -1 &&
143 strncmp(pname
, tfp
->tfp_pname
,
144 sizeof (tfp
->tfp_pname
)) == 0)) {
145 *sotc
= tfp
->tfp_class
;
150 lck_mtx_unlock(tclass_lock
);
152 return ((tfp
== NULL
) ? 0 : 1);
156 * Purge entries with PIDs of exited processes
159 purge_tclass_for_proc(void)
162 struct tclass_for_proc
*tfp
, *tvar
;
164 lck_mtx_lock(tclass_lock
);
166 TAILQ_FOREACH_SAFE(tfp
, &tfp_head
, tfp_link
, tvar
) {
169 if (tfp
->tfp_pid
== -1)
171 if ((p
= proc_find(tfp
->tfp_pid
)) == NULL
) {
173 TAILQ_REMOVE(&tfp_head
, tfp
, tfp_link
);
181 lck_mtx_unlock(tclass_lock
);
188 * Must be called with tclass_lock held
191 free_tclass_for_proc(struct tclass_for_proc
*tfp
)
196 TAILQ_REMOVE(&tfp_head
, tfp
, tfp_link
);
204 flush_tclass_for_proc(void)
207 struct tclass_for_proc
*tfp
, *tvar
;
209 lck_mtx_lock(tclass_lock
);
211 TAILQ_FOREACH_SAFE(tfp
, &tfp_head
, tfp_link
, tvar
) {
212 free_tclass_for_proc(tfp
);
215 lck_mtx_unlock(tclass_lock
);
222 * Must be called with tclass_lock held
224 static struct tclass_for_proc
*
225 alloc_tclass_for_proc(pid_t pid
, const char *pname
)
227 struct tclass_for_proc
*tfp
;
229 if (pid
== -1 && pname
== NULL
)
232 tfp
= _MALLOC(sizeof (struct tclass_for_proc
), M_TEMP
, M_NOWAIT
|M_ZERO
);
238 * Add per pid entries before per proc name so we can find
239 * a specific instance of a process before the general name base entry.
242 TAILQ_INSERT_HEAD(&tfp_head
, tfp
, tfp_link
);
244 strlcpy(tfp
->tfp_pname
, pname
, sizeof (tfp
->tfp_pname
));
245 TAILQ_INSERT_TAIL(&tfp_head
, tfp
, tfp_link
);
254 * -1 for tclass means to remove the entry
257 set_pid_tclass(struct so_tcdbg
*so_tcdbg
)
261 struct filedesc
*fdp
;
263 struct tclass_for_proc
*tfp
;
265 pid_t pid
= so_tcdbg
->so_tcdbg_pid
;
266 int tclass
= so_tcdbg
->so_tcdbg_tclass
;
270 printf("%s proc_find(%d) failed\n", __func__
, pid
);
275 lck_mtx_lock(tclass_lock
);
277 tfp
= find_tfp_by_pid(pid
);
279 tfp
= alloc_tclass_for_proc(pid
, NULL
);
281 lck_mtx_unlock(tclass_lock
);
286 tfp
->tfp_class
= tclass
;
288 lck_mtx_unlock(tclass_lock
);
294 for (i
= 0; i
< fdp
->fd_nfiles
; i
++) {
297 fp
= fdp
->fd_ofiles
[i
];
299 (fdp
->fd_ofileflags
[i
] & UF_RESERVED
) != 0 ||
300 FILEGLOB_DTYPE(fp
->f_fglob
) != DTYPE_SOCKET
)
303 so
= (struct socket
*)fp
->f_fglob
->fg_data
;
304 if (SOCK_DOM(so
) != PF_INET
&& SOCK_DOM(so
) != PF_INET6
)
308 error
= so_set_traffic_class(so
, tclass
);
310 printf("%s: so_set_traffic_class"
311 "(so=0x%llx, fd=%d, tclass=%d) "
312 "failed %d\n", __func__
,
313 (uint64_t)VM_KERNEL_ADDRPERM(so
),
318 socket_unlock(so
, 1);
333 set_pname_tclass(struct so_tcdbg
*so_tcdbg
)
336 struct tclass_for_proc
*tfp
;
338 lck_mtx_lock(tclass_lock
);
340 tfp
= find_tfp_by_pname(so_tcdbg
->so_tcdbg_pname
);
342 tfp
= alloc_tclass_for_proc(-1, so_tcdbg
->so_tcdbg_pname
);
344 lck_mtx_unlock(tclass_lock
);
349 tfp
->tfp_class
= so_tcdbg
->so_tcdbg_tclass
;
351 lck_mtx_unlock(tclass_lock
);
360 flush_pid_tclass(struct so_tcdbg
*so_tcdbg
)
362 pid_t pid
= so_tcdbg
->so_tcdbg_pid
;
363 int tclass
= so_tcdbg
->so_tcdbg_tclass
;
364 struct filedesc
*fdp
;
370 if (p
== PROC_NULL
) {
371 printf("%s proc_find(%d) failed\n", __func__
, pid
);
377 for (i
= 0; i
< fdp
->fd_nfiles
; i
++) {
381 fp
= fdp
->fd_ofiles
[i
];
383 (fdp
->fd_ofileflags
[i
] & UF_RESERVED
) != 0 ||
384 FILEGLOB_DTYPE(fp
->f_fglob
) != DTYPE_SOCKET
)
387 so
= (struct socket
*)fp
->f_fglob
->fg_data
;
388 error
= sock_setsockopt(so
, SOL_SOCKET
, SO_FLUSH
, &tclass
,
391 printf("%s: setsockopt(SO_FLUSH) (so=0x%llx, fd=%d, "
392 "tclass=%d) failed %d\n", __func__
,
393 (uint64_t)VM_KERNEL_ADDRPERM(so
), i
, tclass
,
409 get_pid_tclass(struct so_tcdbg
*so_tcdbg
)
413 struct tclass_for_proc
*tfp
;
414 pid_t pid
= so_tcdbg
->so_tcdbg_pid
;
416 so_tcdbg
->so_tcdbg_tclass
= -1; /* Means not set */
417 so_tcdbg
->so_tcdbg_opportunistic
= -1; /* Means not set */
421 printf("%s proc_find(%d) failed\n", __func__
, pid
);
426 lck_mtx_lock(tclass_lock
);
428 tfp
= find_tfp_by_pid(pid
);
430 so_tcdbg
->so_tcdbg_tclass
= tfp
->tfp_class
;
433 lck_mtx_unlock(tclass_lock
);
442 get_pname_tclass(struct so_tcdbg
*so_tcdbg
)
445 struct tclass_for_proc
*tfp
;
447 so_tcdbg
->so_tcdbg_tclass
= -1; /* Means not set */
448 so_tcdbg
->so_tcdbg_opportunistic
= -1; /* Means not set */
451 lck_mtx_lock(tclass_lock
);
453 tfp
= find_tfp_by_pname(so_tcdbg
->so_tcdbg_pname
);
455 so_tcdbg
->so_tcdbg_tclass
= tfp
->tfp_class
;
458 lck_mtx_unlock(tclass_lock
);
464 delete_tclass_for_pid_pname(struct so_tcdbg
*so_tcdbg
)
467 pid_t pid
= so_tcdbg
->so_tcdbg_pid
;
468 struct tclass_for_proc
*tfp
= NULL
;
470 lck_mtx_lock(tclass_lock
);
473 tfp
= find_tfp_by_pid(pid
);
475 tfp
= find_tfp_by_pname(so_tcdbg
->so_tcdbg_pname
);
478 free_tclass_for_proc(tfp
);
482 lck_mtx_unlock(tclass_lock
);
488 * Setting options requires privileges
490 __private_extern__
int
491 so_set_tcdbg(struct socket
*so
, struct so_tcdbg
*so_tcdbg
)
495 if ((so
->so_state
& SS_PRIV
) == 0)
498 socket_unlock(so
, 0);
500 switch (so_tcdbg
->so_tcdbg_cmd
) {
502 error
= set_pid_tclass(so_tcdbg
);
506 error
= set_pname_tclass(so_tcdbg
);
510 error
= purge_tclass_for_proc();
514 error
= flush_tclass_for_proc();
517 case SO_TCDBG_DELETE
:
518 error
= delete_tclass_for_pid_pname(so_tcdbg
);
521 case SO_TCDBG_TCFLUSH_PID
:
522 error
= flush_pid_tclass(so_tcdbg
);
536 * Not required to be privileged to get
538 __private_extern__
int
539 sogetopt_tcdbg(struct socket
*so
, struct sockopt
*sopt
)
542 struct so_tcdbg so_tcdbg
;
544 size_t len
= sopt
->sopt_valsize
;
546 error
= sooptcopyin(sopt
, &so_tcdbg
, sizeof (struct so_tcdbg
),
547 sizeof (struct so_tcdbg
));
551 sopt
->sopt_valsize
= len
;
553 socket_unlock(so
, 0);
555 switch (so_tcdbg
.so_tcdbg_cmd
) {
557 error
= get_pid_tclass(&so_tcdbg
);
561 error
= get_pname_tclass(&so_tcdbg
);
565 lck_mtx_lock(tclass_lock
);
566 so_tcdbg
.so_tcdbg_count
= tfp_count
;
567 lck_mtx_unlock(tclass_lock
);
570 case SO_TCDBG_LIST
: {
571 struct tclass_for_proc
*tfp
;
573 struct so_tcdbg
*ptr
;
575 lck_mtx_lock(tclass_lock
);
576 if ((alloc_count
= tfp_count
) == 0) {
577 lck_mtx_unlock(tclass_lock
);
581 len
= alloc_count
* sizeof (struct so_tcdbg
);
582 lck_mtx_unlock(tclass_lock
);
584 buf
= _MALLOC(len
, M_TEMP
, M_WAITOK
| M_ZERO
);
590 lck_mtx_lock(tclass_lock
);
592 ptr
= (struct so_tcdbg
*)buf
;
593 TAILQ_FOREACH(tfp
, &tfp_head
, tfp_link
) {
594 if (++n
> alloc_count
)
596 if (tfp
->tfp_pid
!= -1) {
597 ptr
->so_tcdbg_cmd
= SO_TCDBG_PID
;
598 ptr
->so_tcdbg_pid
= tfp
->tfp_pid
;
600 ptr
->so_tcdbg_cmd
= SO_TCDBG_PNAME
;
601 ptr
->so_tcdbg_pid
= -1;
602 strlcpy(ptr
->so_tcdbg_pname
,
604 sizeof (ptr
->so_tcdbg_pname
));
606 ptr
->so_tcdbg_tclass
= tfp
->tfp_class
;
610 lck_mtx_unlock(tclass_lock
);
623 error
= sooptcopyout(sopt
, &so_tcdbg
,
624 sizeof (struct so_tcdbg
));
626 error
= sooptcopyout(sopt
, buf
, len
);
634 __private_extern__
int
635 so_set_traffic_class(struct socket
*so
, int optval
)
639 if (optval
< SO_TC_BE
|| optval
> SO_TC_CTL
) {
653 if (!SO_VALID_TC(optval
))
659 int oldval
= so
->so_traffic_class
;
661 VERIFY(SO_VALID_TC(optval
));
662 so
->so_traffic_class
= optval
;
664 if ((SOCK_DOM(so
) == PF_INET
||
665 SOCK_DOM(so
) == PF_INET6
) &&
666 SOCK_TYPE(so
) == SOCK_STREAM
)
667 set_tcp_stream_priority(so
);
669 if ((SOCK_DOM(so
) == PF_INET
||
670 SOCK_DOM(so
) == PF_INET6
) &&
671 optval
!= oldval
&& (optval
== SO_TC_BK_SYS
||
672 oldval
== SO_TC_BK_SYS
)) {
674 * If the app switches from BK_SYS to something
675 * else, resume the socket if it was suspended.
677 if (oldval
== SO_TC_BK_SYS
)
678 inp_reset_fc_state(so
->so_pcb
);
680 SOTHROTTLELOG(("throttle[%d]: so 0x%llx "
681 "[%d,%d] opportunistic %s\n", so
->last_pid
,
682 (uint64_t)VM_KERNEL_ADDRPERM(so
),
683 SOCK_DOM(so
), SOCK_TYPE(so
),
684 (optval
== SO_TC_BK_SYS
) ? "ON" : "OFF"));
691 __private_extern__
void
692 so_set_default_traffic_class(struct socket
*so
)
697 (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
)) {
698 get_tclass_for_curr_proc(&sotc
);
701 so
->so_traffic_class
= (sotc
!= -1) ? sotc
: SO_TC_BE
;
704 __private_extern__
int
705 so_set_opportunistic(struct socket
*so
, int optval
)
707 return (so_set_traffic_class(so
, (optval
== 0) ?
708 SO_TC_BE
: SO_TC_BK_SYS
));
711 __private_extern__
int
712 so_get_opportunistic(struct socket
*so
)
714 return (so
->so_traffic_class
== SO_TC_BK_SYS
);
717 __private_extern__ mbuf_svc_class_t
718 mbuf_service_class_from_control(struct mbuf
*control
)
721 mbuf_svc_class_t msc
= MBUF_SC_UNSPEC
;
723 for (cm
= M_FIRST_CMSGHDR(control
); cm
!= NULL
;
724 cm
= M_NXT_CMSGHDR(control
, cm
)) {
727 if (cm
->cmsg_len
< sizeof (struct cmsghdr
))
730 if (cm
->cmsg_level
!= SOL_SOCKET
||
731 cm
->cmsg_type
!= SO_TRAFFIC_CLASS
)
733 if (cm
->cmsg_len
!= CMSG_LEN(sizeof (int)))
736 tc
= *(int *)(void *)CMSG_DATA(cm
);
738 if (MBUF_VALID_SC(msc
))
745 __private_extern__
int
746 dscp_code_from_mbuf_tclass(mbuf_traffic_class_t mtc
)
769 __private_extern__
void
770 so_recv_data_stat(struct socket
*so
, struct mbuf
*m
, size_t off
)
772 uint32_t sotc
= m_get_traffic_class(m
);
774 if (sotc
>= SO_TC_STATS_MAX
)
777 so
->so_tc_stats
[sotc
].rxpackets
+= 1;
778 so
->so_tc_stats
[sotc
].rxbytes
+=
779 ((m
->m_flags
& M_PKTHDR
) ? m
->m_pkthdr
.len
: 0) + off
;
782 __private_extern__
void
783 so_inc_recv_data_stat(struct socket
*so
, size_t pkts
, size_t bytes
, uint32_t tc
)
785 if (tc
>= SO_TC_STATS_MAX
)
788 so
->so_tc_stats
[tc
].rxpackets
+= pkts
;
789 so
->so_tc_stats
[tc
].rxbytes
+=bytes
;
793 so_throttle_best_effort(struct socket
*so
, struct ifnet
*ifp
)
795 u_int32_t uptime
= net_uptime();
796 return (soissrcbesteffort(so
) &&
797 net_io_policy_throttle_best_effort
== 1 &&
798 ifp
->if_rt_sendts
> 0 &&
799 (int)(uptime
- ifp
->if_rt_sendts
) <= TCP_BG_SWITCH_TIME
);
802 __private_extern__
void
803 set_tcp_stream_priority(struct socket
*so
)
805 struct inpcb
*inp
= sotoinpcb(so
);
806 struct tcpcb
*tp
= intotcpcb(inp
);
807 struct ifnet
*outifp
;
808 u_char old_cc
= tp
->tcp_cc_index
;
809 int recvbg
= IS_TCP_RECV_BG(so
);
810 bool is_local
= false, fg_active
= false;
813 VERIFY((SOCK_CHECK_DOM(so
, PF_INET
)
814 || SOCK_CHECK_DOM(so
, PF_INET6
))
815 && SOCK_CHECK_TYPE(so
, SOCK_STREAM
)
816 && SOCK_CHECK_PROTO(so
, IPPROTO_TCP
));
818 /* Return if the socket is in a terminal state */
819 if (inp
->inp_state
== INPCB_STATE_DEAD
)
822 outifp
= inp
->inp_last_outifp
;
823 uptime
= net_uptime();
826 * If the socket was marked as a background socket or if the
827 * traffic class is set to background with traffic class socket
828 * option then make both send and recv side of the stream to be
829 * background. The variable sotcdb which can be set with sysctl
830 * is used to disable these settings for testing.
832 if (outifp
== NULL
|| (outifp
->if_flags
& IFF_LOOPBACK
))
835 /* Check if there has been recent foreground activity */
836 if (outifp
!= NULL
) {
838 * If the traffic source is background, check if
839 * it can be switched to foreground. This can
840 * happen when there is no indication of foreground
843 if (soissrcbackground(so
) &&
844 ((outifp
->if_fg_sendts
> 0 &&
845 (int)(uptime
- outifp
->if_fg_sendts
) <=
846 TCP_BG_SWITCH_TIME
) || net_io_policy_throttled
))
850 * The traffic source is best-effort -- check if
851 * the policy to throttle best effort is enabled
852 * and there was realtime activity on this
853 * interface recently. If this is true, enable
854 * algorithms that respond to increased latency
855 * on best-effort traffic.
857 if (so_throttle_best_effort(so
, outifp
))
862 * System initiated background traffic like cloud uploads should
863 * always use background delay sensitive algorithms. This will
864 * make the stream more responsive to other streams on the user's
865 * network and it will minimize latency induced.
867 if (fg_active
|| IS_SO_TC_BACKGROUNDSYSTEM(so
->so_traffic_class
)) {
869 * If the interface that the connection is using is
870 * loopback, do not use background congestion
873 * If there has been recent foreground activity or if
874 * there was an indication that a foreground application
875 * is going to use networking (net_io_policy_throttled),
876 * switch the background streams to use background
877 * congestion control algorithm. Otherwise, even background
878 * flows can move into foreground.
880 if ((sotcdb
& SOTCDB_NO_SENDTCPBG
) != 0 || is_local
||
881 !IS_SO_TC_BACKGROUNDSYSTEM(so
->so_traffic_class
)) {
882 if (old_cc
== TCP_CC_ALGO_BACKGROUND_INDEX
)
883 tcp_set_foreground_cc(so
);
885 if (old_cc
!= TCP_CC_ALGO_BACKGROUND_INDEX
)
886 tcp_set_background_cc(so
);
889 /* Set receive side background flags */
890 if ((sotcdb
& SOTCDB_NO_RECVTCPBG
) != 0 || is_local
||
891 !IS_SO_TC_BACKGROUNDSYSTEM(so
->so_traffic_class
)) {
892 tcp_clear_recv_bg(so
);
897 tcp_clear_recv_bg(so
);
898 if (old_cc
== TCP_CC_ALGO_BACKGROUND_INDEX
)
899 tcp_set_foreground_cc(so
);
902 if (old_cc
!= tp
->tcp_cc_index
|| recvbg
!= IS_TCP_RECV_BG(so
)) {
903 SOTHROTTLELOG(("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; "
904 "%s recv\n", so
->last_pid
, (uint64_t)VM_KERNEL_ADDRPERM(so
),
905 SOCK_DOM(so
), SOCK_TYPE(so
),
906 (tp
->tcp_cc_index
== TCP_CC_ALGO_BACKGROUND_INDEX
) ?
907 "background" : "foreground",
908 IS_TCP_RECV_BG(so
) ? "background" : "foreground"));
913 * Set traffic class to an IPv4 or IPv6 packet
915 * - set the DSCP code following the WMM mapping
917 __private_extern__
void
918 set_packet_service_class(struct mbuf
*m
, struct socket
*so
,
919 mbuf_svc_class_t in_msc
, u_int32_t flags
)
921 mbuf_svc_class_t msc
= MBUF_SC_BE
; /* Best effort by default */
922 struct inpcb
*inp
= sotoinpcb(so
); /* in6pcb and inpcb are the same */
923 struct ip
*ip
= mtod(m
, struct ip
*);
925 struct ip6_hdr
*ip6
= mtod(m
, struct ip6_hdr
*);
927 int isipv6
= ((flags
& PKT_SCF_IPV6
) != 0) ? 1 : 0;
929 if (!(m
->m_flags
& M_PKTHDR
))
933 * Here is the precedence:
934 * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all
935 * 2) Traffic class passed via ancillary data to sendmsg(2)
936 * 3) Traffic class socket option last
938 if (in_msc
!= MBUF_SC_UNSPEC
) {
939 if (in_msc
>= MBUF_SC_BE
&& in_msc
<= MBUF_SC_CTL
)
942 VERIFY(SO_VALID_TC(so
->so_traffic_class
));
943 msc
= so_tc2msc(so
->so_traffic_class
);
944 /* Assert because tc must have been valid */
945 VERIFY(MBUF_VALID_SC(msc
));
949 * If TRAFFIC_MGT_SO_BACKGROUND is set or policy to throttle
950 * best effort is set, depress the priority.
952 if (!IS_MBUF_SC_BACKGROUND(msc
) && soisthrottled(so
))
955 if (IS_MBUF_SC_BESTEFFORT(msc
) && inp
->inp_last_outifp
!= NULL
&&
956 so_throttle_best_effort(so
, inp
->inp_last_outifp
))
959 if (soissrcbackground(so
))
960 m
->m_pkthdr
.pkt_flags
|= PKTF_SO_BACKGROUND
;
962 if (soissrcrealtime(so
) || IS_MBUF_SC_REALTIME(msc
))
963 m
->m_pkthdr
.pkt_flags
|= PKTF_SO_REALTIME
;
965 * Set the traffic class in the mbuf packet header svc field
967 if (sotcdb
& SOTCDB_NO_MTC
)
970 /* Elevate service class if the packet is a pure TCP ACK.
971 * We can do this only when the flow is not a background
972 * flow and the outgoing interface supports
973 * transmit-start model.
975 if (!IS_MBUF_SC_BACKGROUND(msc
) && (flags
& PKT_SCF_TCP_ACK
))
978 (void) m_set_service_class(m
, msc
);
981 * Set the privileged traffic auxiliary flag if applicable,
984 if (!(sotcdb
& SOTCDB_NO_PRIVILEGED
) && soisprivilegedtraffic(so
) &&
985 msc
!= MBUF_SC_UNSPEC
)
986 m
->m_pkthdr
.pkt_flags
|= PKTF_PRIO_PRIVILEGED
;
988 m
->m_pkthdr
.pkt_flags
&= ~PKTF_PRIO_PRIVILEGED
;
992 * Quick exit when best effort
994 if (msc
== MBUF_SC_BE
)
998 * The default behavior is for the networking stack to not set the
999 * DSCP code, based on SOTCDB_NO_DSCP being set. If the flag is
1000 * cleared, set the DSCP code in IPv4 or IPv6 header only for local
1001 * traffic, if it is not already set. <rdar://problem/11277343>
1003 if (sotcdb
& SOTCDB_NO_DSCP
)
1007 * Test if an IP TOS or IPV6 TCLASS has already been set
1008 * on the socket or the raw packet.
1010 if (!(sotcdb
& SOTCDB_NO_DSCPTST
)) {
1013 if ((so
->so_type
== SOCK_RAW
&&
1014 (ip6
->ip6_flow
& htonl(0xff << 20)) != 0) ||
1015 (inp
->in6p_outputopts
&&
1016 inp
->in6p_outputopts
->ip6po_tclass
!= -1))
1020 if ((so
->so_type
== SOCK_RAW
&&
1021 (inp
->inp_flags
& INP_HDRINCL
)) ||
1022 inp
->inp_ip_tos
!= 0)
1027 * Test if destination is local
1029 if (!(sotcdb
& SOTCDB_NO_LCLTST
)) {
1031 struct rtentry
*rt
= inp
->inp_route
.ro_rt
;
1033 if (so
->so_type
== SOCK_STREAM
) {
1034 if (intotcpcb(inp
)->t_flags
& TF_LOCAL
)
1036 } else if (rt
!= NULL
&&
1037 (rt
->rt_gateway
->sa_family
== AF_LINK
||
1038 (rt
->rt_ifp
->if_flags
& (IFF_LOOPBACK
|IFF_POINTOPOINT
)))) {
1039 if (!(rt
->rt_ifp
->if_flags
& IFF_POINTOPOINT
))
1043 if (isipv6
&& in6addr_local(&ip6
->ip6_dst
)) {
1047 if (inaddr_local(ip
->ip_dst
)) {
1056 ip6
->ip6_flow
|= htonl(dscp_code_from_mbuf_tclass(
1057 m_get_traffic_class(m
)) << 20);
1060 ip
->ip_tos
|= dscp_code_from_mbuf_tclass(
1061 m_get_traffic_class(m
)) << 2;
1065 * For TCP with background traffic class switch CC algo based on sysctl
1067 if (so
->so_type
== SOCK_STREAM
)
1068 set_tcp_stream_priority(so
);
1070 so_tc_update_stats(m
, so
, msc
);
1073 __private_extern__
void
1074 so_tc_update_stats(struct mbuf
*m
, struct socket
*so
, mbuf_svc_class_t msc
)
1076 mbuf_traffic_class_t mtc
;
1079 * Assume socket and mbuf traffic class values are the same
1080 * Also assume the socket lock is held. Note that the stats
1081 * at the socket layer are reduced down to the legacy traffic
1082 * classes; we could/should potentially expand so_tc_stats[].
1084 mtc
= MBUF_SC2TC(msc
);
1085 VERIFY(mtc
< SO_TC_STATS_MAX
);
1086 so
->so_tc_stats
[mtc
].txpackets
+= 1;
1087 so
->so_tc_stats
[mtc
].txbytes
+= m
->m_pkthdr
.len
;
1090 __private_extern__
void
1091 socket_tclass_init(void)
1093 _CASSERT(_SO_TC_MAX
== SO_TC_STATS_MAX
);
1095 tclass_lck_grp_attr
= lck_grp_attr_alloc_init();
1096 tclass_lck_grp
= lck_grp_alloc_init("tclass", tclass_lck_grp_attr
);
1097 tclass_lck_attr
= lck_attr_alloc_init();
1098 lck_mtx_init(tclass_lock
, tclass_lck_grp
, tclass_lck_attr
);
1101 __private_extern__ mbuf_svc_class_t
1104 mbuf_svc_class_t msc
;
1108 msc
= MBUF_SC_BK_SYS
;
1142 msc
= MBUF_SC_UNSPEC
;
1149 __private_extern__
int
1150 so_svc2tc(mbuf_svc_class_t svc
)
1153 case MBUF_SC_UNSPEC
:
1155 case MBUF_SC_BK_SYS
:
1156 return SO_TC_BK_SYS
;
1181 * LRO is turned on for AV streaming class.
1184 so_set_lro(struct socket
*so
, int optval
)
1186 if (optval
== SO_TC_AV
) {
1187 so
->so_flags
|= SOF_USELRO
;
1189 if (so
->so_flags
& SOF_USELRO
) {
1190 /* transition to non LRO class */
1191 so
->so_flags
&= ~SOF_USELRO
;
1192 struct inpcb
*inp
= sotoinpcb(so
);
1193 struct tcpcb
*tp
= NULL
;
1195 tp
= intotcpcb(inp
);
1196 if (tp
&& (tp
->t_flagsext
& TF_LRO_OFFLOADED
)) {
1197 tcp_lro_remove_state(inp
->inp_laddr
,
1201 tp
->t_flagsext
&= ~TF_LRO_OFFLOADED
;