]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/in_tclass.c
20a37fd9f2ec6f4eeb8c2e8797368fc4243b1234
[apple/xnu.git] / bsd / netinet / in_tclass.c
1 /*
2 * Copyright (c) 2009-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/systm.h>
30 #include <sys/kernel.h>
31 #include <sys/types.h>
32 #include <sys/filedesc.h>
33 #include <sys/file_internal.h>
34 #include <sys/proc.h>
35 #include <sys/socket.h>
36 #include <sys/socketvar.h>
37 #include <sys/errno.h>
38 #include <sys/protosw.h>
39 #include <sys/domain.h>
40 #include <sys/mbuf.h>
41 #include <sys/queue.h>
42
43 #include <net/if.h>
44 #include <net/route.h>
45
46 #include <netinet/in.h>
47 #include <netinet/in_var.h>
48 #include <netinet/in_pcb.h>
49 #include <netinet/ip.h>
50 #include <netinet/ip_var.h>
51 #include <netinet/ip6.h>
52 #include <netinet6/ip6_var.h>
53 #include <netinet/udp.h>
54 #include <netinet/udp_var.h>
55 #include <netinet/tcp.h>
56 #include <netinet/tcp_var.h>
57 #include <netinet/tcp_cc.h>
58 #include <netinet/lro_ext.h>
59
60 extern char *proc_name_address(void *p);
61
62 static int tfp_count = 0;
63
64 static TAILQ_HEAD(, tclass_for_proc) tfp_head =
65 TAILQ_HEAD_INITIALIZER(tfp_head);
66
67 struct tclass_for_proc {
68 TAILQ_ENTRY(tclass_for_proc) tfp_link;
69 int tfp_class;
70 pid_t tfp_pid;
71 char tfp_pname[MAXCOMLEN + 1];
72 };
73
74 static int dscp_code_from_mbuf_tclass(mbuf_traffic_class_t);
75 static int get_pid_tclass(struct so_tcdbg *);
76 static int get_pname_tclass(struct so_tcdbg *);
77 static int set_pid_tclass(struct so_tcdbg *);
78 static int set_pname_tclass(struct so_tcdbg *);
79 static int flush_pid_tclass(struct so_tcdbg *);
80 static int purge_tclass_for_proc(void);
81 static int flush_tclass_for_proc(void);
82 int get_tclass_for_curr_proc(int *);
83 static inline int so_throttle_best_effort(struct socket* ,struct ifnet *);
84
85 static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */
86 static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */
87 static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */
88 decl_lck_mtx_data(static, tclass_lock_data);
89 static lck_mtx_t *tclass_lock = &tclass_lock_data;
90
91 /*
92 * If there is no foreground activity on the interface for bg_switch_time
93 * seconds, the background connections can switch to foreground TCP
94 * congestion control.
95 */
96 #define TCP_BG_SWITCH_TIME 2 /* seconds */
97
98 /*
99 * Must be called with tclass_lock held
100 */
101 static struct tclass_for_proc *
102 find_tfp_by_pid(pid_t pid)
103 {
104 struct tclass_for_proc *tfp;
105
106 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
107 if (tfp->tfp_pid == pid)
108 break;
109 }
110 return (tfp);
111 }
112
113 /*
114 * Must be called with tclass_lock held
115 */
116 static struct tclass_for_proc *
117 find_tfp_by_pname(const char *pname)
118 {
119 struct tclass_for_proc *tfp;
120
121 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
122 if (strncmp(pname, tfp->tfp_pname,
123 sizeof (tfp->tfp_pname)) == 0)
124 break;
125 }
126 return (tfp);
127 }
128
129 __private_extern__ int
130 get_tclass_for_curr_proc(int *sotc)
131 {
132 struct tclass_for_proc *tfp = NULL;
133 proc_t p = current_proc(); /* Not ref counted */
134 pid_t pid = proc_pid(p);
135 char *pname = proc_name_address(p);
136
137 *sotc = -1;
138
139 lck_mtx_lock(tclass_lock);
140
141 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
142 if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 &&
143 strncmp(pname, tfp->tfp_pname,
144 sizeof (tfp->tfp_pname)) == 0)) {
145 *sotc = tfp->tfp_class;
146 break;
147 }
148 }
149
150 lck_mtx_unlock(tclass_lock);
151
152 return ((tfp == NULL) ? 0 : 1);
153 }
154
155 /*
156 * Purge entries with PIDs of exited processes
157 */
158 int
159 purge_tclass_for_proc(void)
160 {
161 int error = 0;
162 struct tclass_for_proc *tfp, *tvar;
163
164 lck_mtx_lock(tclass_lock);
165
166 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
167 proc_t p;
168
169 if (tfp->tfp_pid == -1)
170 continue;
171 if ((p = proc_find(tfp->tfp_pid)) == NULL) {
172 tfp_count--;
173 TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
174
175 _FREE(tfp, M_TEMP);
176 } else {
177 proc_rele(p);
178 }
179 }
180
181 lck_mtx_unlock(tclass_lock);
182
183 return (error);
184 }
185
186 /*
187 * Remove one entry
188 * Must be called with tclass_lock held
189 */
190 static void
191 free_tclass_for_proc(struct tclass_for_proc *tfp)
192 {
193 if (tfp == NULL)
194 return;
195 tfp_count--;
196 TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
197 _FREE(tfp, M_TEMP);
198 }
199
200 /*
201 * Remove all entries
202 */
203 int
204 flush_tclass_for_proc(void)
205 {
206 int error = 0;
207 struct tclass_for_proc *tfp, *tvar;
208
209 lck_mtx_lock(tclass_lock);
210
211 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
212 free_tclass_for_proc(tfp);
213 }
214
215 lck_mtx_unlock(tclass_lock);
216
217 return (error);
218
219 }
220
221 /*
222 * Must be called with tclass_lock held
223 */
224 static struct tclass_for_proc *
225 alloc_tclass_for_proc(pid_t pid, const char *pname)
226 {
227 struct tclass_for_proc *tfp;
228
229 if (pid == -1 && pname == NULL)
230 return (NULL);
231
232 tfp = _MALLOC(sizeof (struct tclass_for_proc), M_TEMP, M_NOWAIT|M_ZERO);
233 if (tfp == NULL)
234 return (NULL);
235
236 tfp->tfp_pid = pid;
237 /*
238 * Add per pid entries before per proc name so we can find
239 * a specific instance of a process before the general name base entry.
240 */
241 if (pid != -1) {
242 TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link);
243 } else {
244 strlcpy(tfp->tfp_pname, pname, sizeof (tfp->tfp_pname));
245 TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link);
246 }
247
248 tfp_count++;
249
250 return (tfp);
251 }
252
253 /*
254 * -1 for tclass means to remove the entry
255 */
256 int
257 set_pid_tclass(struct so_tcdbg *so_tcdbg)
258 {
259 int error = EINVAL;
260 proc_t p = NULL;
261 struct filedesc *fdp;
262 struct fileproc *fp;
263 struct tclass_for_proc *tfp;
264 int i;
265 pid_t pid = so_tcdbg->so_tcdbg_pid;
266 int tclass = so_tcdbg->so_tcdbg_tclass;
267
268 p = proc_find(pid);
269 if (p == NULL) {
270 printf("%s proc_find(%d) failed\n", __func__, pid);
271 goto done;
272 }
273
274 /* Need a tfp */
275 lck_mtx_lock(tclass_lock);
276
277 tfp = find_tfp_by_pid(pid);
278 if (tfp == NULL) {
279 tfp = alloc_tclass_for_proc(pid, NULL);
280 if (tfp == NULL) {
281 lck_mtx_unlock(tclass_lock);
282 error = ENOBUFS;
283 goto done;
284 }
285 }
286 tfp->tfp_class = tclass;
287
288 lck_mtx_unlock(tclass_lock);
289
290 if (tfp != NULL) {
291 proc_fdlock(p);
292
293 fdp = p->p_fd;
294 for (i = 0; i < fdp->fd_nfiles; i++) {
295 struct socket *so;
296
297 fp = fdp->fd_ofiles[i];
298 if (fp == NULL ||
299 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
300 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
301 continue;
302
303 so = (struct socket *)fp->f_fglob->fg_data;
304 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6)
305 continue;
306 socket_lock(so, 1);
307 if (tclass != -1) {
308 error = so_set_traffic_class(so, tclass);
309 if (error != 0) {
310 printf("%s: so_set_traffic_class"
311 "(so=0x%llx, fd=%d, tclass=%d) "
312 "failed %d\n", __func__,
313 (uint64_t)VM_KERNEL_ADDRPERM(so),
314 i, tclass, error);
315 error = 0;
316 }
317 }
318 socket_unlock(so, 1);
319 }
320
321 proc_fdunlock(p);
322 }
323
324 error = 0;
325 done:
326 if (p != NULL)
327 proc_rele(p);
328
329 return (error);
330 }
331
332 int
333 set_pname_tclass(struct so_tcdbg *so_tcdbg)
334 {
335 int error = EINVAL;
336 struct tclass_for_proc *tfp;
337
338 lck_mtx_lock(tclass_lock);
339
340 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
341 if (tfp == NULL) {
342 tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname);
343 if (tfp == NULL) {
344 lck_mtx_unlock(tclass_lock);
345 error = ENOBUFS;
346 goto done;
347 }
348 }
349 tfp->tfp_class = so_tcdbg->so_tcdbg_tclass;
350
351 lck_mtx_unlock(tclass_lock);
352
353 error = 0;
354 done:
355
356 return (error);
357 }
358
359 static int
360 flush_pid_tclass(struct so_tcdbg *so_tcdbg)
361 {
362 pid_t pid = so_tcdbg->so_tcdbg_pid;
363 int tclass = so_tcdbg->so_tcdbg_tclass;
364 struct filedesc *fdp;
365 int error = EINVAL;
366 proc_t p;
367 int i;
368
369 p = proc_find(pid);
370 if (p == PROC_NULL) {
371 printf("%s proc_find(%d) failed\n", __func__, pid);
372 goto done;
373 }
374
375 proc_fdlock(p);
376 fdp = p->p_fd;
377 for (i = 0; i < fdp->fd_nfiles; i++) {
378 struct socket *so;
379 struct fileproc *fp;
380
381 fp = fdp->fd_ofiles[i];
382 if (fp == NULL ||
383 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
384 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
385 continue;
386
387 so = (struct socket *)fp->f_fglob->fg_data;
388 error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass,
389 sizeof (tclass));
390 if (error != 0) {
391 printf("%s: setsockopt(SO_FLUSH) (so=0x%llx, fd=%d, "
392 "tclass=%d) failed %d\n", __func__,
393 (uint64_t)VM_KERNEL_ADDRPERM(so), i, tclass,
394 error);
395 error = 0;
396 }
397 }
398 proc_fdunlock(p);
399
400 error = 0;
401 done:
402 if (p != PROC_NULL)
403 proc_rele(p);
404
405 return (error);
406 }
407
408 int
409 get_pid_tclass(struct so_tcdbg *so_tcdbg)
410 {
411 int error = EINVAL;
412 proc_t p = NULL;
413 struct tclass_for_proc *tfp;
414 pid_t pid = so_tcdbg->so_tcdbg_pid;
415
416 so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */
417 so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */
418
419 p = proc_find(pid);
420 if (p == NULL) {
421 printf("%s proc_find(%d) failed\n", __func__, pid);
422 goto done;
423 }
424
425 /* Need a tfp */
426 lck_mtx_lock(tclass_lock);
427
428 tfp = find_tfp_by_pid(pid);
429 if (tfp != NULL) {
430 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
431 error = 0;
432 }
433 lck_mtx_unlock(tclass_lock);
434 done:
435 if (p != NULL)
436 proc_rele(p);
437
438 return (error);
439 }
440
441 int
442 get_pname_tclass(struct so_tcdbg *so_tcdbg)
443 {
444 int error = EINVAL;
445 struct tclass_for_proc *tfp;
446
447 so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */
448 so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */
449
450 /* Need a tfp */
451 lck_mtx_lock(tclass_lock);
452
453 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
454 if (tfp != NULL) {
455 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
456 error = 0;
457 }
458 lck_mtx_unlock(tclass_lock);
459
460 return (error);
461 }
462
463 static int
464 delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg)
465 {
466 int error = EINVAL;
467 pid_t pid = so_tcdbg->so_tcdbg_pid;
468 struct tclass_for_proc *tfp = NULL;
469
470 lck_mtx_lock(tclass_lock);
471
472 if (pid != -1)
473 tfp = find_tfp_by_pid(pid);
474 else
475 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
476
477 if (tfp != NULL) {
478 free_tclass_for_proc(tfp);
479 error = 0;
480 }
481
482 lck_mtx_unlock(tclass_lock);
483
484 return (error);
485 }
486
487 /*
488 * Setting options requires privileges
489 */
490 __private_extern__ int
491 so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg)
492 {
493 int error = 0;
494
495 if ((so->so_state & SS_PRIV) == 0)
496 return (EPERM);
497
498 socket_unlock(so, 0);
499
500 switch (so_tcdbg->so_tcdbg_cmd) {
501 case SO_TCDBG_PID:
502 error = set_pid_tclass(so_tcdbg);
503 break;
504
505 case SO_TCDBG_PNAME:
506 error = set_pname_tclass(so_tcdbg);
507 break;
508
509 case SO_TCDBG_PURGE:
510 error = purge_tclass_for_proc();
511 break;
512
513 case SO_TCDBG_FLUSH:
514 error = flush_tclass_for_proc();
515 break;
516
517 case SO_TCDBG_DELETE:
518 error = delete_tclass_for_pid_pname(so_tcdbg);
519 break;
520
521 case SO_TCDBG_TCFLUSH_PID:
522 error = flush_pid_tclass(so_tcdbg);
523 break;
524
525 default:
526 error = EINVAL;
527 break;
528 }
529
530 socket_lock(so, 0);
531
532 return (error);
533 }
534
535 /*
536 * Not required to be privileged to get
537 */
538 __private_extern__ int
539 sogetopt_tcdbg(struct socket *so, struct sockopt *sopt)
540 {
541 int error = 0;
542 struct so_tcdbg so_tcdbg;
543 void *buf = NULL;
544 size_t len = sopt->sopt_valsize;
545
546 error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg),
547 sizeof (struct so_tcdbg));
548 if (error != 0)
549 return (error);
550
551 sopt->sopt_valsize = len;
552
553 socket_unlock(so, 0);
554
555 switch (so_tcdbg.so_tcdbg_cmd) {
556 case SO_TCDBG_PID:
557 error = get_pid_tclass(&so_tcdbg);
558 break;
559
560 case SO_TCDBG_PNAME:
561 error = get_pname_tclass(&so_tcdbg);
562 break;
563
564 case SO_TCDBG_COUNT:
565 lck_mtx_lock(tclass_lock);
566 so_tcdbg.so_tcdbg_count = tfp_count;
567 lck_mtx_unlock(tclass_lock);
568 break;
569
570 case SO_TCDBG_LIST: {
571 struct tclass_for_proc *tfp;
572 int n, alloc_count;
573 struct so_tcdbg *ptr;
574
575 lck_mtx_lock(tclass_lock);
576 if ((alloc_count = tfp_count) == 0) {
577 lck_mtx_unlock(tclass_lock);
578 error = EINVAL;
579 break;
580 }
581 len = alloc_count * sizeof (struct so_tcdbg);
582 lck_mtx_unlock(tclass_lock);
583
584 buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
585 if (buf == NULL) {
586 error = ENOBUFS;
587 break;
588 }
589
590 lck_mtx_lock(tclass_lock);
591 n = 0;
592 ptr = (struct so_tcdbg *)buf;
593 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
594 if (++n > alloc_count)
595 break;
596 if (tfp->tfp_pid != -1) {
597 ptr->so_tcdbg_cmd = SO_TCDBG_PID;
598 ptr->so_tcdbg_pid = tfp->tfp_pid;
599 } else {
600 ptr->so_tcdbg_cmd = SO_TCDBG_PNAME;
601 ptr->so_tcdbg_pid = -1;
602 strlcpy(ptr->so_tcdbg_pname,
603 tfp->tfp_pname,
604 sizeof (ptr->so_tcdbg_pname));
605 }
606 ptr->so_tcdbg_tclass = tfp->tfp_class;
607 ptr++;
608 }
609
610 lck_mtx_unlock(tclass_lock);
611 }
612 break;
613
614 default:
615 error = EINVAL;
616 break;
617 }
618
619 socket_lock(so, 0);
620
621 if (error == 0) {
622 if (buf == NULL) {
623 error = sooptcopyout(sopt, &so_tcdbg,
624 sizeof (struct so_tcdbg));
625 } else {
626 error = sooptcopyout(sopt, buf, len);
627 _FREE(buf, M_TEMP);
628 }
629 }
630 return (error);
631 }
632
633
634 __private_extern__ int
635 so_set_traffic_class(struct socket *so, int optval)
636 {
637 int error = 0;
638
639 if (optval < SO_TC_BE || optval > SO_TC_CTL) {
640 error = EINVAL;
641 } else {
642 switch (optval) {
643 case _SO_TC_BK:
644 optval = SO_TC_BK;
645 break;
646 case _SO_TC_VI:
647 optval = SO_TC_VI;
648 break;
649 case _SO_TC_VO:
650 optval = SO_TC_VO;
651 break;
652 default:
653 if (!SO_VALID_TC(optval))
654 error = EINVAL;
655 break;
656 }
657
658 if (error == 0) {
659 int oldval = so->so_traffic_class;
660
661 VERIFY(SO_VALID_TC(optval));
662 so->so_traffic_class = optval;
663
664 if ((SOCK_DOM(so) == PF_INET ||
665 SOCK_DOM(so) == PF_INET6) &&
666 SOCK_TYPE(so) == SOCK_STREAM)
667 set_tcp_stream_priority(so);
668
669 if ((SOCK_DOM(so) == PF_INET ||
670 SOCK_DOM(so) == PF_INET6) &&
671 optval != oldval && (optval == SO_TC_BK_SYS ||
672 oldval == SO_TC_BK_SYS)) {
673 /*
674 * If the app switches from BK_SYS to something
675 * else, resume the socket if it was suspended.
676 */
677 if (oldval == SO_TC_BK_SYS)
678 inp_reset_fc_state(so->so_pcb);
679
680 SOTHROTTLELOG(("throttle[%d]: so 0x%llx "
681 "[%d,%d] opportunistic %s\n", so->last_pid,
682 (uint64_t)VM_KERNEL_ADDRPERM(so),
683 SOCK_DOM(so), SOCK_TYPE(so),
684 (optval == SO_TC_BK_SYS) ? "ON" : "OFF"));
685 }
686 }
687 }
688 return (error);
689 }
690
691 __private_extern__ void
692 so_set_default_traffic_class(struct socket *so)
693 {
694 int sotc = -1;
695
696 if (tfp_count > 0 &&
697 (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)) {
698 get_tclass_for_curr_proc(&sotc);
699 }
700
701 so->so_traffic_class = (sotc != -1) ? sotc : SO_TC_BE;
702 }
703
704 __private_extern__ int
705 so_set_opportunistic(struct socket *so, int optval)
706 {
707 return (so_set_traffic_class(so, (optval == 0) ?
708 SO_TC_BE : SO_TC_BK_SYS));
709 }
710
711 __private_extern__ int
712 so_get_opportunistic(struct socket *so)
713 {
714 return (so->so_traffic_class == SO_TC_BK_SYS);
715 }
716
717 __private_extern__ mbuf_svc_class_t
718 mbuf_service_class_from_control(struct mbuf *control)
719 {
720 struct cmsghdr *cm;
721 mbuf_svc_class_t msc = MBUF_SC_UNSPEC;
722
723 for (cm = M_FIRST_CMSGHDR(control); cm != NULL;
724 cm = M_NXT_CMSGHDR(control, cm)) {
725 int tc;
726
727 if (cm->cmsg_len < sizeof (struct cmsghdr))
728 break;
729
730 if (cm->cmsg_level != SOL_SOCKET ||
731 cm->cmsg_type != SO_TRAFFIC_CLASS)
732 continue;
733 if (cm->cmsg_len != CMSG_LEN(sizeof (int)))
734 continue;
735
736 tc = *(int *)(void *)CMSG_DATA(cm);
737 msc = so_tc2msc(tc);
738 if (MBUF_VALID_SC(msc))
739 break;
740 }
741
742 return (msc);
743 }
744
745 __private_extern__ int
746 dscp_code_from_mbuf_tclass(mbuf_traffic_class_t mtc)
747 {
748 int dscp_code;
749
750 switch (mtc) {
751 default:
752 case MBUF_TC_BE:
753 dscp_code = 0;
754 break;
755 case MBUF_TC_BK:
756 dscp_code = 0x08;
757 break;
758 case MBUF_TC_VI:
759 dscp_code = 0x20;
760 break;
761 case MBUF_TC_VO:
762 dscp_code = 0x30;
763 break;
764 }
765
766 return (dscp_code);
767 }
768
769 __private_extern__ void
770 so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off)
771 {
772 uint32_t sotc = m_get_traffic_class(m);
773
774 if (sotc >= SO_TC_STATS_MAX)
775 sotc = SO_TC_BE;
776
777 so->so_tc_stats[sotc].rxpackets += 1;
778 so->so_tc_stats[sotc].rxbytes +=
779 ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off;
780 }
781
782 __private_extern__ void
783 so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes, uint32_t tc)
784 {
785 if (tc >= SO_TC_STATS_MAX)
786 tc = SO_TC_BE;
787
788 so->so_tc_stats[tc].rxpackets += pkts;
789 so->so_tc_stats[tc].rxbytes +=bytes;
790 }
791
792 static inline int
793 so_throttle_best_effort(struct socket *so, struct ifnet *ifp)
794 {
795 u_int32_t uptime = net_uptime();
796 return (soissrcbesteffort(so) &&
797 net_io_policy_throttle_best_effort == 1 &&
798 ifp->if_rt_sendts > 0 &&
799 (int)(uptime - ifp->if_rt_sendts) <= TCP_BG_SWITCH_TIME);
800 }
801
802 __private_extern__ void
803 set_tcp_stream_priority(struct socket *so)
804 {
805 struct inpcb *inp = sotoinpcb(so);
806 struct tcpcb *tp = intotcpcb(inp);
807 struct ifnet *outifp;
808 u_char old_cc = tp->tcp_cc_index;
809 int recvbg = IS_TCP_RECV_BG(so);
810 bool is_local = false, fg_active = false;
811 u_int32_t uptime;
812
813 VERIFY((SOCK_CHECK_DOM(so, PF_INET)
814 || SOCK_CHECK_DOM(so, PF_INET6))
815 && SOCK_CHECK_TYPE(so, SOCK_STREAM)
816 && SOCK_CHECK_PROTO(so, IPPROTO_TCP));
817
818 /* Return if the socket is in a terminal state */
819 if (inp->inp_state == INPCB_STATE_DEAD)
820 return;
821
822 outifp = inp->inp_last_outifp;
823 uptime = net_uptime();
824
825 /*
826 * If the socket was marked as a background socket or if the
827 * traffic class is set to background with traffic class socket
828 * option then make both send and recv side of the stream to be
829 * background. The variable sotcdb which can be set with sysctl
830 * is used to disable these settings for testing.
831 */
832 if (outifp == NULL || (outifp->if_flags & IFF_LOOPBACK))
833 is_local = true;
834
835 /* Check if there has been recent foreground activity */
836 if (outifp != NULL) {
837 /*
838 * If the traffic source is background, check if
839 * if it can be switched to foreground. This can
840 * happen when there is no indication of foreground
841 * activity.
842 */
843 if (soissrcbackground(so) &&
844 ((outifp->if_fg_sendts > 0 &&
845 (int)(uptime - outifp->if_fg_sendts) <=
846 TCP_BG_SWITCH_TIME) || net_io_policy_throttled))
847 fg_active = true;
848
849 /*
850 * The traffic source is best-effort -- check if
851 * the policy to throttle best effort is enabled
852 * and there was realtime activity on this
853 * interface recently. If this is true, enable
854 * algorithms that respond to increased latency
855 * on best-effort traffic.
856 */
857 if (so_throttle_best_effort(so, outifp))
858 fg_active = true;
859 }
860
861 /*
862 * System initiated background traffic like cloud uploads should
863 * always use background delay sensitive algorithms. This will
864 * make the stream more responsive to other streams on the user's
865 * network and it will minimize latency induced.
866 */
867 if (fg_active || IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
868 /*
869 * If the interface that the connection is using is
870 * loopback, do not use background congestion
871 * control algorithm.
872 *
873 * If there has been recent foreground activity or if
874 * there was an indication that a foreground application
875 * is going to use networking (net_io_policy_throttled),
876 * switch the backgroung streams to use background
877 * congestion control algorithm. Otherwise, even background
878 * flows can move into foreground.
879 */
880 if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0 || is_local ||
881 !IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
882 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
883 tcp_set_foreground_cc(so);
884 } else {
885 if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX)
886 tcp_set_background_cc(so);
887 }
888
889 /* Set receive side background flags */
890 if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0 || is_local ||
891 !IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
892 tcp_clear_recv_bg(so);
893 } else {
894 tcp_set_recv_bg(so);
895 }
896 } else {
897 tcp_clear_recv_bg(so);
898 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
899 tcp_set_foreground_cc(so);
900 }
901
902 if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) {
903 SOTHROTTLELOG(("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; "
904 "%s recv\n", so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so),
905 SOCK_DOM(so), SOCK_TYPE(so),
906 (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ?
907 "background" : "foreground",
908 IS_TCP_RECV_BG(so) ? "background" : "foreground"));
909 }
910 }
911
912 /*
913 * Set traffic class to an IPv4 or IPv6 packet
914 * - mark the mbuf
915 * - set the DSCP code following the WMM mapping
916 */
917 __private_extern__ void
918 set_packet_service_class(struct mbuf *m, struct socket *so,
919 mbuf_svc_class_t in_msc, u_int32_t flags)
920 {
921 mbuf_svc_class_t msc = MBUF_SC_BE; /* Best effort by default */
922 struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */
923 struct ip *ip = mtod(m, struct ip *);
924 #if INET6
925 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
926 #endif /* INET6 */
927 int isipv6 = ((flags & PKT_SCF_IPV6) != 0) ? 1 : 0;
928
929 if (!(m->m_flags & M_PKTHDR))
930 return;
931
932 /*
933 * Here is the precedence:
934 * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all
935 * 2) Traffic class passed via ancillary data to sendmsdg(2)
936 * 3) Traffic class socket option last
937 */
938 if (in_msc != MBUF_SC_UNSPEC) {
939 if (in_msc >= MBUF_SC_BE && in_msc <= MBUF_SC_CTL)
940 msc = in_msc;
941 } else {
942 VERIFY(SO_VALID_TC(so->so_traffic_class));
943 msc = so_tc2msc(so->so_traffic_class);
944 /* Assert because tc must have been valid */
945 VERIFY(MBUF_VALID_SC(msc));
946 }
947
948 /*
949 * If TRAFFIC_MGT_SO_BACKGROUND is set or policy to throttle
950 * best effort is set, depress the priority.
951 */
952 if (!IS_MBUF_SC_BACKGROUND(msc) && soisthrottled(so))
953 msc = MBUF_SC_BK;
954
955 if (IS_MBUF_SC_BESTEFFORT(msc) && inp->inp_last_outifp != NULL &&
956 so_throttle_best_effort(so, inp->inp_last_outifp))
957 msc = MBUF_SC_BK;
958
959 if (soissrcbackground(so))
960 m->m_pkthdr.pkt_flags |= PKTF_SO_BACKGROUND;
961
962 if (soissrcrealtime(so) || IS_MBUF_SC_REALTIME(msc))
963 m->m_pkthdr.pkt_flags |= PKTF_SO_REALTIME;
964 /*
965 * Set the traffic class in the mbuf packet header svc field
966 */
967 if (sotcdb & SOTCDB_NO_MTC)
968 goto no_mbtc;
969
970 /* Elevate service class if the packet is a pure TCP ACK.
971 * We can do this only when the flow is not a background
972 * flow and the outgoing interface supports
973 * transmit-start model.
974 */
975 if (!IS_MBUF_SC_BACKGROUND(msc) && (flags & PKT_SCF_TCP_ACK))
976 msc = MBUF_SC_CTL;
977
978 (void) m_set_service_class(m, msc);
979
980 /*
981 * Set the privileged traffic auxiliary flag if applicable,
982 * or clear it.
983 */
984 if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) &&
985 msc != MBUF_SC_UNSPEC)
986 m->m_pkthdr.pkt_flags |= PKTF_PRIO_PRIVILEGED;
987 else
988 m->m_pkthdr.pkt_flags &= ~PKTF_PRIO_PRIVILEGED;
989
990 no_mbtc:
991 /*
992 * Quick exit when best effort
993 */
994 if (msc == MBUF_SC_BE)
995 goto no_dscp;
996
997 /*
998 * The default behavior is for the networking stack to not set the
999 * DSCP code, based on SOTCDB_NO_DSCP being set. If the flag is
1000 * cleared, set the DSCP code in IPv4 or IPv6 header only for local
1001 * traffic, if it is not already set. <rdar://problem/11277343>
1002 */
1003 if (sotcdb & SOTCDB_NO_DSCP)
1004 goto no_dscp;
1005
1006 /*
1007 * Test if a IP TOS or IPV6 TCLASS has already been set
1008 * on the socket or the raw packet.
1009 */
1010 if (!(sotcdb & SOTCDB_NO_DSCPTST)) {
1011 #if INET6
1012 if (isipv6) {
1013 if ((so->so_type == SOCK_RAW &&
1014 (ip6->ip6_flow & htonl(0xff << 20)) != 0) ||
1015 (inp->in6p_outputopts &&
1016 inp->in6p_outputopts->ip6po_tclass != -1))
1017 goto no_dscp;
1018 } else
1019 #endif /* INET6 */
1020 if ((so->so_type == SOCK_RAW &&
1021 (inp->inp_flags & INP_HDRINCL)) ||
1022 inp->inp_ip_tos != 0)
1023 goto no_dscp;
1024 }
1025
1026 /*
1027 * Test if destination is local
1028 */
1029 if (!(sotcdb & SOTCDB_NO_LCLTST)) {
1030 int islocal = 0;
1031 struct rtentry *rt = inp->inp_route.ro_rt;
1032
1033 if (so->so_type == SOCK_STREAM) {
1034 if (intotcpcb(inp)->t_flags & TF_LOCAL)
1035 islocal = 1;
1036 } else if (rt != NULL &&
1037 (rt->rt_gateway->sa_family == AF_LINK ||
1038 (rt->rt_ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)))) {
1039 if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT))
1040 islocal = 1;
1041 } else
1042 #if INET6
1043 if (isipv6 && in6addr_local(&ip6->ip6_dst)) {
1044 islocal = 1;
1045 } else
1046 #endif /* INET6 */
1047 if (inaddr_local(ip->ip_dst)) {
1048 islocal = 1;
1049 }
1050 if (islocal == 0)
1051 goto no_dscp;
1052 }
1053
1054 #if INET6
1055 if (isipv6)
1056 ip6->ip6_flow |= htonl(dscp_code_from_mbuf_tclass(
1057 m_get_traffic_class(m)) << 20);
1058 else
1059 #endif /* INET6 */
1060 ip->ip_tos |= dscp_code_from_mbuf_tclass(
1061 m_get_traffic_class(m)) << 2;
1062
1063 no_dscp:
1064 /*
1065 * For TCP with background traffic class switch CC algo based on sysctl
1066 */
1067 if (so->so_type == SOCK_STREAM)
1068 set_tcp_stream_priority(so);
1069
1070 so_tc_update_stats(m, so, msc);
1071 }
1072
1073 __private_extern__ void
1074 so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc)
1075 {
1076 mbuf_traffic_class_t mtc;
1077
1078 /*
1079 * Assume socket and mbuf traffic class values are the same
1080 * Also assume the socket lock is held. Note that the stats
1081 * at the socket layer are reduced down to the legacy traffic
1082 * classes; we could/should potentially expand so_tc_stats[].
1083 */
1084 mtc = MBUF_SC2TC(msc);
1085 VERIFY(mtc < SO_TC_STATS_MAX);
1086 so->so_tc_stats[mtc].txpackets += 1;
1087 so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len;
1088 }
1089
1090 __private_extern__ void
1091 socket_tclass_init(void)
1092 {
1093 _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX);
1094
1095 tclass_lck_grp_attr = lck_grp_attr_alloc_init();
1096 tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr);
1097 tclass_lck_attr = lck_attr_alloc_init();
1098 lck_mtx_init(tclass_lock, tclass_lck_grp, tclass_lck_attr);
1099 }
1100
1101 __private_extern__ mbuf_svc_class_t
1102 so_tc2msc(int tc)
1103 {
1104 mbuf_svc_class_t msc;
1105
1106 switch (tc) {
1107 case SO_TC_BK_SYS:
1108 msc = MBUF_SC_BK_SYS;
1109 break;
1110 case SO_TC_BK:
1111 case _SO_TC_BK:
1112 msc = MBUF_SC_BK;
1113 break;
1114 case SO_TC_BE:
1115 msc = MBUF_SC_BE;
1116 break;
1117 case SO_TC_RD:
1118 msc = MBUF_SC_RD;
1119 break;
1120 case SO_TC_OAM:
1121 msc = MBUF_SC_OAM;
1122 break;
1123 case SO_TC_AV:
1124 msc = MBUF_SC_AV;
1125 break;
1126 case SO_TC_RV:
1127 msc = MBUF_SC_RV;
1128 break;
1129 case SO_TC_VI:
1130 case _SO_TC_VI:
1131 msc = MBUF_SC_VI;
1132 break;
1133 case SO_TC_VO:
1134 case _SO_TC_VO:
1135 msc = MBUF_SC_VO;
1136 break;
1137 case SO_TC_CTL:
1138 msc = MBUF_SC_CTL;
1139 break;
1140 case SO_TC_ALL:
1141 default:
1142 msc = MBUF_SC_UNSPEC;
1143 break;
1144 }
1145
1146 return (msc);
1147 }
1148
1149 __private_extern__ int
1150 so_svc2tc(mbuf_svc_class_t svc)
1151 {
1152 switch (svc) {
1153 case MBUF_SC_UNSPEC:
1154 return SO_TC_BE;
1155 case MBUF_SC_BK_SYS:
1156 return SO_TC_BK_SYS;
1157 case MBUF_SC_BK:
1158 return SO_TC_BK;
1159 case MBUF_SC_BE:
1160 return SO_TC_BE;
1161 case MBUF_SC_RD:
1162 return SO_TC_RD;
1163 case MBUF_SC_OAM:
1164 return SO_TC_OAM;
1165 case MBUF_SC_AV:
1166 return SO_TC_AV;
1167 case MBUF_SC_RV:
1168 return SO_TC_RV;
1169 case MBUF_SC_VI:
1170 return SO_TC_VI;
1171 case MBUF_SC_VO:
1172 return SO_TC_VO;
1173 case MBUF_SC_CTL:
1174 return SO_TC_CTL;
1175 default:
1176 return SO_TC_BE;
1177 }
1178 }
1179
1180 /*
1181 * LRO is turned on for AV streaming class.
1182 */
1183 void
1184 so_set_lro(struct socket *so, int optval)
1185 {
1186 if (optval == SO_TC_AV) {
1187 so->so_flags |= SOF_USELRO;
1188 } else {
1189 if (so->so_flags & SOF_USELRO) {
1190 /* transition to non LRO class */
1191 so->so_flags &= ~SOF_USELRO;
1192 struct inpcb *inp = sotoinpcb(so);
1193 struct tcpcb *tp = NULL;
1194 if (inp) {
1195 tp = intotcpcb(inp);
1196 if (tp && (tp->t_flagsext & TF_LRO_OFFLOADED)) {
1197 tcp_lro_remove_state(inp->inp_laddr,
1198 inp->inp_faddr,
1199 inp->inp_lport,
1200 inp->inp_fport);
1201 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
1202 }
1203 }
1204 }
1205 }
1206 }
1207