]> git.saurik.com Git - apple/xnu.git/blame - bsd/netinet/in_tclass.c
xnu-2782.10.72.tar.gz
[apple/xnu.git] / bsd / netinet / in_tclass.c
CommitLineData
6d2010ae 1/*
fe8ab488 2 * Copyright (c) 2009-2014 Apple Inc. All rights reserved.
6d2010ae
A
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/systm.h>
30#include <sys/kernel.h>
31#include <sys/types.h>
32#include <sys/filedesc.h>
33#include <sys/file_internal.h>
34#include <sys/proc.h>
35#include <sys/socket.h>
36#include <sys/socketvar.h>
37#include <sys/errno.h>
38#include <sys/protosw.h>
39#include <sys/domain.h>
40#include <sys/mbuf.h>
41#include <sys/queue.h>
42
43#include <net/if.h>
44#include <net/route.h>
45
46#include <netinet/in.h>
47#include <netinet/in_var.h>
48#include <netinet/in_pcb.h>
49#include <netinet/ip.h>
50#include <netinet/ip_var.h>
51#include <netinet/ip6.h>
52#include <netinet6/ip6_var.h>
53#include <netinet/udp.h>
54#include <netinet/udp_var.h>
55#include <netinet/tcp.h>
56#include <netinet/tcp_var.h>
57#include <netinet/tcp_cc.h>
39236c6e 58#include <netinet/lro_ext.h>
6d2010ae
A
59
60extern char *proc_name_address(void *p);
61
62static int tfp_count = 0;
63
316670eb
A
64static TAILQ_HEAD(, tclass_for_proc) tfp_head =
65 TAILQ_HEAD_INITIALIZER(tfp_head);
6d2010ae
A
66
67struct tclass_for_proc {
68 TAILQ_ENTRY(tclass_for_proc) tfp_link;
316670eb
A
69 int tfp_class;
70 pid_t tfp_pid;
71 char tfp_pname[MAXCOMLEN + 1];
6d2010ae
A
72};
73
316670eb
A
74static int dscp_code_from_mbuf_tclass(mbuf_traffic_class_t);
75static int get_pid_tclass(struct so_tcdbg *);
76static int get_pname_tclass(struct so_tcdbg *);
77static int set_pid_tclass(struct so_tcdbg *);
78static int set_pname_tclass(struct so_tcdbg *);
79static int flush_pid_tclass(struct so_tcdbg *);
6d2010ae
A
80static int purge_tclass_for_proc(void);
81static int flush_tclass_for_proc(void);
316670eb 82int get_tclass_for_curr_proc(int *);
6d2010ae 83
316670eb
A
84static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */
85static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */
86static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */
87decl_lck_mtx_data(static, tclass_lock_data);
88static lck_mtx_t *tclass_lock = &tclass_lock_data;
6d2010ae 89
39236c6e
A
/*
 * If there has been no foreground activity on the interface for
 * TCP_BG_SWITCH_TIME seconds, background connections may switch to
 * the foreground TCP congestion control.
 */
#define	TCP_BG_SWITCH_TIME	2
96
6d2010ae
A
97/*
98 * Must be called with tclass_lock held
99 */
100static struct tclass_for_proc *
101find_tfp_by_pid(pid_t pid)
102{
103 struct tclass_for_proc *tfp;
316670eb 104
6d2010ae
A
105 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
106 if (tfp->tfp_pid == pid)
107 break;
108 }
316670eb 109 return (tfp);
6d2010ae
A
110}
111
112/*
113 * Must be called with tclass_lock held
114 */
115static struct tclass_for_proc *
116find_tfp_by_pname(const char *pname)
117{
118 struct tclass_for_proc *tfp;
316670eb 119
6d2010ae 120 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
316670eb
A
121 if (strncmp(pname, tfp->tfp_pname,
122 sizeof (tfp->tfp_pname)) == 0)
6d2010ae
A
123 break;
124 }
316670eb 125 return (tfp);
6d2010ae
A
126}
127
316670eb
A
128__private_extern__ int
129get_tclass_for_curr_proc(int *sotc)
6d2010ae 130{
316670eb 131 struct tclass_for_proc *tfp = NULL;
6d2010ae
A
132 proc_t p = current_proc(); /* Not ref counted */
133 pid_t pid = proc_pid(p);
134 char *pname = proc_name_address(p);
316670eb
A
135
136 *sotc = -1;
137
6d2010ae 138 lck_mtx_lock(tclass_lock);
316670eb 139
6d2010ae 140 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
316670eb
A
141 if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 &&
142 strncmp(pname, tfp->tfp_pname,
143 sizeof (tfp->tfp_pname)) == 0)) {
144 *sotc = tfp->tfp_class;
6d2010ae 145 break;
316670eb 146 }
6d2010ae
A
147 }
148
149 lck_mtx_unlock(tclass_lock);
150
316670eb 151 return ((tfp == NULL) ? 0 : 1);
6d2010ae
A
152}
153
154/*
155 * Purge entries with PIDs of exited processes
156 */
157int
158purge_tclass_for_proc(void)
159{
160 int error = 0;
161 struct tclass_for_proc *tfp, *tvar;
162
163 lck_mtx_lock(tclass_lock);
164
165 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
166 proc_t p;
316670eb 167
6d2010ae
A
168 if (tfp->tfp_pid == -1)
169 continue;
170 if ((p = proc_find(tfp->tfp_pid)) == NULL) {
171 tfp_count--;
172 TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
316670eb 173
6d2010ae
A
174 _FREE(tfp, M_TEMP);
175 } else {
176 proc_rele(p);
177 }
178 }
179
180 lck_mtx_unlock(tclass_lock);
316670eb
A
181
182 return (error);
6d2010ae
A
183}
184
185/*
186 * Remove one entry
187 * Must be called with tclass_lock held
188 */
189static void
190free_tclass_for_proc(struct tclass_for_proc *tfp)
191{
192 if (tfp == NULL)
193 return;
194 tfp_count--;
195 TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
196 _FREE(tfp, M_TEMP);
197}
198
199/*
200 * Remove all entries
201 */
202int
203flush_tclass_for_proc(void)
204{
205 int error = 0;
206 struct tclass_for_proc *tfp, *tvar;
207
208 lck_mtx_lock(tclass_lock);
209
210 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
211 free_tclass_for_proc(tfp);
212 }
316670eb 213
6d2010ae 214 lck_mtx_unlock(tclass_lock);
316670eb
A
215
216 return (error);
6d2010ae
A
217
218}
219
220/*
221 * Must be called with tclass_lock held
222 */
223static struct tclass_for_proc *
316670eb 224alloc_tclass_for_proc(pid_t pid, const char *pname)
6d2010ae
A
225{
226 struct tclass_for_proc *tfp;
316670eb 227
6d2010ae 228 if (pid == -1 && pname == NULL)
316670eb 229 return (NULL);
6d2010ae 230
316670eb 231 tfp = _MALLOC(sizeof (struct tclass_for_proc), M_TEMP, M_NOWAIT|M_ZERO);
6d2010ae 232 if (tfp == NULL)
316670eb
A
233 return (NULL);
234
6d2010ae 235 tfp->tfp_pid = pid;
6d2010ae 236 /*
316670eb 237 * Add per pid entries before per proc name so we can find
6d2010ae
A
238 * a specific instance of a process before the general name base entry.
239 */
240 if (pid != -1) {
241 TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link);
242 } else {
316670eb 243 strlcpy(tfp->tfp_pname, pname, sizeof (tfp->tfp_pname));
6d2010ae
A
244 TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link);
245 }
316670eb 246
6d2010ae
A
247 tfp_count++;
248
316670eb 249 return (tfp);
6d2010ae
A
250}
251
252/*
253 * -1 for tclass means to remove the entry
254 */
316670eb
A
255int
256set_pid_tclass(struct so_tcdbg *so_tcdbg)
6d2010ae
A
257{
258 int error = EINVAL;
259 proc_t p = NULL;
260 struct filedesc *fdp;
261 struct fileproc *fp;
262 struct tclass_for_proc *tfp;
263 int i;
316670eb
A
264 pid_t pid = so_tcdbg->so_tcdbg_pid;
265 int tclass = so_tcdbg->so_tcdbg_tclass;
6d2010ae
A
266
267 p = proc_find(pid);
268 if (p == NULL) {
316670eb 269 printf("%s proc_find(%d) failed\n", __func__, pid);
6d2010ae
A
270 goto done;
271 }
316670eb 272
6d2010ae
A
273 /* Need a tfp */
274 lck_mtx_lock(tclass_lock);
316670eb 275
6d2010ae 276 tfp = find_tfp_by_pid(pid);
316670eb
A
277 if (tfp == NULL) {
278 tfp = alloc_tclass_for_proc(pid, NULL);
6d2010ae 279 if (tfp == NULL) {
316670eb
A
280 lck_mtx_unlock(tclass_lock);
281 error = ENOBUFS;
282 goto done;
6d2010ae
A
283 }
284 }
316670eb
A
285 tfp->tfp_class = tclass;
286
6d2010ae
A
287 lck_mtx_unlock(tclass_lock);
288
289 if (tfp != NULL) {
290 proc_fdlock(p);
316670eb 291
6d2010ae
A
292 fdp = p->p_fd;
293 for (i = 0; i < fdp->fd_nfiles; i++) {
294 struct socket *so;
316670eb 295
6d2010ae 296 fp = fdp->fd_ofiles[i];
316670eb
A
297 if (fp == NULL ||
298 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
39236c6e 299 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
6d2010ae 300 continue;
316670eb 301
6d2010ae 302 so = (struct socket *)fp->f_fglob->fg_data;
39236c6e 303 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6)
6d2010ae
A
304 continue;
305 socket_lock(so, 1);
316670eb
A
306 if (tclass != -1) {
307 error = so_set_traffic_class(so, tclass);
308 if (error != 0) {
309 printf("%s: so_set_traffic_class"
39236c6e 310 "(so=0x%llx, fd=%d, tclass=%d) "
316670eb 311 "failed %d\n", __func__,
39236c6e
A
312 (uint64_t)VM_KERNEL_ADDRPERM(so),
313 i, tclass, error);
316670eb
A
314 error = 0;
315 }
6d2010ae 316 }
316670eb 317 socket_unlock(so, 1);
6d2010ae 318 }
316670eb 319
6d2010ae
A
320 proc_fdunlock(p);
321 }
316670eb
A
322
323 error = 0;
6d2010ae
A
324done:
325 if (p != NULL)
326 proc_rele(p);
316670eb
A
327
328 return (error);
6d2010ae
A
329}
330
316670eb
A
331int
332set_pname_tclass(struct so_tcdbg *so_tcdbg)
6d2010ae
A
333{
334 int error = EINVAL;
335 struct tclass_for_proc *tfp;
336
337 lck_mtx_lock(tclass_lock);
316670eb
A
338
339 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
340 if (tfp == NULL) {
341 tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname);
6d2010ae 342 if (tfp == NULL) {
316670eb
A
343 lck_mtx_unlock(tclass_lock);
344 error = ENOBUFS;
345 goto done;
6d2010ae
A
346 }
347 }
316670eb
A
348 tfp->tfp_class = so_tcdbg->so_tcdbg_tclass;
349
6d2010ae 350 lck_mtx_unlock(tclass_lock);
316670eb
A
351
352 error = 0;
6d2010ae 353done:
316670eb
A
354
355 return (error);
6d2010ae
A
356}
357
316670eb
A
358static int
359flush_pid_tclass(struct so_tcdbg *so_tcdbg)
360{
361 pid_t pid = so_tcdbg->so_tcdbg_pid;
362 int tclass = so_tcdbg->so_tcdbg_tclass;
363 struct filedesc *fdp;
364 int error = EINVAL;
365 proc_t p;
366 int i;
367
368 p = proc_find(pid);
369 if (p == PROC_NULL) {
370 printf("%s proc_find(%d) failed\n", __func__, pid);
371 goto done;
372 }
373
374 proc_fdlock(p);
375 fdp = p->p_fd;
376 for (i = 0; i < fdp->fd_nfiles; i++) {
377 struct socket *so;
378 struct fileproc *fp;
379
380 fp = fdp->fd_ofiles[i];
381 if (fp == NULL ||
382 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
39236c6e 383 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
316670eb
A
384 continue;
385
386 so = (struct socket *)fp->f_fglob->fg_data;
387 error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass,
388 sizeof (tclass));
389 if (error != 0) {
39236c6e
A
390 printf("%s: setsockopt(SO_FLUSH) (so=0x%llx, fd=%d, "
391 "tclass=%d) failed %d\n", __func__,
392 (uint64_t)VM_KERNEL_ADDRPERM(so), i, tclass,
316670eb
A
393 error);
394 error = 0;
395 }
396 }
397 proc_fdunlock(p);
398
399 error = 0;
400done:
401 if (p != PROC_NULL)
402 proc_rele(p);
403
404 return (error);
405}
406
407int
408get_pid_tclass(struct so_tcdbg *so_tcdbg)
6d2010ae
A
409{
410 int error = EINVAL;
411 proc_t p = NULL;
412 struct tclass_for_proc *tfp;
316670eb
A
413 pid_t pid = so_tcdbg->so_tcdbg_pid;
414
415 so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */
416 so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */
6d2010ae
A
417
418 p = proc_find(pid);
419 if (p == NULL) {
316670eb 420 printf("%s proc_find(%d) failed\n", __func__, pid);
6d2010ae
A
421 goto done;
422 }
316670eb 423
6d2010ae
A
424 /* Need a tfp */
425 lck_mtx_lock(tclass_lock);
316670eb 426
6d2010ae
A
427 tfp = find_tfp_by_pid(pid);
428 if (tfp != NULL) {
316670eb 429 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
6d2010ae
A
430 error = 0;
431 }
432 lck_mtx_unlock(tclass_lock);
433done:
434 if (p != NULL)
435 proc_rele(p);
316670eb
A
436
437 return (error);
6d2010ae
A
438}
439
316670eb
A
440int
441get_pname_tclass(struct so_tcdbg *so_tcdbg)
6d2010ae
A
442{
443 int error = EINVAL;
444 struct tclass_for_proc *tfp;
316670eb
A
445
446 so_tcdbg->so_tcdbg_tclass = -1; /* Means not set */
447 so_tcdbg->so_tcdbg_opportunistic = -1; /* Means not set */
6d2010ae
A
448
449 /* Need a tfp */
450 lck_mtx_lock(tclass_lock);
316670eb
A
451
452 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
6d2010ae 453 if (tfp != NULL) {
316670eb 454 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
6d2010ae
A
455 error = 0;
456 }
457 lck_mtx_unlock(tclass_lock);
316670eb
A
458
459 return (error);
6d2010ae
A
460}
461
316670eb
A
462static int
463delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg)
464{
465 int error = EINVAL;
466 pid_t pid = so_tcdbg->so_tcdbg_pid;
467 struct tclass_for_proc *tfp = NULL;
468
469 lck_mtx_lock(tclass_lock);
6d2010ae 470
316670eb
A
471 if (pid != -1)
472 tfp = find_tfp_by_pid(pid);
473 else
474 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
475
476 if (tfp != NULL) {
477 free_tclass_for_proc(tfp);
478 error = 0;
479 }
480
481 lck_mtx_unlock(tclass_lock);
482
483 return (error);
484}
6d2010ae
A
485
486/*
487 * Setting options requires privileges
488 */
316670eb 489__private_extern__ int
6d2010ae
A
490so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg)
491{
492 int error = 0;
316670eb 493
6d2010ae 494 if ((so->so_state & SS_PRIV) == 0)
316670eb 495 return (EPERM);
6d2010ae
A
496
497 socket_unlock(so, 0);
498
499 switch (so_tcdbg->so_tcdbg_cmd) {
500 case SO_TCDBG_PID:
316670eb 501 error = set_pid_tclass(so_tcdbg);
6d2010ae 502 break;
316670eb 503
6d2010ae 504 case SO_TCDBG_PNAME:
316670eb 505 error = set_pname_tclass(so_tcdbg);
6d2010ae 506 break;
316670eb 507
6d2010ae
A
508 case SO_TCDBG_PURGE:
509 error = purge_tclass_for_proc();
510 break;
316670eb 511
6d2010ae
A
512 case SO_TCDBG_FLUSH:
513 error = flush_tclass_for_proc();
514 break;
316670eb
A
515
516 case SO_TCDBG_DELETE:
517 error = delete_tclass_for_pid_pname(so_tcdbg);
518 break;
519
520 case SO_TCDBG_TCFLUSH_PID:
521 error = flush_pid_tclass(so_tcdbg);
522 break;
523
6d2010ae
A
524 default:
525 error = EINVAL;
526 break;
6d2010ae
A
527 }
528
529 socket_lock(so, 0);
530
316670eb 531 return (error);
6d2010ae
A
532}
533
534/*
535 * Not required to be privileged to get
536 */
316670eb 537__private_extern__ int
6d2010ae
A
538sogetopt_tcdbg(struct socket *so, struct sockopt *sopt)
539{
540 int error = 0;
541 struct so_tcdbg so_tcdbg;
542 void *buf = NULL;
543 size_t len = sopt->sopt_valsize;
544
316670eb
A
545 error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg),
546 sizeof (struct so_tcdbg));
6d2010ae 547 if (error != 0)
316670eb
A
548 return (error);
549
6d2010ae 550 sopt->sopt_valsize = len;
316670eb 551
6d2010ae
A
552 socket_unlock(so, 0);
553
554 switch (so_tcdbg.so_tcdbg_cmd) {
555 case SO_TCDBG_PID:
316670eb 556 error = get_pid_tclass(&so_tcdbg);
6d2010ae 557 break;
316670eb 558
6d2010ae 559 case SO_TCDBG_PNAME:
316670eb 560 error = get_pname_tclass(&so_tcdbg);
6d2010ae 561 break;
316670eb 562
6d2010ae
A
563 case SO_TCDBG_COUNT:
564 lck_mtx_lock(tclass_lock);
565 so_tcdbg.so_tcdbg_count = tfp_count;
566 lck_mtx_unlock(tclass_lock);
567 break;
568
569 case SO_TCDBG_LIST: {
570 struct tclass_for_proc *tfp;
571 int n, alloc_count;
572 struct so_tcdbg *ptr;
573
574 lck_mtx_lock(tclass_lock);
575 if ((alloc_count = tfp_count) == 0) {
576 lck_mtx_unlock(tclass_lock);
577 error = EINVAL;
578 break;
579 }
316670eb 580 len = alloc_count * sizeof (struct so_tcdbg);
6d2010ae
A
581 lck_mtx_unlock(tclass_lock);
582
583 buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
584 if (buf == NULL) {
585 error = ENOBUFS;
586 break;
587 }
588
589 lck_mtx_lock(tclass_lock);
590 n = 0;
591 ptr = (struct so_tcdbg *)buf;
592 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
593 if (++n > alloc_count)
594 break;
595 if (tfp->tfp_pid != -1) {
596 ptr->so_tcdbg_cmd = SO_TCDBG_PID;
597 ptr->so_tcdbg_pid = tfp->tfp_pid;
598 } else {
599 ptr->so_tcdbg_cmd = SO_TCDBG_PNAME;
600 ptr->so_tcdbg_pid = -1;
316670eb
A
601 strlcpy(ptr->so_tcdbg_pname,
602 tfp->tfp_pname,
603 sizeof (ptr->so_tcdbg_pname));
6d2010ae
A
604 }
605 ptr->so_tcdbg_tclass = tfp->tfp_class;
606 ptr++;
607 }
316670eb 608
6d2010ae
A
609 lck_mtx_unlock(tclass_lock);
610 }
611 break;
316670eb 612
6d2010ae
A
613 default:
614 error = EINVAL;
615 break;
6d2010ae
A
616 }
617
618 socket_lock(so, 0);
619
620 if (error == 0) {
621 if (buf == NULL) {
316670eb
A
622 error = sooptcopyout(sopt, &so_tcdbg,
623 sizeof (struct so_tcdbg));
6d2010ae
A
624 } else {
625 error = sooptcopyout(sopt, buf, len);
626 _FREE(buf, M_TEMP);
627 }
628 }
316670eb 629 return (error);
6d2010ae
A
630}
631
632
633__private_extern__ int
634so_set_traffic_class(struct socket *so, int optval)
635{
636 int error = 0;
316670eb
A
637
638 if (optval < SO_TC_BE || optval > SO_TC_CTL) {
6d2010ae
A
639 error = EINVAL;
640 } else {
316670eb
A
641 switch (optval) {
642 case _SO_TC_BK:
643 optval = SO_TC_BK;
644 break;
645 case _SO_TC_VI:
646 optval = SO_TC_VI;
647 break;
648 case _SO_TC_VO:
649 optval = SO_TC_VO;
650 break;
651 default:
652 if (!SO_VALID_TC(optval))
653 error = EINVAL;
654 break;
655 }
656
657 if (error == 0) {
658 int oldval = so->so_traffic_class;
659
660 VERIFY(SO_VALID_TC(optval));
661 so->so_traffic_class = optval;
662
39236c6e
A
663 if ((SOCK_DOM(so) == PF_INET ||
664 SOCK_DOM(so) == PF_INET6) &&
665 SOCK_TYPE(so) == SOCK_STREAM)
316670eb
A
666 set_tcp_stream_priority(so);
667
39236c6e
A
668 if ((SOCK_DOM(so) == PF_INET ||
669 SOCK_DOM(so) == PF_INET6) &&
316670eb
A
670 optval != oldval && (optval == SO_TC_BK_SYS ||
671 oldval == SO_TC_BK_SYS)) {
672 /*
673 * If the app switches from BK_SYS to something
674 * else, resume the socket if it was suspended.
675 */
676 if (oldval == SO_TC_BK_SYS)
677 inp_reset_fc_state(so->so_pcb);
678
39236c6e
A
679 SOTHROTTLELOG(("throttle[%d]: so 0x%llx "
680 "[%d,%d] opportunistic %s\n", so->last_pid,
681 (uint64_t)VM_KERNEL_ADDRPERM(so),
682 SOCK_DOM(so), SOCK_TYPE(so),
316670eb
A
683 (optval == SO_TC_BK_SYS) ? "ON" : "OFF"));
684 }
6d2010ae
A
685 }
686 }
316670eb 687 return (error);
6d2010ae
A
688}
689
690__private_extern__ void
691so_set_default_traffic_class(struct socket *so)
692{
316670eb 693 int sotc = -1;
6d2010ae 694
316670eb 695 if (tfp_count > 0 &&
39236c6e 696 (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)) {
316670eb 697 get_tclass_for_curr_proc(&sotc);
6d2010ae 698 }
316670eb
A
699
700 so->so_traffic_class = (sotc != -1) ? sotc : SO_TC_BE;
6d2010ae
A
701}
702
316670eb
A
703__private_extern__ int
704so_set_opportunistic(struct socket *so, int optval)
705{
706 return (so_set_traffic_class(so, (optval == 0) ?
707 SO_TC_BE : SO_TC_BK_SYS));
708}
6d2010ae
A
709
710__private_extern__ int
316670eb
A
711so_get_opportunistic(struct socket *so)
712{
713 return (so->so_traffic_class == SO_TC_BK_SYS);
714}
715
716__private_extern__ mbuf_svc_class_t
717mbuf_service_class_from_control(struct mbuf *control)
6d2010ae
A
718{
719 struct cmsghdr *cm;
316670eb
A
720 mbuf_svc_class_t msc = MBUF_SC_UNSPEC;
721
722 for (cm = M_FIRST_CMSGHDR(control); cm != NULL;
723 cm = M_NXT_CMSGHDR(control, cm)) {
6d2010ae
A
724 int tc;
725
316670eb 726 if (cm->cmsg_len < sizeof (struct cmsghdr))
6d2010ae 727 break;
316670eb 728
6d2010ae 729 if (cm->cmsg_level != SOL_SOCKET ||
316670eb 730 cm->cmsg_type != SO_TRAFFIC_CLASS)
6d2010ae 731 continue;
316670eb 732 if (cm->cmsg_len != CMSG_LEN(sizeof (int)))
6d2010ae 733 continue;
316670eb
A
734
735 tc = *(int *)(void *)CMSG_DATA(cm);
736 msc = so_tc2msc(tc);
737 if (MBUF_VALID_SC(msc))
738 break;
6d2010ae 739 }
316670eb
A
740
741 return (msc);
6d2010ae
A
742}
743
744__private_extern__ int
316670eb 745dscp_code_from_mbuf_tclass(mbuf_traffic_class_t mtc)
6d2010ae
A
746{
747 int dscp_code;
316670eb 748
6d2010ae
A
749 switch (mtc) {
750 default:
751 case MBUF_TC_BE:
752 dscp_code = 0;
753 break;
754 case MBUF_TC_BK:
755 dscp_code = 0x08;
756 break;
757 case MBUF_TC_VI:
758 dscp_code = 0x20;
759 break;
760 case MBUF_TC_VO:
761 dscp_code = 0x30;
762 break;
763 }
316670eb
A
764
765 return (dscp_code);
6d2010ae
A
766}
767
768__private_extern__ void
769so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off)
770{
316670eb 771 uint32_t sotc = m_get_traffic_class(m);
6d2010ae
A
772
773 if (sotc >= SO_TC_STATS_MAX)
774 sotc = SO_TC_BE;
6d2010ae 775
316670eb
A
776 so->so_tc_stats[sotc].rxpackets += 1;
777 so->so_tc_stats[sotc].rxbytes +=
778 ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off;
6d2010ae
A
779}
780
fe8ab488
A
781__private_extern__ void
782so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes, uint32_t tc)
783{
784 if (tc >= SO_TC_STATS_MAX)
785 tc = SO_TC_BE;
786
787 so->so_tc_stats[tc].rxpackets += pkts;
788 so->so_tc_stats[tc].rxbytes +=bytes;
789}
6d2010ae
A
790__private_extern__ void
791set_tcp_stream_priority(struct socket *so)
792{
39236c6e
A
793 struct inpcb *inp = sotoinpcb(so);
794 struct tcpcb *tp = intotcpcb(inp);
795 struct ifnet *outifp;
796 u_char old_cc = tp->tcp_cc_index;
316670eb 797 int recvbg = IS_TCP_RECV_BG(so);
39236c6e
A
798 bool is_local, fg_active = false;
799 u_int32_t uptime;
800
801 VERIFY((SOCK_CHECK_DOM(so, PF_INET)
802 || SOCK_CHECK_DOM(so, PF_INET6))
803 && SOCK_CHECK_TYPE(so, SOCK_STREAM)
804 && SOCK_CHECK_PROTO(so, IPPROTO_TCP));
fe8ab488
A
805
806 /* Return if the socket is in a terminal state */
807 if (inp->inp_state == INPCB_STATE_DEAD)
808 return;
809
39236c6e
A
810 outifp = inp->inp_last_outifp;
811 uptime = net_uptime();
6d2010ae 812
316670eb
A
813 /*
814 * If the socket was marked as a background socket or if the
815 * traffic class is set to background with traffic class socket
816 * option then make both send and recv side of the stream to be
817 * background. The variable sotcdb which can be set with sysctl
6d2010ae
A
818 * is used to disable these settings for testing.
819 */
39236c6e
A
820 if (soissrcbackground(so)) {
821 if (outifp == NULL || (outifp->if_flags & IFF_LOOPBACK))
822 is_local = true;
823 else
824 is_local = false;
825
826 /* Check if there has been recent foreground activity */
827 if ((outifp != NULL &&
828 outifp->if_fg_sendts > 0 &&
829 (int)(uptime - outifp->if_fg_sendts) <=
830 TCP_BG_SWITCH_TIME) ||
831 net_io_policy_throttled)
832 fg_active = true;
833
834 /*
835 * If the interface that the connection is using is
836 * loopback, do not use background congestion
837 * control algorithm.
838 *
839 * If there has been recent foreground activity or if
840 * there was an indication that a foreground application
841 * is going to use networking (net_io_policy_throttled),
842 * switch the backgroung streams to use background
843 * congestion control algorithm. Otherwise, even background
844 * flows can move into foreground.
845 */
846 if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0 ||
847 is_local || !fg_active) {
316670eb 848 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
6d2010ae
A
849 tcp_set_foreground_cc(so);
850 } else {
316670eb 851 if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX)
6d2010ae
A
852 tcp_set_background_cc(so);
853 }
316670eb 854
6d2010ae 855 /* Set receive side background flags */
39236c6e
A
856 if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0 ||
857 is_local || !fg_active)
316670eb
A
858 tcp_clear_recv_bg(so);
859 else
860 tcp_set_recv_bg(so);
6d2010ae 861 } else {
316670eb
A
862 tcp_clear_recv_bg(so);
863 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
6d2010ae
A
864 tcp_set_foreground_cc(so);
865 }
316670eb
A
866
867 if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) {
39236c6e
A
868 SOTHROTTLELOG(("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; "
869 "%s recv\n", so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so),
870 SOCK_DOM(so), SOCK_TYPE(so),
316670eb
A
871 (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ?
872 "background" : "foreground",
873 IS_TCP_RECV_BG(so) ? "background" : "foreground"));
874 }
6d2010ae
A
875}
876
877/*
878 * Set traffic class to an IPv4 or IPv6 packet
879 * - mark the mbuf
880 * - set the DSCP code following the WMM mapping
881 */
882__private_extern__ void
316670eb
A
883set_packet_service_class(struct mbuf *m, struct socket *so,
884 mbuf_svc_class_t in_msc, u_int32_t flags)
6d2010ae 885{
316670eb
A
886 mbuf_svc_class_t msc = MBUF_SC_BE; /* Best effort by default */
887 struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */
6d2010ae
A
888 struct ip *ip = mtod(m, struct ip *);
889#if INET6
890 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
891#endif /* INET6 */
316670eb
A
892 int isipv6 = ((flags & PKT_SCF_IPV6) != 0) ? 1 : 0;
893
6d2010ae
A
894 if (!(m->m_flags & M_PKTHDR))
895 return;
316670eb
A
896
897 /*
6d2010ae
A
898 * Here is the precedence:
899 * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all
900 * 2) Traffic class passed via ancillary data to sendmsdg(2)
901 * 3) Traffic class socket option last
902 */
316670eb
A
903 if (in_msc != MBUF_SC_UNSPEC) {
904 if (in_msc >= MBUF_SC_BE && in_msc <= MBUF_SC_CTL)
905 msc = in_msc;
6d2010ae 906 } else {
316670eb
A
907 VERIFY(SO_VALID_TC(so->so_traffic_class));
908 msc = so_tc2msc(so->so_traffic_class);
909 /* Assert because tc must have been valid */
910 VERIFY(MBUF_VALID_SC(msc));
6d2010ae 911 }
316670eb
A
912
913 /*
914 * If TRAFFIC_MGT_SO_BACKGROUND is set, depress the priority.
915 */
916 if (soisthrottled(so) && !IS_MBUF_SC_BACKGROUND(msc))
917 msc = MBUF_SC_BK;
918
39236c6e
A
919 if (soissrcbackground(so))
920 m->m_pkthdr.pkt_flags |= PKTF_SO_BACKGROUND;
6d2010ae 921 /*
316670eb 922 * Set the traffic class in the mbuf packet header svc field
6d2010ae 923 */
316670eb 924 if (sotcdb & SOTCDB_NO_MTC)
6d2010ae 925 goto no_mbtc;
316670eb
A
926
927 /* Elevate service class if the packet is a pure TCP ACK.
928 * We can do this only when the flow is not a background
929 * flow and the outgoing interface supports
930 * transmit-start model.
931 */
932 if (!IS_MBUF_SC_BACKGROUND(msc) && (flags & PKT_SCF_TCP_ACK))
933 msc = MBUF_SC_CTL;
934
935 (void) m_set_service_class(m, msc);
936
937 /*
39236c6e
A
938 * Set the privileged traffic auxiliary flag if applicable,
939 * or clear it.
316670eb
A
940 */
941 if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) &&
942 msc != MBUF_SC_UNSPEC)
39236c6e 943 m->m_pkthdr.pkt_flags |= PKTF_PRIO_PRIVILEGED;
316670eb 944 else
39236c6e 945 m->m_pkthdr.pkt_flags &= ~PKTF_PRIO_PRIVILEGED;
316670eb 946
6d2010ae
A
947no_mbtc:
948 /*
316670eb 949 * Quick exit when best effort
6d2010ae 950 */
316670eb 951 if (msc == MBUF_SC_BE)
6d2010ae 952 goto no_dscp;
316670eb 953
6d2010ae 954 /*
316670eb
A
955 * The default behavior is for the networking stack to not set the
956 * DSCP code, based on SOTCDB_NO_DSCP being set. If the flag is
957 * cleared, set the DSCP code in IPv4 or IPv6 header only for local
958 * traffic, if it is not already set. <rdar://problem/11277343>
6d2010ae 959 */
316670eb 960 if (sotcdb & SOTCDB_NO_DSCP)
6d2010ae 961 goto no_dscp;
316670eb 962
6d2010ae 963 /*
316670eb
A
964 * Test if a IP TOS or IPV6 TCLASS has already been set
965 * on the socket or the raw packet.
6d2010ae 966 */
316670eb 967 if (!(sotcdb & SOTCDB_NO_DSCPTST)) {
6d2010ae 968#if INET6
316670eb
A
969 if (isipv6) {
970 if ((so->so_type == SOCK_RAW &&
971 (ip6->ip6_flow & htonl(0xff << 20)) != 0) ||
972 (inp->in6p_outputopts &&
973 inp->in6p_outputopts->ip6po_tclass != -1))
6d2010ae 974 goto no_dscp;
316670eb 975 } else
6d2010ae 976#endif /* INET6 */
316670eb
A
977 if ((so->so_type == SOCK_RAW &&
978 (inp->inp_flags & INP_HDRINCL)) ||
979 inp->inp_ip_tos != 0)
980 goto no_dscp;
6d2010ae 981 }
316670eb 982
6d2010ae
A
983 /*
984 * Test if destination is local
985 */
316670eb 986 if (!(sotcdb & SOTCDB_NO_LCLTST)) {
6d2010ae 987 int islocal = 0;
316670eb 988 struct rtentry *rt = inp->inp_route.ro_rt;
6d2010ae
A
989
990 if (so->so_type == SOCK_STREAM) {
316670eb 991 if (intotcpcb(inp)->t_flags & TF_LOCAL)
6d2010ae 992 islocal = 1;
316670eb
A
993 } else if (rt != NULL &&
994 (rt->rt_gateway->sa_family == AF_LINK ||
995 (rt->rt_ifp->if_flags & (IFF_LOOPBACK|IFF_POINTOPOINT)))) {
996 if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT))
6d2010ae 997 islocal = 1;
316670eb
A
998 } else
999#if INET6
1000 if (isipv6 && in6addr_local(&ip6->ip6_dst)) {
1001 islocal = 1;
1002 } else
6d2010ae 1003#endif /* INET6 */
316670eb
A
1004 if (inaddr_local(ip->ip_dst)) {
1005 islocal = 1;
6d2010ae
A
1006 }
1007 if (islocal == 0)
1008 goto no_dscp;
1009 }
1010
1011#if INET6
1012 if (isipv6)
316670eb
A
1013 ip6->ip6_flow |= htonl(dscp_code_from_mbuf_tclass(
1014 m_get_traffic_class(m)) << 20);
6d2010ae
A
1015 else
1016#endif /* INET6 */
316670eb
A
1017 ip->ip_tos |= dscp_code_from_mbuf_tclass(
1018 m_get_traffic_class(m)) << 2;
1019
6d2010ae
A
1020no_dscp:
1021 /*
1022 * For TCP with background traffic class switch CC algo based on sysctl
1023 */
316670eb 1024 if (so->so_type == SOCK_STREAM)
6d2010ae 1025 set_tcp_stream_priority(so);
316670eb
A
1026
1027 so_tc_update_stats(m, so, msc);
1028}
1029
1030__private_extern__ void
1031so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc)
1032{
1033 mbuf_traffic_class_t mtc;
1034
6d2010ae
A
1035 /*
1036 * Assume socket and mbuf traffic class values are the same
316670eb
A
1037 * Also assume the socket lock is held. Note that the stats
1038 * at the socket layer are reduced down to the legacy traffic
1039 * classes; we could/should potentially expand so_tc_stats[].
6d2010ae 1040 */
316670eb
A
1041 mtc = MBUF_SC2TC(msc);
1042 VERIFY(mtc < SO_TC_STATS_MAX);
6d2010ae
A
1043 so->so_tc_stats[mtc].txpackets += 1;
1044 so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len;
6d2010ae
A
1045}
1046
1047__private_extern__ void
1048socket_tclass_init(void)
1049{
39236c6e
A
1050 _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX);
1051
6d2010ae
A
1052 tclass_lck_grp_attr = lck_grp_attr_alloc_init();
1053 tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr);
1054 tclass_lck_attr = lck_attr_alloc_init();
316670eb
A
1055 lck_mtx_init(tclass_lock, tclass_lck_grp, tclass_lck_attr);
1056}
1057
1058__private_extern__ mbuf_svc_class_t
1059so_tc2msc(int tc)
1060{
1061 mbuf_svc_class_t msc;
1062
1063 switch (tc) {
1064 case SO_TC_BK_SYS:
1065 msc = MBUF_SC_BK_SYS;
1066 break;
1067 case SO_TC_BK:
1068 case _SO_TC_BK:
1069 msc = MBUF_SC_BK;
1070 break;
1071 case SO_TC_BE:
1072 msc = MBUF_SC_BE;
1073 break;
1074 case SO_TC_RD:
1075 msc = MBUF_SC_RD;
1076 break;
1077 case SO_TC_OAM:
1078 msc = MBUF_SC_OAM;
1079 break;
1080 case SO_TC_AV:
1081 msc = MBUF_SC_AV;
1082 break;
1083 case SO_TC_RV:
1084 msc = MBUF_SC_RV;
1085 break;
1086 case SO_TC_VI:
1087 case _SO_TC_VI:
1088 msc = MBUF_SC_VI;
1089 break;
1090 case SO_TC_VO:
1091 case _SO_TC_VO:
1092 msc = MBUF_SC_VO;
1093 break;
1094 case SO_TC_CTL:
1095 msc = MBUF_SC_CTL;
1096 break;
1097 case SO_TC_ALL:
1098 default:
1099 msc = MBUF_SC_UNSPEC;
1100 break;
6d2010ae 1101 }
316670eb
A
1102
1103 return (msc);
6d2010ae
A
1104}
1105
316670eb
A
1106__private_extern__ int
1107so_svc2tc(mbuf_svc_class_t svc)
1108{
1109 switch (svc) {
1110 case MBUF_SC_UNSPEC:
1111 return SO_TC_BE;
1112 case MBUF_SC_BK_SYS:
1113 return SO_TC_BK_SYS;
1114 case MBUF_SC_BK:
1115 return SO_TC_BK;
1116 case MBUF_SC_BE:
1117 return SO_TC_BE;
1118 case MBUF_SC_RD:
1119 return SO_TC_RD;
1120 case MBUF_SC_OAM:
1121 return SO_TC_OAM;
1122 case MBUF_SC_AV:
1123 return SO_TC_AV;
1124 case MBUF_SC_RV:
1125 return SO_TC_RV;
1126 case MBUF_SC_VI:
1127 return SO_TC_VI;
1128 case MBUF_SC_VO:
1129 return SO_TC_VO;
1130 case MBUF_SC_CTL:
1131 return SO_TC_CTL;
1132 default:
1133 return SO_TC_BE;
1134 }
1135}
1136
1137/*
39236c6e 1138 * LRO is turned on for AV streaming class.
316670eb 1139 */
39236c6e 1140void
316670eb
A
1141so_set_lro(struct socket *so, int optval)
1142{
39236c6e 1143 if (optval == SO_TC_AV) {
316670eb
A
1144 so->so_flags |= SOF_USELRO;
1145 } else {
39236c6e
A
1146 if (so->so_flags & SOF_USELRO) {
1147 /* transition to non LRO class */
1148 so->so_flags &= ~SOF_USELRO;
1149 struct inpcb *inp = sotoinpcb(so);
1150 struct tcpcb *tp = NULL;
1151 if (inp) {
1152 tp = intotcpcb(inp);
1153 if (tp && (tp->t_flagsext & TF_LRO_OFFLOADED)) {
1154 tcp_lro_remove_state(inp->inp_laddr,
1155 inp->inp_faddr,
1156 inp->inp_lport,
1157 inp->inp_fport);
1158 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
1159 }
1160 }
1161 }
316670eb
A
1162 }
1163}
6d2010ae 1164