]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/in_tclass.c
xnu-3789.70.16.tar.gz
[apple/xnu.git] / bsd / netinet / in_tclass.c
1 /*
2 * Copyright (c) 2009-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/systm.h>
30 #include <sys/kernel.h>
31 #include <sys/types.h>
32 #include <sys/filedesc.h>
33 #include <sys/file_internal.h>
34 #include <sys/proc.h>
35 #include <sys/socket.h>
36 #include <sys/socketvar.h>
37 #include <sys/errno.h>
38 #include <sys/protosw.h>
39 #include <sys/domain.h>
40 #include <sys/mbuf.h>
41 #include <sys/queue.h>
42 #include <sys/sysctl.h>
43
44 #include <net/if.h>
45 #include <net/if_var.h>
46 #include <net/route.h>
47
48 #include <netinet/in.h>
49 #include <netinet/in_var.h>
50 #include <netinet/in_pcb.h>
51 #include <netinet/ip.h>
52 #include <netinet/ip_var.h>
53 #include <netinet/ip6.h>
54 #include <netinet6/ip6_var.h>
55 #include <netinet/udp.h>
56 #include <netinet/udp_var.h>
57 #include <netinet/tcp.h>
58 #include <netinet/tcp_var.h>
59 #include <netinet/tcp_cc.h>
60 #include <netinet/lro_ext.h>
61 #include <netinet/in_tclass.h>
62
/*
 * One entry of a DSCP to mbuf service class mapping table.
 */
struct dcsp_msc_map {
	u_int8_t		dscp;	/* DSCP code point */
	mbuf_svc_class_t	msc;	/* mbuf service class paired with it */
};
67 static inline int so_throttle_best_effort(struct socket *, struct ifnet *);
68 static void set_dscp_to_wifi_ac_map(const struct dcsp_msc_map *, int);
69 static errno_t dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *, size_t,
70 struct dcsp_msc_map *);
71
72 static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */
73 static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */
74 static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */
75 decl_lck_mtx_data(static, tclass_lock_data);
76 static lck_mtx_t *tclass_lock = &tclass_lock_data;
77
78 SYSCTL_NODE(_net, OID_AUTO, qos,
79 CTLFLAG_RW|CTLFLAG_LOCKED, 0, "QoS");
80
81 static int sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS;
82 SYSCTL_PROC(_net_qos, OID_AUTO, default_netsvctype_to_dscp_map,
83 CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
84 0, 0, sysctl_default_netsvctype_to_dscp_map, "S", "");
85
86 static int sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS;
87 SYSCTL_PROC(_net_qos, OID_AUTO, dscp_to_wifi_ac_map,
88 CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
89 0, 0, sysctl_dscp_to_wifi_ac_map, "S", "");
90
91 static int sysctl_reset_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS;
92 SYSCTL_PROC(_net_qos, OID_AUTO, reset_dscp_to_wifi_ac_map,
93 CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
94 0, 0, sysctl_reset_dscp_to_wifi_ac_map, "I", "");
95
96 int net_qos_verbose = 0;
97 SYSCTL_INT(_net_qos, OID_AUTO, verbose,
98 CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_verbose, 0, "");
99
100 /*
101 * Fastlane QoS policy:
102 * By Default allow all apps to get traffic class to DSCP mapping
103 */
104 SYSCTL_NODE(_net_qos, OID_AUTO, policy,
105 CTLFLAG_RW|CTLFLAG_LOCKED, 0, "");
106
107 int net_qos_policy_restricted = 0;
108 SYSCTL_INT(_net_qos_policy, OID_AUTO, restricted,
109 CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_restricted, 0, "");
110
111 int net_qos_policy_restrict_avapps = 0;
112 SYSCTL_INT(_net_qos_policy, OID_AUTO, restrict_avapps,
113 CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_restrict_avapps, 0, "");
114
115 int net_qos_policy_wifi_enabled = 0;
116 SYSCTL_INT(_net_qos_policy, OID_AUTO, wifi_enabled,
117 CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_wifi_enabled, 0, "");
118
119 int net_qos_policy_capable_enabled = 0;
120 SYSCTL_INT(_net_qos_policy, OID_AUTO, capable_enabled,
121 CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_capable_enabled, 0, "");
122
/*
 * Socket traffic class from network service type.
 *
 * The table is indexed by network service type: so_set_net_service_type()
 * and so_tc_from_control() read sotc_by_netservicetype[netsvctype] once the
 * index has passed IS_VALID_NET_SERVICE_TYPE().
 */
const int sotc_by_netservicetype[_NET_SERVICE_TYPE_COUNT] = {
	SO_TC_BE,	/* NET_SERVICE_TYPE_BE */
	SO_TC_BK_SYS,	/* NET_SERVICE_TYPE_BK */
	SO_TC_VI,	/* NET_SERVICE_TYPE_SIG */
	SO_TC_VI,	/* NET_SERVICE_TYPE_VI */
	SO_TC_VO,	/* NET_SERVICE_TYPE_VO */
	SO_TC_RV,	/* NET_SERVICE_TYPE_RV */
	SO_TC_AV,	/* NET_SERVICE_TYPE_AV */
	SO_TC_OAM,	/* NET_SERVICE_TYPE_OAM */
	SO_TC_RD	/* NET_SERVICE_TYPE_RD */
};
137
/*
 * DSCP mappings for QoS Fastlane, based on network service types.
 * Each entry pairs one network service type with a DSCP value.
 */
static const
struct netsvctype_dscp_map fastlane_netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {
	{ NET_SERVICE_TYPE_BE,		_DSCP_DF },
	{ NET_SERVICE_TYPE_BK,		_DSCP_AF11 },
	{ NET_SERVICE_TYPE_SIG,		_DSCP_CS3 },
	{ NET_SERVICE_TYPE_VI,		_DSCP_AF41 },
	{ NET_SERVICE_TYPE_VO,		_DSCP_EF },
	{ NET_SERVICE_TYPE_RV,		_DSCP_CS4 },
	{ NET_SERVICE_TYPE_AV,		_DSCP_AF31 },
	{ NET_SERVICE_TYPE_OAM,		_DSCP_CS2 },
	{ NET_SERVICE_TYPE_RD,		_DSCP_AF21 },
};
153
154 static struct net_qos_dscp_map default_net_qos_dscp_map;
155
156 /*
157 * The size is one more than the max because DSCP start at zero
158 */
159 #define DSCP_ARRAY_SIZE (_MAX_DSCP + 1)
160
/*
 * The DSCP to UP mapping (via mbuf service class) for WiFi follows the
 * mapping that is implemented at the 802.11 driver level when the mbuf
 * service class is MBUF_SC_BE.
 *
 * This clashes with the recommended mapping documented by the IETF document
 * draft-szigeti-tsvwg-ieee-802-11e-01.txt but we keep the mapping to maintain
 * binary compatibility. Applications should use the network service type
 * socket option to select L2 QoS marking instead of IP_TOS or IPV6_TCLASS.
 */
171 static const struct dcsp_msc_map default_dscp_to_wifi_ac_map[] = {
172 { _DSCP_DF, MBUF_SC_BE }, /* RFC 2474 Standard */
173 { 1, MBUF_SC_BE }, /* */
174 { 2, MBUF_SC_BE }, /* */
175 { 3, MBUF_SC_BE }, /* */
176 { 4, MBUF_SC_BE }, /* */
177 { 5, MBUF_SC_BE }, /* */
178 { 6, MBUF_SC_BE }, /* */
179 { 7, MBUF_SC_BE }, /* */
180
181 { _DSCP_CS1, MBUF_SC_BK }, /* RFC 3662 Low-Priority Data */
182 { 9, MBUF_SC_BK }, /* */
183 { _DSCP_AF11, MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */
184 { 11, MBUF_SC_BK }, /* */
185 { _DSCP_AF12, MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */
186 { 13, MBUF_SC_BK }, /* */
187 { _DSCP_AF13, MBUF_SC_BK }, /* RFC 2597 High-Throughput Data */
188 { 15, MBUF_SC_BK }, /* */
189
190 { _DSCP_CS2, MBUF_SC_BK }, /* RFC 4594 OAM */
191 { 17, MBUF_SC_BK }, /* */
192 { _DSCP_AF21, MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */
193 { 19, MBUF_SC_BK }, /* */
194 { _DSCP_AF22, MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */
195 { 21, MBUF_SC_BK }, /* */
196 { _DSCP_AF23, MBUF_SC_BK }, /* RFC 2597 Low-Latency Data */
197 { 23, MBUF_SC_BK }, /* */
198
199 { _DSCP_CS3, MBUF_SC_BE }, /* RFC 2474 Broadcast Video */
200 { 25, MBUF_SC_BE }, /* */
201 { _DSCP_AF31, MBUF_SC_BE }, /* RFC 2597 Multimedia Streaming */
202 { 27, MBUF_SC_BE }, /* */
203 { _DSCP_AF32, MBUF_SC_BE }, /* RFC 2597 Multimedia Streaming */
204 { 29, MBUF_SC_BE }, /* */
205 { _DSCP_AF33, MBUF_SC_BE }, /* RFC 2597 Multimedia Streaming */
206 { 31, MBUF_SC_BE }, /* */
207
208 { _DSCP_CS4, MBUF_SC_VI }, /* RFC 2474 Real-Time Interactive */
209 { 33, MBUF_SC_VI }, /* */
210 { _DSCP_AF41, MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */
211 { 35, MBUF_SC_VI }, /* */
212 { _DSCP_AF42, MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */
213 { 37, MBUF_SC_VI }, /* */
214 { _DSCP_AF43, MBUF_SC_VI }, /* RFC 2597 Multimedia Conferencing */
215 { 39, MBUF_SC_VI }, /* */
216
217 { _DSCP_CS5, MBUF_SC_VI }, /* RFC 2474 Signaling */
218 { 41, MBUF_SC_VI }, /* */
219 { 42, MBUF_SC_VI }, /* */
220 { 43, MBUF_SC_VI }, /* */
221 { _DSCP_VA, MBUF_SC_VI }, /* RFC 5865 VOICE-ADMIT */
222 { 45, MBUF_SC_VI }, /* */
223 { _DSCP_EF, MBUF_SC_VI }, /* RFC 3246 Telephony */
224 { 47, MBUF_SC_VI }, /* */
225
226 { _DSCP_CS6, MBUF_SC_VO }, /* Wi-Fi WMM Certification: Chariot */
227 { 49, MBUF_SC_VO }, /* */
228 { 50, MBUF_SC_VO }, /* */
229 { 51, MBUF_SC_VO }, /* */
230 { 52, MBUF_SC_VO }, /* Wi-Fi WMM Certification: Sigma */
231 { 53, MBUF_SC_VO }, /* */
232 { 54, MBUF_SC_VO }, /* */
233 { 55, MBUF_SC_VO }, /* */
234
235 { _DSCP_CS7, MBUF_SC_VO }, /* Wi-Fi WMM Certification: Chariot */
236 { 57, MBUF_SC_VO }, /* */
237 { 58, MBUF_SC_VO }, /* */
238 { 59, MBUF_SC_VO }, /* */
239 { 60, MBUF_SC_VO }, /* */
240 { 61, MBUF_SC_VO }, /* */
241 { 62, MBUF_SC_VO }, /* */
242 { 63, MBUF_SC_VO }, /* */
243
244 { 255, MBUF_SC_UNSPEC } /* invalid DSCP to mark last entry */
245 };
246
247 mbuf_svc_class_t wifi_dscp_to_msc_array[DSCP_ARRAY_SIZE];
248
249 /*
250 * If there is no foreground activity on the interface for bg_switch_time
251 * seconds, the background connections can switch to foreground TCP
252 * congestion control.
253 */
254 #define TCP_BG_SWITCH_TIME 2 /* seconds */
255
256 #if (DEVELOPMENT || DEBUG)
257
258 extern char *proc_best_name(proc_t p);
259
260 static int tfp_count = 0;
261
262 static TAILQ_HEAD(, tclass_for_proc) tfp_head =
263 TAILQ_HEAD_INITIALIZER(tfp_head);
264
/*
 * Per-process traffic class override, linked on tfp_head.  An entry is
 * keyed by PID or, when tfp_pid is -1, by process name.
 */
struct tclass_for_proc {
	TAILQ_ENTRY(tclass_for_proc)	tfp_link;	/* linkage on tfp_head */
	int		tfp_class;	/* traffic class; SO_TC_UNSPEC is not applied to sockets */
	pid_t		tfp_pid;	/* PID, or -1 for a name based entry */
	char		tfp_pname[(2 * MAXCOMLEN) + 1];	/* process name of a name based entry */
	u_int32_t	tfp_qos_mode;	/* compared against QOS_MODE_MARKING_POLICY_* */
};
272
273 static int get_pid_tclass(struct so_tcdbg *);
274 static int get_pname_tclass(struct so_tcdbg *);
275 static int set_pid_tclass(struct so_tcdbg *);
276 static int set_pname_tclass(struct so_tcdbg *);
277 static int flush_pid_tclass(struct so_tcdbg *);
278 static int purge_tclass_for_proc(void);
279 static int flush_tclass_for_proc(void);
280 static void set_tclass_for_curr_proc(struct socket *);
281
282 /*
283 * Must be called with tclass_lock held
284 */
285 static struct tclass_for_proc *
286 find_tfp_by_pid(pid_t pid)
287 {
288 struct tclass_for_proc *tfp;
289
290 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
291 if (tfp->tfp_pid == pid)
292 break;
293 }
294 return (tfp);
295 }
296
297 /*
298 * Must be called with tclass_lock held
299 */
300 static struct tclass_for_proc *
301 find_tfp_by_pname(const char *pname)
302 {
303 struct tclass_for_proc *tfp;
304
305 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
306 if (strncmp(pname, tfp->tfp_pname,
307 sizeof (tfp->tfp_pname)) == 0)
308 break;
309 }
310 return (tfp);
311 }
312
313 __private_extern__ void
314 set_tclass_for_curr_proc(struct socket *so)
315 {
316 struct tclass_for_proc *tfp = NULL;
317 proc_t p = current_proc(); /* Not ref counted */
318 pid_t pid = proc_pid(p);
319 char *pname = proc_best_name(p);
320
321 lck_mtx_lock(tclass_lock);
322
323 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
324 if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 &&
325 strncmp(pname, tfp->tfp_pname,
326 sizeof (tfp->tfp_pname)) == 0)) {
327 if (tfp->tfp_class != SO_TC_UNSPEC)
328 so->so_traffic_class = tfp->tfp_class;
329
330 if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_ENABLE)
331 so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
332 else if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_DISABLE)
333 so->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
334 break;
335 }
336 }
337
338 lck_mtx_unlock(tclass_lock);
339 }
340
341 /*
342 * Purge entries with PIDs of exited processes
343 */
344 int
345 purge_tclass_for_proc(void)
346 {
347 int error = 0;
348 struct tclass_for_proc *tfp, *tvar;
349
350 lck_mtx_lock(tclass_lock);
351
352 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
353 proc_t p;
354
355 if (tfp->tfp_pid == -1)
356 continue;
357 if ((p = proc_find(tfp->tfp_pid)) == NULL) {
358 tfp_count--;
359 TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
360
361 _FREE(tfp, M_TEMP);
362 } else {
363 proc_rele(p);
364 }
365 }
366
367 lck_mtx_unlock(tclass_lock);
368
369 return (error);
370 }
371
372 /*
373 * Remove one entry
374 * Must be called with tclass_lock held
375 */
376 static void
377 free_tclass_for_proc(struct tclass_for_proc *tfp)
378 {
379 if (tfp == NULL)
380 return;
381 tfp_count--;
382 TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
383 _FREE(tfp, M_TEMP);
384 }
385
386 /*
387 * Remove all entries
388 */
389 int
390 flush_tclass_for_proc(void)
391 {
392 int error = 0;
393 struct tclass_for_proc *tfp, *tvar;
394
395 lck_mtx_lock(tclass_lock);
396
397 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
398 free_tclass_for_proc(tfp);
399 }
400
401 lck_mtx_unlock(tclass_lock);
402
403 return (error);
404
405 }
406
407 /*
408 * Must be called with tclass_lock held
409 */
410 static struct tclass_for_proc *
411 alloc_tclass_for_proc(pid_t pid, const char *pname)
412 {
413 struct tclass_for_proc *tfp;
414
415 if (pid == -1 && pname == NULL)
416 return (NULL);
417
418 tfp = _MALLOC(sizeof (struct tclass_for_proc), M_TEMP, M_NOWAIT|M_ZERO);
419 if (tfp == NULL)
420 return (NULL);
421
422 tfp->tfp_pid = pid;
423 /*
424 * Add per pid entries before per proc name so we can find
425 * a specific instance of a process before the general name base entry.
426 */
427 if (pid != -1) {
428 TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link);
429 } else {
430 strlcpy(tfp->tfp_pname, pname, sizeof (tfp->tfp_pname));
431 TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link);
432 }
433
434 tfp_count++;
435
436 return (tfp);
437 }
438
439 /*
440 * SO_TC_UNSPEC for tclass means to remove the entry
441 */
442 int
443 set_pid_tclass(struct so_tcdbg *so_tcdbg)
444 {
445 int error = EINVAL;
446 proc_t p = NULL;
447 struct filedesc *fdp;
448 struct fileproc *fp;
449 struct tclass_for_proc *tfp;
450 int i;
451 pid_t pid = so_tcdbg->so_tcdbg_pid;
452 int tclass = so_tcdbg->so_tcdbg_tclass;
453 int netsvctype = so_tcdbg->so_tcdbg_netsvctype;
454
455 p = proc_find(pid);
456 if (p == NULL) {
457 printf("%s proc_find(%d) failed\n", __func__, pid);
458 goto done;
459 }
460
461 /* Need a tfp */
462 lck_mtx_lock(tclass_lock);
463
464 tfp = find_tfp_by_pid(pid);
465 if (tfp == NULL) {
466 tfp = alloc_tclass_for_proc(pid, NULL);
467 if (tfp == NULL) {
468 lck_mtx_unlock(tclass_lock);
469 error = ENOBUFS;
470 goto done;
471 }
472 }
473 tfp->tfp_class = tclass;
474 tfp->tfp_qos_mode = so_tcdbg->so_tcbbg_qos_mode;
475
476 lck_mtx_unlock(tclass_lock);
477
478 if (tfp != NULL) {
479 proc_fdlock(p);
480
481 fdp = p->p_fd;
482 for (i = 0; i < fdp->fd_nfiles; i++) {
483 struct socket *so;
484
485 fp = fdp->fd_ofiles[i];
486 if (fp == NULL ||
487 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
488 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
489 continue;
490
491 so = (struct socket *)fp->f_fglob->fg_data;
492 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6)
493 continue;
494
495 socket_lock(so, 1);
496 if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_ENABLE)
497 so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
498 else if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_DISABLE)
499 so->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
500 socket_unlock(so, 1);
501
502 if (netsvctype != _NET_SERVICE_TYPE_UNSPEC)
503 error = sock_setsockopt(so, SOL_SOCKET,
504 SO_NET_SERVICE_TYPE, &netsvctype, sizeof(int));
505 if (tclass != SO_TC_UNSPEC)
506 error = sock_setsockopt(so, SOL_SOCKET,
507 SO_TRAFFIC_CLASS, &tclass, sizeof(int));
508
509 }
510
511 proc_fdunlock(p);
512 }
513
514 error = 0;
515 done:
516 if (p != NULL)
517 proc_rele(p);
518
519 return (error);
520 }
521
522 int
523 set_pname_tclass(struct so_tcdbg *so_tcdbg)
524 {
525 int error = EINVAL;
526 struct tclass_for_proc *tfp;
527
528 lck_mtx_lock(tclass_lock);
529
530 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
531 if (tfp == NULL) {
532 tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname);
533 if (tfp == NULL) {
534 lck_mtx_unlock(tclass_lock);
535 error = ENOBUFS;
536 goto done;
537 }
538 }
539 tfp->tfp_class = so_tcdbg->so_tcdbg_tclass;
540 tfp->tfp_qos_mode = so_tcdbg->so_tcbbg_qos_mode;
541
542 lck_mtx_unlock(tclass_lock);
543
544 error = 0;
545 done:
546
547 return (error);
548 }
549
550 static int
551 flush_pid_tclass(struct so_tcdbg *so_tcdbg)
552 {
553 pid_t pid = so_tcdbg->so_tcdbg_pid;
554 int tclass = so_tcdbg->so_tcdbg_tclass;
555 struct filedesc *fdp;
556 int error = EINVAL;
557 proc_t p;
558 int i;
559
560 p = proc_find(pid);
561 if (p == PROC_NULL) {
562 printf("%s proc_find(%d) failed\n", __func__, pid);
563 goto done;
564 }
565
566 proc_fdlock(p);
567 fdp = p->p_fd;
568 for (i = 0; i < fdp->fd_nfiles; i++) {
569 struct socket *so;
570 struct fileproc *fp;
571
572 fp = fdp->fd_ofiles[i];
573 if (fp == NULL ||
574 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
575 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
576 continue;
577
578 so = (struct socket *)fp->f_fglob->fg_data;
579 error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass,
580 sizeof (tclass));
581 if (error != 0) {
582 printf("%s: setsockopt(SO_FLUSH) (so=0x%llx, fd=%d, "
583 "tclass=%d) failed %d\n", __func__,
584 (uint64_t)VM_KERNEL_ADDRPERM(so), i, tclass,
585 error);
586 error = 0;
587 }
588 }
589 proc_fdunlock(p);
590
591 error = 0;
592 done:
593 if (p != PROC_NULL)
594 proc_rele(p);
595
596 return (error);
597 }
598
599 int
600 get_pid_tclass(struct so_tcdbg *so_tcdbg)
601 {
602 int error = EINVAL;
603 proc_t p = NULL;
604 struct tclass_for_proc *tfp;
605 pid_t pid = so_tcdbg->so_tcdbg_pid;
606
607 so_tcdbg->so_tcdbg_tclass = SO_TC_UNSPEC; /* Means not set */
608
609 p = proc_find(pid);
610 if (p == NULL) {
611 printf("%s proc_find(%d) failed\n", __func__, pid);
612 goto done;
613 }
614
615 /* Need a tfp */
616 lck_mtx_lock(tclass_lock);
617
618 tfp = find_tfp_by_pid(pid);
619 if (tfp != NULL) {
620 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
621 so_tcdbg->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
622 error = 0;
623 }
624 lck_mtx_unlock(tclass_lock);
625 done:
626 if (p != NULL)
627 proc_rele(p);
628
629 return (error);
630 }
631
632 int
633 get_pname_tclass(struct so_tcdbg *so_tcdbg)
634 {
635 int error = EINVAL;
636 struct tclass_for_proc *tfp;
637
638 so_tcdbg->so_tcdbg_tclass = SO_TC_UNSPEC; /* Means not set */
639
640 /* Need a tfp */
641 lck_mtx_lock(tclass_lock);
642
643 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
644 if (tfp != NULL) {
645 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
646 so_tcdbg->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
647 error = 0;
648 }
649 lck_mtx_unlock(tclass_lock);
650
651 return (error);
652 }
653
654 static int
655 delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg)
656 {
657 int error = EINVAL;
658 pid_t pid = so_tcdbg->so_tcdbg_pid;
659 struct tclass_for_proc *tfp = NULL;
660
661 lck_mtx_lock(tclass_lock);
662
663 if (pid != -1)
664 tfp = find_tfp_by_pid(pid);
665 else
666 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
667
668 if (tfp != NULL) {
669 free_tclass_for_proc(tfp);
670 error = 0;
671 }
672
673 lck_mtx_unlock(tclass_lock);
674
675 return (error);
676 }
677
678 /*
679 * Setting options requires privileges
680 */
681 __private_extern__ int
682 so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg)
683 {
684 int error = 0;
685
686 if ((so->so_state & SS_PRIV) == 0)
687 return (EPERM);
688
689 socket_unlock(so, 0);
690
691 switch (so_tcdbg->so_tcdbg_cmd) {
692 case SO_TCDBG_PID:
693 error = set_pid_tclass(so_tcdbg);
694 break;
695
696 case SO_TCDBG_PNAME:
697 error = set_pname_tclass(so_tcdbg);
698 break;
699
700 case SO_TCDBG_PURGE:
701 error = purge_tclass_for_proc();
702 break;
703
704 case SO_TCDBG_FLUSH:
705 error = flush_tclass_for_proc();
706 break;
707
708 case SO_TCDBG_DELETE:
709 error = delete_tclass_for_pid_pname(so_tcdbg);
710 break;
711
712 case SO_TCDBG_TCFLUSH_PID:
713 error = flush_pid_tclass(so_tcdbg);
714 break;
715
716 default:
717 error = EINVAL;
718 break;
719 }
720
721 socket_lock(so, 0);
722
723 return (error);
724 }
725
726 /*
727 * Not required to be privileged to get
728 */
729 __private_extern__ int
730 sogetopt_tcdbg(struct socket *so, struct sockopt *sopt)
731 {
732 int error = 0;
733 struct so_tcdbg so_tcdbg;
734 void *buf = NULL;
735 size_t len = sopt->sopt_valsize;
736
737 error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg),
738 sizeof (struct so_tcdbg));
739 if (error != 0)
740 return (error);
741
742 sopt->sopt_valsize = len;
743
744 socket_unlock(so, 0);
745
746 switch (so_tcdbg.so_tcdbg_cmd) {
747 case SO_TCDBG_PID:
748 error = get_pid_tclass(&so_tcdbg);
749 break;
750
751 case SO_TCDBG_PNAME:
752 error = get_pname_tclass(&so_tcdbg);
753 break;
754
755 case SO_TCDBG_COUNT:
756 lck_mtx_lock(tclass_lock);
757 so_tcdbg.so_tcdbg_count = tfp_count;
758 lck_mtx_unlock(tclass_lock);
759 break;
760
761 case SO_TCDBG_LIST: {
762 struct tclass_for_proc *tfp;
763 int n, alloc_count;
764 struct so_tcdbg *ptr;
765
766 lck_mtx_lock(tclass_lock);
767 if ((alloc_count = tfp_count) == 0) {
768 lck_mtx_unlock(tclass_lock);
769 error = EINVAL;
770 break;
771 }
772 len = alloc_count * sizeof (struct so_tcdbg);
773 lck_mtx_unlock(tclass_lock);
774
775 buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
776 if (buf == NULL) {
777 error = ENOBUFS;
778 break;
779 }
780
781 lck_mtx_lock(tclass_lock);
782 n = 0;
783 ptr = (struct so_tcdbg *)buf;
784 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
785 if (++n > alloc_count)
786 break;
787 if (tfp->tfp_pid != -1) {
788 ptr->so_tcdbg_cmd = SO_TCDBG_PID;
789 ptr->so_tcdbg_pid = tfp->tfp_pid;
790 } else {
791 ptr->so_tcdbg_cmd = SO_TCDBG_PNAME;
792 ptr->so_tcdbg_pid = -1;
793 strlcpy(ptr->so_tcdbg_pname,
794 tfp->tfp_pname,
795 sizeof (ptr->so_tcdbg_pname));
796 }
797 ptr->so_tcdbg_tclass = tfp->tfp_class;
798 ptr->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
799 ptr++;
800 }
801
802 lck_mtx_unlock(tclass_lock);
803 }
804 break;
805
806 default:
807 error = EINVAL;
808 break;
809 }
810
811 socket_lock(so, 0);
812
813 if (error == 0) {
814 if (buf == NULL) {
815 error = sooptcopyout(sopt, &so_tcdbg,
816 sizeof (struct so_tcdbg));
817 } else {
818 error = sooptcopyout(sopt, buf, len);
819 _FREE(buf, M_TEMP);
820 }
821 }
822 return (error);
823 }
824
825 #endif /* (DEVELOPMENT || DEBUG) */
826
827 int
828 so_get_netsvc_marking_level(struct socket *so)
829 {
830 int marking_level = NETSVC_MRKNG_UNKNOWN;
831 struct ifnet *ifp = NULL;
832
833 switch (SOCK_DOM(so)) {
834 case PF_INET: {
835 struct inpcb *inp = sotoinpcb(so);
836
837 if (inp != NULL)
838 ifp = inp->inp_last_outifp;
839 break;
840 }
841 case PF_INET6: {
842 struct in6pcb *in6p = sotoin6pcb(so);
843
844 if (in6p != NULL)
845 ifp = in6p->in6p_last_outifp;
846 break;
847 }
848 default:
849 break;
850 }
851 if (ifp != NULL) {
852 if ((ifp->if_eflags &
853 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) ==
854 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) {
855 if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED))
856 marking_level = NETSVC_MRKNG_LVL_L3L2_ALL;
857 else
858 marking_level = NETSVC_MRKNG_LVL_L3L2_BK;
859 } else {
860 marking_level = NETSVC_MRKNG_LVL_L2;
861 }
862 }
863 return (marking_level);
864 }
865
866 __private_extern__ int
867 so_set_traffic_class(struct socket *so, int optval)
868 {
869 int error = 0;
870
871 if (optval < SO_TC_BE || optval > SO_TC_CTL) {
872 error = EINVAL;
873 } else {
874 switch (optval) {
875 case _SO_TC_BK:
876 optval = SO_TC_BK;
877 break;
878 case _SO_TC_VI:
879 optval = SO_TC_VI;
880 break;
881 case _SO_TC_VO:
882 optval = SO_TC_VO;
883 break;
884 default:
885 if (!SO_VALID_TC(optval))
886 error = EINVAL;
887 break;
888 }
889
890 if (error == 0) {
891 int oldval = so->so_traffic_class;
892
893 VERIFY(SO_VALID_TC(optval));
894 so->so_traffic_class = optval;
895
896 if ((SOCK_DOM(so) == PF_INET ||
897 SOCK_DOM(so) == PF_INET6) &&
898 SOCK_TYPE(so) == SOCK_STREAM)
899 set_tcp_stream_priority(so);
900
901 if ((SOCK_DOM(so) == PF_INET ||
902 SOCK_DOM(so) == PF_INET6) &&
903 optval != oldval && (optval == SO_TC_BK_SYS ||
904 oldval == SO_TC_BK_SYS)) {
905 /*
906 * If the app switches from BK_SYS to something
907 * else, resume the socket if it was suspended.
908 */
909 if (oldval == SO_TC_BK_SYS)
910 inp_reset_fc_state(so->so_pcb);
911
912 SOTHROTTLELOG("throttle[%d]: so 0x%llx "
913 "[%d,%d] opportunistic %s\n", so->last_pid,
914 (uint64_t)VM_KERNEL_ADDRPERM(so),
915 SOCK_DOM(so), SOCK_TYPE(so),
916 (optval == SO_TC_BK_SYS) ? "ON" : "OFF");
917 }
918 }
919 }
920 return (error);
921 }
922
923 __private_extern__ int
924 so_set_net_service_type(struct socket *so, int netsvctype)
925 {
926 int sotc;
927 int error;
928
929 if (!IS_VALID_NET_SERVICE_TYPE(netsvctype))
930 return (EINVAL);
931
932 sotc = sotc_by_netservicetype[netsvctype];
933 error = so_set_traffic_class(so, sotc);
934 if (error != 0)
935 return (error);
936 so->so_netsvctype = netsvctype;
937 so->so_flags1 |= SOF1_TC_NET_SERV_TYPE;
938
939 return (0);
940 }
941
942 __private_extern__ void
943 so_set_default_traffic_class(struct socket *so)
944 {
945 so->so_traffic_class = SO_TC_BE;
946
947 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)) {
948 if (net_qos_policy_restricted == 0)
949 so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
950 #if (DEVELOPMENT || DEBUG)
951 if (tfp_count > 0)
952 set_tclass_for_curr_proc(so);
953 #endif /* (DEVELOPMENT || DEBUG) */
954 }
955 }
956
957 __private_extern__ int
958 so_set_opportunistic(struct socket *so, int optval)
959 {
960 return (so_set_traffic_class(so, (optval == 0) ?
961 SO_TC_BE : SO_TC_BK_SYS));
962 }
963
964 __private_extern__ int
965 so_get_opportunistic(struct socket *so)
966 {
967 return (so->so_traffic_class == SO_TC_BK_SYS);
968 }
969
970 __private_extern__ int
971 so_tc_from_control(struct mbuf *control, int *out_netsvctype)
972 {
973 struct cmsghdr *cm;
974 int sotc = SO_TC_UNSPEC;
975
976 *out_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
977
978 for (cm = M_FIRST_CMSGHDR(control); cm != NULL;
979 cm = M_NXT_CMSGHDR(control, cm)) {
980 int val;
981
982 if (cm->cmsg_len < sizeof (struct cmsghdr))
983 break;
984 if (cm->cmsg_level != SOL_SOCKET ||
985 cm->cmsg_len != CMSG_LEN(sizeof(int)))
986 continue;
987 val = *(int *)(void *)CMSG_DATA(cm);
988 /*
989 * The first valid option wins
990 */
991 switch (cm->cmsg_type) {
992 case SO_TRAFFIC_CLASS:
993 if (SO_VALID_TC(val)) {
994 sotc = val;
995 return (sotc);
996 /* NOT REACHED */
997 } else if (val < SO_TC_NET_SERVICE_OFFSET) {
998 break;
999 }
1000 /*
1001 * Handle the case SO_NET_SERVICE_TYPE values are
1002 * passed using SO_TRAFFIC_CLASS
1003 */
1004 val = val - SO_TC_NET_SERVICE_OFFSET;
1005 /* FALLTHROUGH */
1006 case SO_NET_SERVICE_TYPE:
1007 if (!IS_VALID_NET_SERVICE_TYPE(val))
1008 break;
1009 *out_netsvctype = val;
1010 sotc = sotc_by_netservicetype[val];
1011 return (sotc);
1012 /* NOT REACHED */
1013 default:
1014 break;
1015 }
1016 }
1017
1018 return (sotc);
1019 }
1020
1021 __private_extern__ void
1022 so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off)
1023 {
1024 uint32_t mtc = m_get_traffic_class(m);
1025
1026 if (mtc >= SO_TC_STATS_MAX)
1027 mtc = MBUF_TC_BE;
1028
1029 so->so_tc_stats[mtc].rxpackets += 1;
1030 so->so_tc_stats[mtc].rxbytes +=
1031 ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off;
1032 }
1033
1034 __private_extern__ void
1035 so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes,
1036 uint32_t mtc)
1037 {
1038 if (mtc >= SO_TC_STATS_MAX)
1039 mtc = MBUF_TC_BE;
1040
1041 so->so_tc_stats[mtc].rxpackets += pkts;
1042 so->so_tc_stats[mtc].rxbytes += bytes;
1043 }
1044
1045 static inline int
1046 so_throttle_best_effort(struct socket *so, struct ifnet *ifp)
1047 {
1048 u_int32_t uptime = net_uptime();
1049 return (soissrcbesteffort(so) &&
1050 net_io_policy_throttle_best_effort == 1 &&
1051 ifp->if_rt_sendts > 0 &&
1052 (int)(uptime - ifp->if_rt_sendts) <= TCP_BG_SWITCH_TIME);
1053 }
1054
1055 __private_extern__ void
1056 set_tcp_stream_priority(struct socket *so)
1057 {
1058 struct inpcb *inp = sotoinpcb(so);
1059 struct tcpcb *tp = intotcpcb(inp);
1060 struct ifnet *outifp;
1061 u_char old_cc = tp->tcp_cc_index;
1062 int recvbg = IS_TCP_RECV_BG(so);
1063 bool is_local = false, fg_active = false;
1064 u_int32_t uptime;
1065
1066 VERIFY((SOCK_CHECK_DOM(so, PF_INET) ||
1067 SOCK_CHECK_DOM(so, PF_INET6)) &&
1068 SOCK_CHECK_TYPE(so, SOCK_STREAM) &&
1069 SOCK_CHECK_PROTO(so, IPPROTO_TCP));
1070
1071 /* Return if the socket is in a terminal state */
1072 if (inp->inp_state == INPCB_STATE_DEAD)
1073 return;
1074
1075 outifp = inp->inp_last_outifp;
1076 uptime = net_uptime();
1077
1078 /*
1079 * If the socket was marked as a background socket or if the
1080 * traffic class is set to background with traffic class socket
1081 * option then make both send and recv side of the stream to be
1082 * background. The variable sotcdb which can be set with sysctl
1083 * is used to disable these settings for testing.
1084 */
1085 if (outifp == NULL || (outifp->if_flags & IFF_LOOPBACK))
1086 is_local = true;
1087
1088 /* Check if there has been recent foreground activity */
1089 if (outifp != NULL) {
1090 /*
1091 * If the traffic source is background, check if
1092 * if it can be switched to foreground. This can
1093 * happen when there is no indication of foreground
1094 * activity.
1095 */
1096 if (soissrcbackground(so) &&
1097 ((outifp->if_fg_sendts > 0 &&
1098 (int)(uptime - outifp->if_fg_sendts) <=
1099 TCP_BG_SWITCH_TIME) || net_io_policy_throttled))
1100 fg_active = true;
1101
1102 /*
1103 * The traffic source is best-effort -- check if
1104 * the policy to throttle best effort is enabled
1105 * and there was realtime activity on this
1106 * interface recently. If this is true, enable
1107 * algorithms that respond to increased latency
1108 * on best-effort traffic.
1109 */
1110 if (so_throttle_best_effort(so, outifp))
1111 fg_active = true;
1112 }
1113
1114 /*
1115 * System initiated background traffic like cloud uploads should
1116 * always use background delay sensitive algorithms. This will
1117 * make the stream more responsive to other streams on the user's
1118 * network and it will minimize latency induced.
1119 */
1120 if (fg_active || IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
1121 /*
1122 * If the interface that the connection is using is
1123 * loopback, do not use background congestion
1124 * control algorithm.
1125 *
1126 * If there has been recent foreground activity or if
1127 * there was an indication that a foreground application
1128 * is going to use networking (net_io_policy_throttled),
1129 * switch the backgroung streams to use background
1130 * congestion control algorithm. Otherwise, even background
1131 * flows can move into foreground.
1132 */
1133 if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0 || is_local ||
1134 !IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
1135 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
1136 tcp_set_foreground_cc(so);
1137 } else {
1138 if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX)
1139 tcp_set_background_cc(so);
1140 }
1141
1142 /* Set receive side background flags */
1143 if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0 || is_local ||
1144 !IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
1145 tcp_clear_recv_bg(so);
1146 } else {
1147 tcp_set_recv_bg(so);
1148 }
1149 } else {
1150 tcp_clear_recv_bg(so);
1151 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
1152 tcp_set_foreground_cc(so);
1153 }
1154
1155 if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) {
1156 SOTHROTTLELOG("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; "
1157 "%s recv\n", so->last_pid,
1158 (uint64_t)VM_KERNEL_ADDRPERM(so),
1159 SOCK_DOM(so), SOCK_TYPE(so),
1160 (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ?
1161 "background" : "foreground",
1162 IS_TCP_RECV_BG(so) ? "background" : "foreground");
1163 }
1164 }
1165
1166 /*
1167 * Set traffic class to an IPv4 or IPv6 packet
1168 * - mark the mbuf
1169 * - set the DSCP code following the WMM mapping
1170 */
1171 __private_extern__ void
1172 set_packet_service_class(struct mbuf *m, struct socket *so,
1173 int sotc, u_int32_t flags)
1174 {
1175 mbuf_svc_class_t msc = MBUF_SC_BE; /* Best effort by default */
1176 struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */
1177
1178 if (!(m->m_flags & M_PKTHDR))
1179 return;
1180
1181 /*
1182 * Here is the precedence:
1183 * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all
1184 * 2) Traffic class passed via ancillary data to sendmsdg(2)
1185 * 3) Traffic class socket option last
1186 */
1187 if (sotc != SO_TC_UNSPEC) {
1188 VERIFY(SO_VALID_TC(sotc));
1189 msc = so_tc2msc(sotc);
1190 /* Assert because tc must have been valid */
1191 VERIFY(MBUF_VALID_SC(msc));
1192 }
1193
1194 /*
1195 * If TRAFFIC_MGT_SO_BACKGROUND is set or policy to throttle
1196 * best effort is set, depress the priority.
1197 */
1198 if (!IS_MBUF_SC_BACKGROUND(msc) && soisthrottled(so))
1199 msc = MBUF_SC_BK;
1200
1201 if (IS_MBUF_SC_BESTEFFORT(msc) && inp->inp_last_outifp != NULL &&
1202 so_throttle_best_effort(so, inp->inp_last_outifp))
1203 msc = MBUF_SC_BK;
1204
1205 if (soissrcbackground(so))
1206 m->m_pkthdr.pkt_flags |= PKTF_SO_BACKGROUND;
1207
1208 if (soissrcrealtime(so) || IS_MBUF_SC_REALTIME(msc))
1209 m->m_pkthdr.pkt_flags |= PKTF_SO_REALTIME;
1210 /*
1211 * Set the traffic class in the mbuf packet header svc field
1212 */
1213 if (sotcdb & SOTCDB_NO_MTC)
1214 goto no_mbtc;
1215
1216 /*
1217 * Elevate service class if the packet is a pure TCP ACK.
1218 * We can do this only when the flow is not a background
1219 * flow and the outgoing interface supports
1220 * transmit-start model.
1221 */
1222 if (!IS_MBUF_SC_BACKGROUND(msc) &&
1223 (flags & (PKT_SCF_TCP_ACK | PKT_SCF_TCP_SYN)) != 0)
1224 msc = MBUF_SC_CTL;
1225
1226 (void) m_set_service_class(m, msc);
1227
1228 /*
1229 * Set the privileged traffic auxiliary flag if applicable,
1230 * or clear it.
1231 */
1232 if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) &&
1233 msc != MBUF_SC_UNSPEC)
1234 m->m_pkthdr.pkt_flags |= PKTF_PRIO_PRIVILEGED;
1235 else
1236 m->m_pkthdr.pkt_flags &= ~PKTF_PRIO_PRIVILEGED;
1237
1238 no_mbtc:
1239 /*
1240 * For TCP with background traffic class switch CC algo based on sysctl
1241 */
1242 if (so->so_type == SOCK_STREAM)
1243 set_tcp_stream_priority(so);
1244
1245 so_tc_update_stats(m, so, msc);
1246 }
1247
1248 __private_extern__ void
1249 so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc)
1250 {
1251 mbuf_traffic_class_t mtc;
1252
1253 /*
1254 * Assume socket and mbuf traffic class values are the same
1255 * Also assume the socket lock is held. Note that the stats
1256 * at the socket layer are reduced down to the legacy traffic
1257 * classes; we could/should potentially expand so_tc_stats[].
1258 */
1259 mtc = MBUF_SC2TC(msc);
1260 VERIFY(mtc < SO_TC_STATS_MAX);
1261 so->so_tc_stats[mtc].txpackets += 1;
1262 so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len;
1263 }
1264
1265 __private_extern__ void
1266 socket_tclass_init(void)
1267 {
1268 _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX);
1269
1270 tclass_lck_grp_attr = lck_grp_attr_alloc_init();
1271 tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr);
1272 tclass_lck_attr = lck_attr_alloc_init();
1273 lck_mtx_init(tclass_lock, tclass_lck_grp, tclass_lck_attr);
1274 }
1275
1276 __private_extern__ mbuf_svc_class_t
1277 so_tc2msc(int tc)
1278 {
1279 mbuf_svc_class_t msc;
1280
1281 switch (tc) {
1282 case SO_TC_BK_SYS:
1283 msc = MBUF_SC_BK_SYS;
1284 break;
1285 case SO_TC_BK:
1286 case _SO_TC_BK:
1287 msc = MBUF_SC_BK;
1288 break;
1289 case SO_TC_BE:
1290 msc = MBUF_SC_BE;
1291 break;
1292 case SO_TC_RD:
1293 msc = MBUF_SC_RD;
1294 break;
1295 case SO_TC_OAM:
1296 msc = MBUF_SC_OAM;
1297 break;
1298 case SO_TC_AV:
1299 msc = MBUF_SC_AV;
1300 break;
1301 case SO_TC_RV:
1302 msc = MBUF_SC_RV;
1303 break;
1304 case SO_TC_VI:
1305 case _SO_TC_VI:
1306 msc = MBUF_SC_VI;
1307 break;
1308 case SO_TC_VO:
1309 case _SO_TC_VO:
1310 msc = MBUF_SC_VO;
1311 break;
1312 case SO_TC_CTL:
1313 msc = MBUF_SC_CTL;
1314 break;
1315 case SO_TC_ALL:
1316 default:
1317 msc = MBUF_SC_UNSPEC;
1318 break;
1319 }
1320
1321 return (msc);
1322 }
1323
1324 __private_extern__ int
1325 so_svc2tc(mbuf_svc_class_t svc)
1326 {
1327 switch (svc) {
1328 case MBUF_SC_BK_SYS:
1329 return (SO_TC_BK_SYS);
1330 case MBUF_SC_BK:
1331 return (SO_TC_BK);
1332 case MBUF_SC_BE:
1333 return (SO_TC_BE);
1334 case MBUF_SC_RD:
1335 return (SO_TC_RD);
1336 case MBUF_SC_OAM:
1337 return (SO_TC_OAM);
1338 case MBUF_SC_AV:
1339 return (SO_TC_AV);
1340 case MBUF_SC_RV:
1341 return (SO_TC_RV);
1342 case MBUF_SC_VI:
1343 return (SO_TC_VI);
1344 case MBUF_SC_VO:
1345 return (SO_TC_VO);
1346 case MBUF_SC_CTL:
1347 return (SO_TC_CTL);
1348 case MBUF_SC_UNSPEC:
1349 default:
1350 return (SO_TC_BE);
1351 }
1352 }
1353
1354 /*
1355 * LRO is turned on for AV streaming class.
1356 */
1357 void
1358 so_set_lro(struct socket *so, int optval)
1359 {
1360 if (optval == SO_TC_AV) {
1361 so->so_flags |= SOF_USELRO;
1362 } else {
1363 if (so->so_flags & SOF_USELRO) {
1364 /* transition to non LRO class */
1365 so->so_flags &= ~SOF_USELRO;
1366 struct inpcb *inp = sotoinpcb(so);
1367 struct tcpcb *tp = NULL;
1368 if (inp) {
1369 tp = intotcpcb(inp);
1370 if (tp && (tp->t_flagsext & TF_LRO_OFFLOADED)) {
1371 tcp_lro_remove_state(inp->inp_laddr,
1372 inp->inp_faddr,
1373 inp->inp_lport,
1374 inp->inp_fport);
1375 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
1376 }
1377 }
1378 }
1379 }
1380 }
1381
1382 static size_t
1383 sotc_index(int sotc)
1384 {
1385 switch (sotc) {
1386 case SO_TC_BK_SYS:
1387 return (SOTCIX_BK_SYS);
1388 case _SO_TC_BK:
1389 case SO_TC_BK:
1390 return (SOTCIX_BK);
1391
1392 case SO_TC_BE:
1393 return (SOTCIX_BE);
1394 case SO_TC_RD:
1395 return (SOTCIX_RD);
1396 case SO_TC_OAM:
1397 return (SOTCIX_OAM);
1398
1399 case SO_TC_AV:
1400 return (SOTCIX_AV);
1401 case SO_TC_RV:
1402 return (SOTCIX_RV);
1403 case _SO_TC_VI:
1404 case SO_TC_VI:
1405 return (SOTCIX_VI);
1406
1407 case _SO_TC_VO:
1408 case SO_TC_VO:
1409 return (SOTCIX_VO);
1410 case SO_TC_CTL:
1411 return (SOTCIX_CTL);
1412
1413 default:
1414 break;
1415 }
1416 /*
1417 * Unknown traffic class value
1418 */
1419 return (SIZE_T_MAX);
1420 }
1421
1422 /*
1423 * Pass NULL ifp for default map
1424 */
1425 static errno_t
1426 set_netsvctype_dscp_map(size_t in_count,
1427 const struct netsvctype_dscp_map *netsvctype_dscp_map)
1428 {
1429 size_t i;
1430 struct net_qos_dscp_map *net_qos_dscp_map = NULL;
1431 int netsvctype;
1432
1433 /*
1434 * Do not accept more that max number of distinct DSCPs
1435 */
1436 if (in_count > _MAX_DSCP || netsvctype_dscp_map == NULL)
1437 return (EINVAL);
1438
1439 /*
1440 * Validate input parameters
1441 */
1442 for (i = 0; i < in_count; i++) {
1443 if (!IS_VALID_NET_SERVICE_TYPE(netsvctype_dscp_map[i].netsvctype))
1444 return (EINVAL);
1445 if (netsvctype_dscp_map[i].dscp > _MAX_DSCP)
1446 return (EINVAL);
1447 }
1448
1449 net_qos_dscp_map = &default_net_qos_dscp_map;
1450
1451 for (i = 0; i < in_count; i++) {
1452 netsvctype = netsvctype_dscp_map[i].netsvctype;
1453
1454 net_qos_dscp_map->netsvctype_to_dscp[netsvctype] =
1455 netsvctype_dscp_map[i].dscp;
1456 }
1457 for (netsvctype = 0; netsvctype < _NET_SERVICE_TYPE_COUNT; netsvctype++) {
1458 switch (netsvctype) {
1459 case NET_SERVICE_TYPE_BE:
1460 case NET_SERVICE_TYPE_BK:
1461 case NET_SERVICE_TYPE_VI:
1462 case NET_SERVICE_TYPE_VO:
1463 case NET_SERVICE_TYPE_RV:
1464 case NET_SERVICE_TYPE_AV:
1465 case NET_SERVICE_TYPE_OAM:
1466 case NET_SERVICE_TYPE_RD: {
1467 int sotcix;
1468
1469 sotcix = sotc_index(sotc_by_netservicetype[netsvctype]);
1470 net_qos_dscp_map->sotc_to_dscp[sotcix] =
1471 netsvctype_dscp_map[netsvctype].dscp;
1472 break;
1473 }
1474 case NET_SERVICE_TYPE_SIG:
1475 /* Signaling does not have its own traffic class */
1476 break;
1477 default:
1478 /* We should not be here */
1479 ASSERT(0);
1480 }
1481 }
1482 /* Network control socket traffic class is always best effort */
1483 net_qos_dscp_map->sotc_to_dscp[SOTCIX_CTL] = _DSCP_DF;
1484
1485 /* Backround socket traffic class DSCP same as backround system */
1486 net_qos_dscp_map->sotc_to_dscp[SOTCIX_BK] =
1487 net_qos_dscp_map->sotc_to_dscp[SOTCIX_BK_SYS];
1488
1489 return (0);
1490 }
1491
1492 /*
1493 * out_count is an input/ouput parameter
1494 */
1495 static errno_t
1496 get_netsvctype_dscp_map(size_t *out_count,
1497 struct netsvctype_dscp_map *netsvctype_dscp_map)
1498 {
1499 size_t i;
1500 struct net_qos_dscp_map *net_qos_dscp_map = NULL;
1501
1502 /*
1503 * Do not accept more that max number of distinct DSCPs
1504 */
1505 if (out_count == NULL || netsvctype_dscp_map == NULL)
1506 return (EINVAL);
1507 if (*out_count > _MAX_DSCP)
1508 return (EINVAL);
1509
1510 net_qos_dscp_map = &default_net_qos_dscp_map;
1511
1512 for (i = 0; i < MIN(_NET_SERVICE_TYPE_COUNT, *out_count); i++) {
1513 netsvctype_dscp_map[i].netsvctype = i;
1514 netsvctype_dscp_map[i].dscp = net_qos_dscp_map->netsvctype_to_dscp[i];
1515
1516 }
1517 *out_count = i;
1518
1519 return (0);
1520 }
1521
1522 void
1523 net_qos_map_init()
1524 {
1525 errno_t error;
1526
1527 /*
1528 * By default use the Fastlane DSCP mappngs
1529 */
1530 error = set_netsvctype_dscp_map(_NET_SERVICE_TYPE_COUNT,
1531 fastlane_netsvctype_dscp_map);
1532 ASSERT(error == 0);
1533
1534 /*
1535 * No DSCP mapping for network control
1536 */
1537 default_net_qos_dscp_map.sotc_to_dscp[SOTCIX_CTL] = _DSCP_DF;
1538
1539 set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1);
1540 }
1541
1542 int
1543 sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS
1544 {
1545 #pragma unused(oidp, arg1, arg2)
1546 int error = 0;
1547 const size_t max_netsvctype_to_dscp_map_len =
1548 _NET_SERVICE_TYPE_COUNT * sizeof(struct netsvctype_dscp_map);
1549 size_t len;
1550 struct netsvctype_dscp_map netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT];
1551 size_t count;
1552
1553 if (req->oldptr == USER_ADDR_NULL) {
1554 req->oldidx =
1555 _NET_SERVICE_TYPE_COUNT * sizeof(struct netsvctype_dscp_map);
1556 } else if (req->oldlen > 0) {
1557 count = _NET_SERVICE_TYPE_COUNT;
1558 error = get_netsvctype_dscp_map(&count, netsvctype_dscp_map);
1559 if (error != 0)
1560 goto done;
1561 len = count * sizeof(struct netsvctype_dscp_map);
1562 error = SYSCTL_OUT(req, netsvctype_dscp_map,
1563 MIN(len, req->oldlen));
1564 if (error != 0)
1565 goto done;
1566 }
1567
1568 if (req->newptr == USER_ADDR_NULL)
1569 goto done;
1570
1571 error = proc_suser(current_proc());
1572 if (error != 0)
1573 goto done;
1574
1575 /*
1576 * Check input length
1577 */
1578 if (req->newlen > max_netsvctype_to_dscp_map_len) {
1579 error = EINVAL;
1580 goto done;
1581 }
1582 /*
1583 * Cap the number of entries to copy from input buffer
1584 */
1585 error = SYSCTL_IN(req, netsvctype_dscp_map, req->newlen);
1586 if (error != 0)
1587 goto done;
1588
1589 count = req->newlen / sizeof(struct netsvctype_dscp_map);
1590 error = set_netsvctype_dscp_map(count, netsvctype_dscp_map);
1591 done:
1592 return (error);
1593 }
1594
1595 __private_extern__ errno_t
1596 set_packet_qos(struct mbuf *m, struct ifnet *ifp, boolean_t qos_allowed,
1597 int sotc, int netsvctype, u_int8_t *dscp_inout)
1598 {
1599 if (ifp == NULL || dscp_inout == NULL)
1600 return (EINVAL);
1601
1602 if ((ifp->if_eflags &
1603 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) ==
1604 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) {
1605 u_int8_t dscp;
1606
1607 /*
1608 * When on a Fastlane network, IP_TOS/IPV6_TCLASS are no-ops
1609 */
1610 dscp = _DSCP_DF;
1611
1612 /*
1613 * For DSCP use the network service type is specified, otherwise
1614 * use the socket traffic class
1615 *
1616 * When not whitelisted by the policy, set DSCP only for best
1617 * effort and background, and set the mbuf service class to
1618 * best effort as well so the packet will be queued and
1619 * scheduled at a lower priority.
1620 * We still want to prioritize control traffic on the interface
1621 * so we do not change the mbuf service class for SO_TC_CTL
1622 */
1623 if (netsvctype != _NET_SERVICE_TYPE_UNSPEC &&
1624 netsvctype != NET_SERVICE_TYPE_BE) {
1625 dscp = default_net_qos_dscp_map.netsvctype_to_dscp[netsvctype];
1626
1627 if (qos_allowed == FALSE &&
1628 netsvctype != NET_SERVICE_TYPE_BE &&
1629 netsvctype != NET_SERVICE_TYPE_BK) {
1630 dscp = _DSCP_DF;
1631 if (sotc != SO_TC_CTL)
1632 m_set_service_class(m, MBUF_SC_BE);
1633 }
1634 } else {
1635 size_t sotcix = sotc_index(sotc);
1636
1637 dscp = default_net_qos_dscp_map.sotc_to_dscp[sotcix];
1638
1639 if (qos_allowed == FALSE && sotc != SO_TC_BE &&
1640 sotc != SO_TC_BK && sotc != SO_TC_BK_SYS &&
1641 sotc != SO_TC_CTL) {
1642 dscp = _DSCP_DF;
1643 if (sotc != SO_TC_CTL)
1644 m_set_service_class(m, MBUF_SC_BE);
1645 }
1646 }
1647 if (net_qos_verbose != 0)
1648 printf("%s qos_allowed %d sotc %u netsvctype %u dscp %u\n",
1649 __func__, qos_allowed, sotc, netsvctype, dscp);
1650
1651 if (*dscp_inout != dscp) {
1652 *dscp_inout = dscp;
1653 }
1654 } else if (*dscp_inout != _DSCP_DF && IFNET_IS_WIFI_INFRA(ifp)) {
1655 mbuf_svc_class_t msc = m_get_service_class(m);
1656
1657 /*
1658 * For WiFi infra, when the mbuf service class is best effort
1659 * and the DSCP is not default, set the service class based
1660 * on DSCP
1661 */
1662 if (msc == MBUF_SC_BE) {
1663 msc = wifi_dscp_to_msc_array[*dscp_inout];
1664
1665 if (msc != MBUF_SC_BE) {
1666 m_set_service_class(m, msc);
1667
1668 if (net_qos_verbose != 0)
1669 printf("%s set msc %u for dscp %u\n",
1670 __func__, msc, *dscp_inout);
1671 }
1672 }
1673 }
1674
1675 return (0);
1676 }
1677
1678 static void
1679 set_dscp_to_wifi_ac_map(const struct dcsp_msc_map *map, int clear)
1680 {
1681 int i;
1682
1683 if (clear)
1684 bzero(wifi_dscp_to_msc_array, sizeof(wifi_dscp_to_msc_array));
1685
1686 for (i = 0; i < DSCP_ARRAY_SIZE; i++) {
1687 const struct dcsp_msc_map *elem = map + i;
1688
1689 if (elem->dscp > _MAX_DSCP || elem->msc == MBUF_SC_UNSPEC)
1690 break;
1691 switch (elem->msc) {
1692 case MBUF_SC_BK_SYS:
1693 case MBUF_SC_BK:
1694 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_BK;
1695 break;
1696 default:
1697 case MBUF_SC_BE:
1698 case MBUF_SC_RD:
1699 case MBUF_SC_OAM:
1700 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_BE;
1701 break;
1702 case MBUF_SC_AV:
1703 case MBUF_SC_RV:
1704 case MBUF_SC_VI:
1705 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_VI;
1706 break;
1707 case MBUF_SC_VO:
1708 case MBUF_SC_CTL:
1709 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_VO;
1710 break;
1711 }
1712 }
1713 }
1714
1715 static errno_t
1716 dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *netsvctype_dscp_map,
1717 size_t count, struct dcsp_msc_map *dcsp_msc_map)
1718 {
1719 errno_t error = 0;
1720 u_int32_t i;
1721
1722 /*
1723 * Validate input parameters
1724 */
1725 for (i = 0; i < count; i++) {
1726 if (!SO_VALID_TC(netsvctype_dscp_map[i].netsvctype)) {
1727 error = EINVAL;
1728 goto done;
1729 }
1730 if (netsvctype_dscp_map[i].dscp > _MAX_DSCP) {
1731 error = EINVAL;
1732 goto done;
1733 }
1734 }
1735
1736 bzero(dcsp_msc_map, DSCP_ARRAY_SIZE * sizeof(struct dcsp_msc_map));
1737
1738 for (i = 0; i < count; i++) {
1739 dcsp_msc_map[i].dscp = netsvctype_dscp_map[i].dscp;
1740 dcsp_msc_map[i].msc = so_tc2msc(netsvctype_dscp_map[i].netsvctype);
1741 }
1742 done:
1743 return (error);
1744 }
1745
1746 int
1747 sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS
1748 {
1749 #pragma unused(oidp, arg1, arg2)
1750 int error = 0;
1751 size_t len = DSCP_ARRAY_SIZE * sizeof(struct netsvctype_dscp_map);
1752 struct netsvctype_dscp_map netsvctype_dscp_map[DSCP_ARRAY_SIZE];
1753 struct dcsp_msc_map dcsp_msc_map[DSCP_ARRAY_SIZE];
1754 size_t count;
1755 u_int32_t i;
1756
1757 if (req->oldptr == USER_ADDR_NULL) {
1758 req->oldidx = len;
1759 } else if (req->oldlen > 0) {
1760 for (i = 0; i < DSCP_ARRAY_SIZE; i++) {
1761 netsvctype_dscp_map[i].dscp = i;
1762 netsvctype_dscp_map[i].netsvctype =
1763 so_svc2tc(wifi_dscp_to_msc_array[i]);
1764 }
1765 error = SYSCTL_OUT(req, netsvctype_dscp_map,
1766 MIN(len, req->oldlen));
1767 if (error != 0)
1768 goto done;
1769 }
1770
1771 if (req->newptr == USER_ADDR_NULL)
1772 goto done;
1773
1774 error = proc_suser(current_proc());
1775 if (error != 0)
1776 goto done;
1777
1778 /*
1779 * Check input length
1780 */
1781 if (req->newlen > len) {
1782 error = EINVAL;
1783 goto done;
1784 }
1785 /*
1786 * Cap the number of entries to copy from input buffer
1787 */
1788 if (len > req->newlen)
1789 len = req->newlen;
1790 error = SYSCTL_IN(req, netsvctype_dscp_map, len);
1791 if (error != 0) {
1792 goto done;
1793 }
1794 count = len / sizeof(struct netsvctype_dscp_map);
1795 bzero(dcsp_msc_map, sizeof(dcsp_msc_map));
1796 error = dscp_msc_map_from_netsvctype_dscp_map(netsvctype_dscp_map, count,
1797 dcsp_msc_map);
1798 if (error != 0) {
1799 goto done;
1800 }
1801 set_dscp_to_wifi_ac_map(dcsp_msc_map, 0);
1802 done:
1803 return (error);
1804 }
1805
1806 int
1807 sysctl_reset_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS
1808 {
1809 #pragma unused(oidp, arg1, arg2)
1810 int error = 0;
1811 int val = 0;
1812
1813 error = sysctl_handle_int(oidp, &val, 0, req);
1814 if (error || !req->newptr)
1815 return (error);
1816
1817 set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1);
1818
1819 return (0);
1820 }