]> git.saurik.com Git - apple/xnu.git/blob - bsd/netinet/in_tclass.c
xnu-4570.71.2.tar.gz
[apple/xnu.git] / bsd / netinet / in_tclass.c
1 /*
2 * Copyright (c) 2009-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <sys/systm.h>
30 #include <sys/kernel.h>
31 #include <sys/types.h>
32 #include <sys/filedesc.h>
33 #include <sys/file_internal.h>
34 #include <sys/proc.h>
35 #include <sys/socket.h>
36 #include <sys/socketvar.h>
37 #include <sys/errno.h>
38 #include <sys/protosw.h>
39 #include <sys/domain.h>
40 #include <sys/mbuf.h>
41 #include <sys/queue.h>
42 #include <sys/sysctl.h>
43 #include <sys/sysproto.h>
44
45 #include <net/if.h>
46 #include <net/if_var.h>
47 #include <net/route.h>
48
49 #include <netinet/in.h>
50 #include <netinet/in_var.h>
51 #include <netinet/in_pcb.h>
52 #include <netinet/ip.h>
53 #include <netinet/ip_var.h>
54 #include <netinet/ip6.h>
55 #include <netinet6/ip6_var.h>
56 #include <netinet/udp.h>
57 #include <netinet/udp_var.h>
58 #include <netinet/tcp.h>
59 #include <netinet/tcp_var.h>
60 #include <netinet/tcp_cc.h>
61 #include <netinet/lro_ext.h>
62 #include <netinet/in_tclass.h>
63
/*
 * One entry of a DSCP to mbuf service class table: pairs a DSCP code
 * point with the mbuf service class assigned to it.
 */
struct dcsp_msc_map {
	u_int8_t		dscp;	/* DSCP code point */
	mbuf_svc_class_t	msc;	/* mbuf service class for that code point */
};
/* Forward declarations of helpers local to this file */
static inline int so_throttle_best_effort(struct socket *, struct ifnet *);
static void set_dscp_to_wifi_ac_map(const struct dcsp_msc_map *, int);
static errno_t dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *, size_t,
    struct dcsp_msc_map *);

/*
 * Lock group, attributes and mutex for this file; tclass_lock is the
 * handle passed to lck_mtx_lock()/lck_mtx_unlock() below.
 */
static lck_grp_attr_t *tclass_lck_grp_attr = NULL; /* mutex group attributes */
static lck_grp_t *tclass_lck_grp = NULL; /* mutex group definition */
static lck_attr_t *tclass_lck_attr = NULL; /* mutex attributes */
decl_lck_mtx_data(static, tclass_lock_data);
static lck_mtx_t *tclass_lock = &tclass_lock_data;
78
/* "qos" sysctl node registered under _net */
SYSCTL_NODE(_net, OID_AUTO, qos,
	CTLFLAG_RW|CTLFLAG_LOCKED, 0, "QoS");

/* Handler-backed, read-write structure: default network service type to DSCP map */
static int sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_net_qos, OID_AUTO, default_netsvctype_to_dscp_map,
	CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
	0, 0, sysctl_default_netsvctype_to_dscp_map, "S", "");

/* Handler-backed, read-write structure: DSCP to Wi-Fi access category map */
static int sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_net_qos, OID_AUTO, dscp_to_wifi_ac_map,
	CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED,
	0, 0, sysctl_dscp_to_wifi_ac_map, "S", "");

/* Handler-backed integer registered with CTLFLAG_WR only */
static int sysctl_reset_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS;
SYSCTL_PROC(_net_qos, OID_AUTO, reset_dscp_to_wifi_ac_map,
	CTLTYPE_INT | CTLFLAG_WR | CTLFLAG_LOCKED,
	0, 0, sysctl_reset_dscp_to_wifi_ac_map, "I", "");

/* Verbosity knob exported as the "verbose" integer under the qos node */
int net_qos_verbose = 0;
SYSCTL_INT(_net_qos, OID_AUTO, verbose,
	CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_verbose, 0, "");

/*
 * Fastlane QoS policy:
 * By default, allow all apps to get traffic class to DSCP mapping.
 */
SYSCTL_NODE(_net_qos, OID_AUTO, policy,
	CTLFLAG_RW|CTLFLAG_LOCKED, 0, "");

/*
 * When zero, so_set_default_traffic_class() marks new PF_INET/PF_INET6
 * sockets with SOF1_QOSMARKING_ALLOWED.
 */
int net_qos_policy_restricted = 0;
SYSCTL_INT(_net_qos_policy, OID_AUTO, restricted,
	CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_restricted, 0, "");

/* The remaining policy knobs are only exported here; they are consumed elsewhere */
int net_qos_policy_restrict_avapps = 0;
SYSCTL_INT(_net_qos_policy, OID_AUTO, restrict_avapps,
	CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_restrict_avapps, 0, "");

int net_qos_policy_wifi_enabled = 0;
SYSCTL_INT(_net_qos_policy, OID_AUTO, wifi_enabled,
	CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_wifi_enabled, 0, "");

int net_qos_policy_none_wifi_enabled = 0;
SYSCTL_INT(_net_qos_policy, OID_AUTO, none_wifi_enabled,
	CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_none_wifi_enabled, 0, "");

int net_qos_policy_capable_enabled = 0;
SYSCTL_INT(_net_qos_policy, OID_AUTO, capable_enabled,
	CTLFLAG_RW | CTLFLAG_LOCKED, &net_qos_policy_capable_enabled, 0, "");
127
/*
 * Socket traffic class from network service type.
 *
 * Indexed by network service type (so_set_net_service_type() and
 * so_tc_from_control() validate the index with IS_VALID_NET_SERVICE_TYPE()
 * before the lookup); each slot holds the SO_TC_* class used for it.
 * Note that both the SIG and VI service types map to SO_TC_VI.
 */
const int sotc_by_netservicetype[_NET_SERVICE_TYPE_COUNT] = {
	SO_TC_BE,	/* NET_SERVICE_TYPE_BE */
	SO_TC_BK_SYS,	/* NET_SERVICE_TYPE_BK */
	SO_TC_VI,	/* NET_SERVICE_TYPE_SIG */
	SO_TC_VI,	/* NET_SERVICE_TYPE_VI */
	SO_TC_VO,	/* NET_SERVICE_TYPE_VO */
	SO_TC_RV,	/* NET_SERVICE_TYPE_RV */
	SO_TC_AV,	/* NET_SERVICE_TYPE_AV */
	SO_TC_OAM,	/* NET_SERVICE_TYPE_OAM */
	SO_TC_RD	/* NET_SERVICE_TYPE_RD */
};
142
/*
 * DSCP mappings for QoS Fastlane, based on network service types.
 * Each entry pairs a network service type with the DSCP code point
 * used for it.
 */
static const
struct netsvctype_dscp_map fastlane_netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {
	{ NET_SERVICE_TYPE_BE,		_DSCP_DF },
	{ NET_SERVICE_TYPE_BK,		_DSCP_AF11 },
	{ NET_SERVICE_TYPE_SIG,		_DSCP_CS3 },
	{ NET_SERVICE_TYPE_VI,		_DSCP_AF41 },
	{ NET_SERVICE_TYPE_VO,		_DSCP_EF },
	{ NET_SERVICE_TYPE_RV,		_DSCP_CS4 },
	{ NET_SERVICE_TYPE_AV,		_DSCP_AF31 },
	{ NET_SERVICE_TYPE_OAM,		_DSCP_CS2 },
	{ NET_SERVICE_TYPE_RD,		_DSCP_AF21 },
};
158
/* DSCP map state for this file; populated outside the portion shown here */
static struct net_qos_dscp_map default_net_qos_dscp_map;

/*
 * The size is one more than the maximum DSCP value because DSCP values
 * start at zero.
 */
#define DSCP_ARRAY_SIZE (_MAX_DSCP + 1)
165
/*
 * The DSCP to UP mapping (via mbuf service class) for WiFi below is the
 * mapping that is implemented at the 802.11 driver level when the mbuf
 * service class is MBUF_SC_BE.
 *
 * This clashes with the recommended mapping documented by the IETF document
 * draft-szigeti-tsvwg-ieee-802-11e-01.txt but we keep the mapping to maintain
 * binary compatibility. Applications should use the network service type
 * socket option to select L2 QoS marking instead of IP_TOS or IPV6_TCLASS.
 *
 * The table lists every DSCP value from 0 to 63 in ascending order and is
 * terminated by an entry carrying the invalid DSCP value 255.
 */
static const struct dcsp_msc_map default_dscp_to_wifi_ac_map[] = {
	{ _DSCP_DF,	MBUF_SC_BE },	/* RFC 2474 Standard */
	{ 1,		MBUF_SC_BE },	/* */
	{ 2,		MBUF_SC_BE },	/* */
	{ 3,		MBUF_SC_BE },	/* */
	{ 4,		MBUF_SC_BE },	/* */
	{ 5,		MBUF_SC_BE },	/* */
	{ 6,		MBUF_SC_BE },	/* */
	{ 7,		MBUF_SC_BE },	/* */

	{ _DSCP_CS1,	MBUF_SC_BK },	/* RFC 3662 Low-Priority Data */
	{ 9,		MBUF_SC_BK },	/* */
	{ _DSCP_AF11,	MBUF_SC_BK },	/* RFC 2597 High-Throughput Data */
	{ 11,		MBUF_SC_BK },	/* */
	{ _DSCP_AF12,	MBUF_SC_BK },	/* RFC 2597 High-Throughput Data */
	{ 13,		MBUF_SC_BK },	/* */
	{ _DSCP_AF13,	MBUF_SC_BK },	/* RFC 2597 High-Throughput Data */
	{ 15,		MBUF_SC_BK },	/* */

	{ _DSCP_CS2,	MBUF_SC_BK },	/* RFC 4594 OAM */
	{ 17,		MBUF_SC_BK },	/* */
	{ _DSCP_AF21,	MBUF_SC_BK },	/* RFC 2597 Low-Latency Data */
	{ 19,		MBUF_SC_BK },	/* */
	{ _DSCP_AF22,	MBUF_SC_BK },	/* RFC 2597 Low-Latency Data */
	{ 21,		MBUF_SC_BK },	/* */
	{ _DSCP_AF23,	MBUF_SC_BK },	/* RFC 2597 Low-Latency Data */
	{ 23,		MBUF_SC_BK },	/* */

	{ _DSCP_CS3,	MBUF_SC_BE },	/* RFC 2474 Broadcast Video */
	{ 25,		MBUF_SC_BE },	/* */
	{ _DSCP_AF31,	MBUF_SC_BE },	/* RFC 2597 Multimedia Streaming */
	{ 27,		MBUF_SC_BE },	/* */
	{ _DSCP_AF32,	MBUF_SC_BE },	/* RFC 2597 Multimedia Streaming */
	{ 29,		MBUF_SC_BE },	/* */
	{ _DSCP_AF33,	MBUF_SC_BE },	/* RFC 2597 Multimedia Streaming */
	{ 31,		MBUF_SC_BE },	/* */

	{ _DSCP_CS4,	MBUF_SC_VI },	/* RFC 2474 Real-Time Interactive */
	{ 33,		MBUF_SC_VI },	/* */
	{ _DSCP_AF41,	MBUF_SC_VI },	/* RFC 2597 Multimedia Conferencing */
	{ 35,		MBUF_SC_VI },	/* */
	{ _DSCP_AF42,	MBUF_SC_VI },	/* RFC 2597 Multimedia Conferencing */
	{ 37,		MBUF_SC_VI },	/* */
	{ _DSCP_AF43,	MBUF_SC_VI },	/* RFC 2597 Multimedia Conferencing */
	{ 39,		MBUF_SC_VI },	/* */

	{ _DSCP_CS5,	MBUF_SC_VI },	/* RFC 2474 Signaling */
	{ 41,		MBUF_SC_VI },	/* */
	{ 42,		MBUF_SC_VI },	/* */
	{ 43,		MBUF_SC_VI },	/* */
	{ _DSCP_VA,	MBUF_SC_VI },	/* RFC 5865 VOICE-ADMIT */
	{ 45,		MBUF_SC_VI },	/* */
	{ _DSCP_EF,	MBUF_SC_VI },	/* RFC 3246 Telephony */
	{ 47,		MBUF_SC_VI },	/* */

	{ _DSCP_CS6,	MBUF_SC_VO },	/* Wi-Fi WMM Certification: Chariot */
	{ 49,		MBUF_SC_VO },	/* */
	{ 50,		MBUF_SC_VO },	/* */
	{ 51,		MBUF_SC_VO },	/* */
	{ 52,		MBUF_SC_VO },	/* Wi-Fi WMM Certification: Sigma */
	{ 53,		MBUF_SC_VO },	/* */
	{ 54,		MBUF_SC_VO },	/* */
	{ 55,		MBUF_SC_VO },	/* */

	{ _DSCP_CS7,	MBUF_SC_VO },	/* Wi-Fi WMM Certification: Chariot */
	{ 57,		MBUF_SC_VO },	/* */
	{ 58,		MBUF_SC_VO },	/* */
	{ 59,		MBUF_SC_VO },	/* */
	{ 60,		MBUF_SC_VO },	/* */
	{ 61,		MBUF_SC_VO },	/* */
	{ 62,		MBUF_SC_VO },	/* */
	{ 63,		MBUF_SC_VO },	/* */

	{ 255,		MBUF_SC_UNSPEC } /* invalid DSCP to mark last entry */
};
251
/* One mbuf service class slot per DSCP value (DSCP_ARRAY_SIZE entries) */
mbuf_svc_class_t wifi_dscp_to_msc_array[DSCP_ARRAY_SIZE];

/*
 * If there is no foreground activity on the interface for
 * TCP_BG_SWITCH_TIME seconds, the background connections can switch to
 * foreground TCP congestion control.
 */
#define TCP_BG_SWITCH_TIME 2 /* seconds */
260
261 #if (DEVELOPMENT || DEBUG)
262
extern char *proc_best_name(proc_t p);

/* Number of entries currently linked on tfp_head */
static int tfp_count = 0;

/* List of per-process overrides; PID entries sit ahead of name entries */
static TAILQ_HEAD(, tclass_for_proc) tfp_head =
    TAILQ_HEAD_INITIALIZER(tfp_head);

/*
 * Traffic class / QoS marking override for a process, selected either by
 * PID or by process name.
 */
struct tclass_for_proc {
	TAILQ_ENTRY(tclass_for_proc)	tfp_link;	/* linkage on tfp_head */
	int		tfp_class;	/* SO_TC_UNSPEC leaves the socket's class alone */
	pid_t		tfp_pid;	/* -1 for an entry keyed by tfp_pname */
	char		tfp_pname[(2 * MAXCOMLEN) + 1];	/* name key, used when tfp_pid is -1 */
	u_int32_t	tfp_qos_mode;	/* QOS_MODE_MARKING_POLICY_* value */
};

static int get_pid_tclass(struct so_tcdbg *);
static int get_pname_tclass(struct so_tcdbg *);
static int set_pid_tclass(struct so_tcdbg *);
static int set_pname_tclass(struct so_tcdbg *);
static int flush_pid_tclass(struct so_tcdbg *);
static int purge_tclass_for_proc(void);
static int flush_tclass_for_proc(void);
static void set_tclass_for_curr_proc(struct socket *);
286
287 /*
288 * Must be called with tclass_lock held
289 */
290 static struct tclass_for_proc *
291 find_tfp_by_pid(pid_t pid)
292 {
293 struct tclass_for_proc *tfp;
294
295 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
296 if (tfp->tfp_pid == pid)
297 break;
298 }
299 return (tfp);
300 }
301
302 /*
303 * Must be called with tclass_lock held
304 */
305 static struct tclass_for_proc *
306 find_tfp_by_pname(const char *pname)
307 {
308 struct tclass_for_proc *tfp;
309
310 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
311 if (strncmp(pname, tfp->tfp_pname,
312 sizeof (tfp->tfp_pname)) == 0)
313 break;
314 }
315 return (tfp);
316 }
317
318 __private_extern__ void
319 set_tclass_for_curr_proc(struct socket *so)
320 {
321 struct tclass_for_proc *tfp = NULL;
322 proc_t p = current_proc(); /* Not ref counted */
323 pid_t pid = proc_pid(p);
324 char *pname = proc_best_name(p);
325
326 lck_mtx_lock(tclass_lock);
327
328 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
329 if ((tfp->tfp_pid == pid) || (tfp->tfp_pid == -1 &&
330 strncmp(pname, tfp->tfp_pname,
331 sizeof (tfp->tfp_pname)) == 0)) {
332 if (tfp->tfp_class != SO_TC_UNSPEC)
333 so->so_traffic_class = tfp->tfp_class;
334
335 if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_ENABLE)
336 so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
337 else if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_DISABLE)
338 so->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
339 break;
340 }
341 }
342
343 lck_mtx_unlock(tclass_lock);
344 }
345
346 /*
347 * Purge entries with PIDs of exited processes
348 */
349 int
350 purge_tclass_for_proc(void)
351 {
352 int error = 0;
353 struct tclass_for_proc *tfp, *tvar;
354
355 lck_mtx_lock(tclass_lock);
356
357 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
358 proc_t p;
359
360 if (tfp->tfp_pid == -1)
361 continue;
362 if ((p = proc_find(tfp->tfp_pid)) == NULL) {
363 tfp_count--;
364 TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
365
366 _FREE(tfp, M_TEMP);
367 } else {
368 proc_rele(p);
369 }
370 }
371
372 lck_mtx_unlock(tclass_lock);
373
374 return (error);
375 }
376
377 /*
378 * Remove one entry
379 * Must be called with tclass_lock held
380 */
381 static void
382 free_tclass_for_proc(struct tclass_for_proc *tfp)
383 {
384 if (tfp == NULL)
385 return;
386 tfp_count--;
387 TAILQ_REMOVE(&tfp_head, tfp, tfp_link);
388 _FREE(tfp, M_TEMP);
389 }
390
391 /*
392 * Remove all entries
393 */
394 int
395 flush_tclass_for_proc(void)
396 {
397 int error = 0;
398 struct tclass_for_proc *tfp, *tvar;
399
400 lck_mtx_lock(tclass_lock);
401
402 TAILQ_FOREACH_SAFE(tfp, &tfp_head, tfp_link, tvar) {
403 free_tclass_for_proc(tfp);
404 }
405
406 lck_mtx_unlock(tclass_lock);
407
408 return (error);
409
410 }
411
412 /*
413 * Must be called with tclass_lock held
414 */
415 static struct tclass_for_proc *
416 alloc_tclass_for_proc(pid_t pid, const char *pname)
417 {
418 struct tclass_for_proc *tfp;
419
420 if (pid == -1 && pname == NULL)
421 return (NULL);
422
423 tfp = _MALLOC(sizeof (struct tclass_for_proc), M_TEMP, M_NOWAIT|M_ZERO);
424 if (tfp == NULL)
425 return (NULL);
426
427 tfp->tfp_pid = pid;
428 /*
429 * Add per pid entries before per proc name so we can find
430 * a specific instance of a process before the general name base entry.
431 */
432 if (pid != -1) {
433 TAILQ_INSERT_HEAD(&tfp_head, tfp, tfp_link);
434 } else {
435 strlcpy(tfp->tfp_pname, pname, sizeof (tfp->tfp_pname));
436 TAILQ_INSERT_TAIL(&tfp_head, tfp, tfp_link);
437 }
438
439 tfp_count++;
440
441 return (tfp);
442 }
443
444 /*
445 * SO_TC_UNSPEC for tclass means to remove the entry
446 */
447 int
448 set_pid_tclass(struct so_tcdbg *so_tcdbg)
449 {
450 int error = EINVAL;
451 proc_t p = NULL;
452 struct filedesc *fdp;
453 struct fileproc *fp;
454 struct tclass_for_proc *tfp;
455 int i;
456 pid_t pid = so_tcdbg->so_tcdbg_pid;
457 int tclass = so_tcdbg->so_tcdbg_tclass;
458 int netsvctype = so_tcdbg->so_tcdbg_netsvctype;
459
460 p = proc_find(pid);
461 if (p == NULL) {
462 printf("%s proc_find(%d) failed\n", __func__, pid);
463 goto done;
464 }
465
466 /* Need a tfp */
467 lck_mtx_lock(tclass_lock);
468
469 tfp = find_tfp_by_pid(pid);
470 if (tfp == NULL) {
471 tfp = alloc_tclass_for_proc(pid, NULL);
472 if (tfp == NULL) {
473 lck_mtx_unlock(tclass_lock);
474 error = ENOBUFS;
475 goto done;
476 }
477 }
478 tfp->tfp_class = tclass;
479 tfp->tfp_qos_mode = so_tcdbg->so_tcbbg_qos_mode;
480
481 lck_mtx_unlock(tclass_lock);
482
483 if (tfp != NULL) {
484 proc_fdlock(p);
485
486 fdp = p->p_fd;
487 for (i = 0; i < fdp->fd_nfiles; i++) {
488 struct socket *so;
489
490 fp = fdp->fd_ofiles[i];
491 if (fp == NULL ||
492 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
493 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
494 continue;
495
496 so = (struct socket *)fp->f_fglob->fg_data;
497 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6)
498 continue;
499
500 socket_lock(so, 1);
501 if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_ENABLE)
502 so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
503 else if (tfp->tfp_qos_mode == QOS_MODE_MARKING_POLICY_DISABLE)
504 so->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
505 socket_unlock(so, 1);
506
507 if (netsvctype != _NET_SERVICE_TYPE_UNSPEC)
508 error = sock_setsockopt(so, SOL_SOCKET,
509 SO_NET_SERVICE_TYPE, &netsvctype, sizeof(int));
510 if (tclass != SO_TC_UNSPEC)
511 error = sock_setsockopt(so, SOL_SOCKET,
512 SO_TRAFFIC_CLASS, &tclass, sizeof(int));
513
514 }
515
516 proc_fdunlock(p);
517 }
518
519 error = 0;
520 done:
521 if (p != NULL)
522 proc_rele(p);
523
524 return (error);
525 }
526
527 int
528 set_pname_tclass(struct so_tcdbg *so_tcdbg)
529 {
530 int error = EINVAL;
531 struct tclass_for_proc *tfp;
532
533 lck_mtx_lock(tclass_lock);
534
535 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
536 if (tfp == NULL) {
537 tfp = alloc_tclass_for_proc(-1, so_tcdbg->so_tcdbg_pname);
538 if (tfp == NULL) {
539 lck_mtx_unlock(tclass_lock);
540 error = ENOBUFS;
541 goto done;
542 }
543 }
544 tfp->tfp_class = so_tcdbg->so_tcdbg_tclass;
545 tfp->tfp_qos_mode = so_tcdbg->so_tcbbg_qos_mode;
546
547 lck_mtx_unlock(tclass_lock);
548
549 error = 0;
550 done:
551
552 return (error);
553 }
554
555 static int
556 flush_pid_tclass(struct so_tcdbg *so_tcdbg)
557 {
558 pid_t pid = so_tcdbg->so_tcdbg_pid;
559 int tclass = so_tcdbg->so_tcdbg_tclass;
560 struct filedesc *fdp;
561 int error = EINVAL;
562 proc_t p;
563 int i;
564
565 p = proc_find(pid);
566 if (p == PROC_NULL) {
567 printf("%s proc_find(%d) failed\n", __func__, pid);
568 goto done;
569 }
570
571 proc_fdlock(p);
572 fdp = p->p_fd;
573 for (i = 0; i < fdp->fd_nfiles; i++) {
574 struct socket *so;
575 struct fileproc *fp;
576
577 fp = fdp->fd_ofiles[i];
578 if (fp == NULL ||
579 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
580 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
581 continue;
582
583 so = (struct socket *)fp->f_fglob->fg_data;
584 error = sock_setsockopt(so, SOL_SOCKET, SO_FLUSH, &tclass,
585 sizeof (tclass));
586 if (error != 0) {
587 printf("%s: setsockopt(SO_FLUSH) (so=0x%llx, fd=%d, "
588 "tclass=%d) failed %d\n", __func__,
589 (uint64_t)VM_KERNEL_ADDRPERM(so), i, tclass,
590 error);
591 error = 0;
592 }
593 }
594 proc_fdunlock(p);
595
596 error = 0;
597 done:
598 if (p != PROC_NULL)
599 proc_rele(p);
600
601 return (error);
602 }
603
604 int
605 get_pid_tclass(struct so_tcdbg *so_tcdbg)
606 {
607 int error = EINVAL;
608 proc_t p = NULL;
609 struct tclass_for_proc *tfp;
610 pid_t pid = so_tcdbg->so_tcdbg_pid;
611
612 so_tcdbg->so_tcdbg_tclass = SO_TC_UNSPEC; /* Means not set */
613
614 p = proc_find(pid);
615 if (p == NULL) {
616 printf("%s proc_find(%d) failed\n", __func__, pid);
617 goto done;
618 }
619
620 /* Need a tfp */
621 lck_mtx_lock(tclass_lock);
622
623 tfp = find_tfp_by_pid(pid);
624 if (tfp != NULL) {
625 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
626 so_tcdbg->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
627 error = 0;
628 }
629 lck_mtx_unlock(tclass_lock);
630 done:
631 if (p != NULL)
632 proc_rele(p);
633
634 return (error);
635 }
636
637 int
638 get_pname_tclass(struct so_tcdbg *so_tcdbg)
639 {
640 int error = EINVAL;
641 struct tclass_for_proc *tfp;
642
643 so_tcdbg->so_tcdbg_tclass = SO_TC_UNSPEC; /* Means not set */
644
645 /* Need a tfp */
646 lck_mtx_lock(tclass_lock);
647
648 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
649 if (tfp != NULL) {
650 so_tcdbg->so_tcdbg_tclass = tfp->tfp_class;
651 so_tcdbg->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
652 error = 0;
653 }
654 lck_mtx_unlock(tclass_lock);
655
656 return (error);
657 }
658
659 static int
660 delete_tclass_for_pid_pname(struct so_tcdbg *so_tcdbg)
661 {
662 int error = EINVAL;
663 pid_t pid = so_tcdbg->so_tcdbg_pid;
664 struct tclass_for_proc *tfp = NULL;
665
666 lck_mtx_lock(tclass_lock);
667
668 if (pid != -1)
669 tfp = find_tfp_by_pid(pid);
670 else
671 tfp = find_tfp_by_pname(so_tcdbg->so_tcdbg_pname);
672
673 if (tfp != NULL) {
674 free_tclass_for_proc(tfp);
675 error = 0;
676 }
677
678 lck_mtx_unlock(tclass_lock);
679
680 return (error);
681 }
682
683 /*
684 * Setting options requires privileges
685 */
686 __private_extern__ int
687 so_set_tcdbg(struct socket *so, struct so_tcdbg *so_tcdbg)
688 {
689 int error = 0;
690
691 if ((so->so_state & SS_PRIV) == 0)
692 return (EPERM);
693
694 socket_unlock(so, 0);
695
696 switch (so_tcdbg->so_tcdbg_cmd) {
697 case SO_TCDBG_PID:
698 error = set_pid_tclass(so_tcdbg);
699 break;
700
701 case SO_TCDBG_PNAME:
702 error = set_pname_tclass(so_tcdbg);
703 break;
704
705 case SO_TCDBG_PURGE:
706 error = purge_tclass_for_proc();
707 break;
708
709 case SO_TCDBG_FLUSH:
710 error = flush_tclass_for_proc();
711 break;
712
713 case SO_TCDBG_DELETE:
714 error = delete_tclass_for_pid_pname(so_tcdbg);
715 break;
716
717 case SO_TCDBG_TCFLUSH_PID:
718 error = flush_pid_tclass(so_tcdbg);
719 break;
720
721 default:
722 error = EINVAL;
723 break;
724 }
725
726 socket_lock(so, 0);
727
728 return (error);
729 }
730
731 /*
732 * Not required to be privileged to get
733 */
734 __private_extern__ int
735 sogetopt_tcdbg(struct socket *so, struct sockopt *sopt)
736 {
737 int error = 0;
738 struct so_tcdbg so_tcdbg;
739 void *buf = NULL;
740 size_t len = sopt->sopt_valsize;
741
742 error = sooptcopyin(sopt, &so_tcdbg, sizeof (struct so_tcdbg),
743 sizeof (struct so_tcdbg));
744 if (error != 0)
745 return (error);
746
747 sopt->sopt_valsize = len;
748
749 socket_unlock(so, 0);
750
751 switch (so_tcdbg.so_tcdbg_cmd) {
752 case SO_TCDBG_PID:
753 error = get_pid_tclass(&so_tcdbg);
754 break;
755
756 case SO_TCDBG_PNAME:
757 error = get_pname_tclass(&so_tcdbg);
758 break;
759
760 case SO_TCDBG_COUNT:
761 lck_mtx_lock(tclass_lock);
762 so_tcdbg.so_tcdbg_count = tfp_count;
763 lck_mtx_unlock(tclass_lock);
764 break;
765
766 case SO_TCDBG_LIST: {
767 struct tclass_for_proc *tfp;
768 int n, alloc_count;
769 struct so_tcdbg *ptr;
770
771 lck_mtx_lock(tclass_lock);
772 if ((alloc_count = tfp_count) == 0) {
773 lck_mtx_unlock(tclass_lock);
774 error = EINVAL;
775 break;
776 }
777 len = alloc_count * sizeof (struct so_tcdbg);
778 lck_mtx_unlock(tclass_lock);
779
780 buf = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
781 if (buf == NULL) {
782 error = ENOBUFS;
783 break;
784 }
785
786 lck_mtx_lock(tclass_lock);
787 n = 0;
788 ptr = (struct so_tcdbg *)buf;
789 TAILQ_FOREACH(tfp, &tfp_head, tfp_link) {
790 if (++n > alloc_count)
791 break;
792 if (tfp->tfp_pid != -1) {
793 ptr->so_tcdbg_cmd = SO_TCDBG_PID;
794 ptr->so_tcdbg_pid = tfp->tfp_pid;
795 } else {
796 ptr->so_tcdbg_cmd = SO_TCDBG_PNAME;
797 ptr->so_tcdbg_pid = -1;
798 strlcpy(ptr->so_tcdbg_pname,
799 tfp->tfp_pname,
800 sizeof (ptr->so_tcdbg_pname));
801 }
802 ptr->so_tcdbg_tclass = tfp->tfp_class;
803 ptr->so_tcbbg_qos_mode = tfp->tfp_qos_mode;
804 ptr++;
805 }
806
807 lck_mtx_unlock(tclass_lock);
808 }
809 break;
810
811 default:
812 error = EINVAL;
813 break;
814 }
815
816 socket_lock(so, 0);
817
818 if (error == 0) {
819 if (buf == NULL) {
820 error = sooptcopyout(sopt, &so_tcdbg,
821 sizeof (struct so_tcdbg));
822 } else {
823 error = sooptcopyout(sopt, buf, len);
824 _FREE(buf, M_TEMP);
825 }
826 }
827 return (error);
828 }
829
830 #endif /* (DEVELOPMENT || DEBUG) */
831
832 int
833 so_get_netsvc_marking_level(struct socket *so)
834 {
835 int marking_level = NETSVC_MRKNG_UNKNOWN;
836 struct ifnet *ifp = NULL;
837
838 switch (SOCK_DOM(so)) {
839 case PF_INET: {
840 struct inpcb *inp = sotoinpcb(so);
841
842 if (inp != NULL)
843 ifp = inp->inp_last_outifp;
844 break;
845 }
846 case PF_INET6: {
847 struct in6pcb *in6p = sotoin6pcb(so);
848
849 if (in6p != NULL)
850 ifp = in6p->in6p_last_outifp;
851 break;
852 }
853 default:
854 break;
855 }
856 if (ifp != NULL) {
857 if ((ifp->if_eflags &
858 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) ==
859 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) {
860 if ((so->so_flags1 & SOF1_QOSMARKING_ALLOWED))
861 marking_level = NETSVC_MRKNG_LVL_L3L2_ALL;
862 else
863 marking_level = NETSVC_MRKNG_LVL_L3L2_BK;
864 } else {
865 marking_level = NETSVC_MRKNG_LVL_L2;
866 }
867 }
868 return (marking_level);
869 }
870
871 __private_extern__ int
872 so_set_traffic_class(struct socket *so, int optval)
873 {
874 int error = 0;
875
876 if (optval < SO_TC_BE || optval > SO_TC_CTL) {
877 error = EINVAL;
878 } else {
879 switch (optval) {
880 case _SO_TC_BK:
881 optval = SO_TC_BK;
882 break;
883 case _SO_TC_VI:
884 optval = SO_TC_VI;
885 break;
886 case _SO_TC_VO:
887 optval = SO_TC_VO;
888 break;
889 default:
890 if (!SO_VALID_TC(optval))
891 error = EINVAL;
892 break;
893 }
894
895 if (error == 0) {
896 int oldval = so->so_traffic_class;
897
898 VERIFY(SO_VALID_TC(optval));
899 so->so_traffic_class = optval;
900
901 if ((SOCK_DOM(so) == PF_INET ||
902 SOCK_DOM(so) == PF_INET6) &&
903 SOCK_TYPE(so) == SOCK_STREAM)
904 set_tcp_stream_priority(so);
905
906 if ((SOCK_DOM(so) == PF_INET ||
907 SOCK_DOM(so) == PF_INET6) &&
908 optval != oldval && (optval == SO_TC_BK_SYS ||
909 oldval == SO_TC_BK_SYS)) {
910 /*
911 * If the app switches from BK_SYS to something
912 * else, resume the socket if it was suspended.
913 */
914 if (oldval == SO_TC_BK_SYS)
915 inp_reset_fc_state(so->so_pcb);
916
917 SOTHROTTLELOG("throttle[%d]: so 0x%llx "
918 "[%d,%d] opportunistic %s\n", so->last_pid,
919 (uint64_t)VM_KERNEL_ADDRPERM(so),
920 SOCK_DOM(so), SOCK_TYPE(so),
921 (optval == SO_TC_BK_SYS) ? "ON" : "OFF");
922 }
923 }
924 }
925 return (error);
926 }
927
928 __private_extern__ int
929 so_set_net_service_type(struct socket *so, int netsvctype)
930 {
931 int sotc;
932 int error;
933
934 if (!IS_VALID_NET_SERVICE_TYPE(netsvctype))
935 return (EINVAL);
936
937 sotc = sotc_by_netservicetype[netsvctype];
938 error = so_set_traffic_class(so, sotc);
939 if (error != 0)
940 return (error);
941 so->so_netsvctype = netsvctype;
942 so->so_flags1 |= SOF1_TC_NET_SERV_TYPE;
943
944 return (0);
945 }
946
947 __private_extern__ void
948 so_set_default_traffic_class(struct socket *so)
949 {
950 so->so_traffic_class = SO_TC_BE;
951
952 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)) {
953 if (net_qos_policy_restricted == 0)
954 so->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
955 #if (DEVELOPMENT || DEBUG)
956 if (tfp_count > 0)
957 set_tclass_for_curr_proc(so);
958 #endif /* (DEVELOPMENT || DEBUG) */
959 }
960 }
961
962 __private_extern__ int
963 so_set_opportunistic(struct socket *so, int optval)
964 {
965 return (so_set_traffic_class(so, (optval == 0) ?
966 SO_TC_BE : SO_TC_BK_SYS));
967 }
968
969 __private_extern__ int
970 so_get_opportunistic(struct socket *so)
971 {
972 return (so->so_traffic_class == SO_TC_BK_SYS);
973 }
974
975 __private_extern__ int
976 so_tc_from_control(struct mbuf *control, int *out_netsvctype)
977 {
978 struct cmsghdr *cm;
979 int sotc = SO_TC_UNSPEC;
980
981 *out_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
982
983 for (cm = M_FIRST_CMSGHDR(control); cm != NULL;
984 cm = M_NXT_CMSGHDR(control, cm)) {
985 int val;
986
987 if (cm->cmsg_len < sizeof (struct cmsghdr))
988 break;
989 if (cm->cmsg_level != SOL_SOCKET ||
990 cm->cmsg_len != CMSG_LEN(sizeof(int)))
991 continue;
992 val = *(int *)(void *)CMSG_DATA(cm);
993 /*
994 * The first valid option wins
995 */
996 switch (cm->cmsg_type) {
997 case SO_TRAFFIC_CLASS:
998 if (SO_VALID_TC(val)) {
999 sotc = val;
1000 return (sotc);
1001 /* NOT REACHED */
1002 } else if (val < SO_TC_NET_SERVICE_OFFSET) {
1003 break;
1004 }
1005 /*
1006 * Handle the case SO_NET_SERVICE_TYPE values are
1007 * passed using SO_TRAFFIC_CLASS
1008 */
1009 val = val - SO_TC_NET_SERVICE_OFFSET;
1010 /* FALLTHROUGH */
1011 case SO_NET_SERVICE_TYPE:
1012 if (!IS_VALID_NET_SERVICE_TYPE(val))
1013 break;
1014 *out_netsvctype = val;
1015 sotc = sotc_by_netservicetype[val];
1016 return (sotc);
1017 /* NOT REACHED */
1018 default:
1019 break;
1020 }
1021 }
1022
1023 return (sotc);
1024 }
1025
1026 __private_extern__ void
1027 so_recv_data_stat(struct socket *so, struct mbuf *m, size_t off)
1028 {
1029 uint32_t mtc = m_get_traffic_class(m);
1030
1031 if (mtc >= SO_TC_STATS_MAX)
1032 mtc = MBUF_TC_BE;
1033
1034 so->so_tc_stats[mtc].rxpackets += 1;
1035 so->so_tc_stats[mtc].rxbytes +=
1036 ((m->m_flags & M_PKTHDR) ? m->m_pkthdr.len : 0) + off;
1037 }
1038
1039 __private_extern__ void
1040 so_inc_recv_data_stat(struct socket *so, size_t pkts, size_t bytes,
1041 uint32_t mtc)
1042 {
1043 if (mtc >= SO_TC_STATS_MAX)
1044 mtc = MBUF_TC_BE;
1045
1046 so->so_tc_stats[mtc].rxpackets += pkts;
1047 so->so_tc_stats[mtc].rxbytes += bytes;
1048 }
1049
1050 static inline int
1051 so_throttle_best_effort(struct socket *so, struct ifnet *ifp)
1052 {
1053 u_int32_t uptime = net_uptime();
1054 return (soissrcbesteffort(so) &&
1055 net_io_policy_throttle_best_effort == 1 &&
1056 ifp->if_rt_sendts > 0 &&
1057 (int)(uptime - ifp->if_rt_sendts) <= TCP_BG_SWITCH_TIME);
1058 }
1059
1060 __private_extern__ void
1061 set_tcp_stream_priority(struct socket *so)
1062 {
1063 struct inpcb *inp = sotoinpcb(so);
1064 struct tcpcb *tp = intotcpcb(inp);
1065 struct ifnet *outifp;
1066 u_char old_cc = tp->tcp_cc_index;
1067 int recvbg = IS_TCP_RECV_BG(so);
1068 bool is_local = false, fg_active = false;
1069 u_int32_t uptime;
1070
1071 VERIFY((SOCK_CHECK_DOM(so, PF_INET) ||
1072 SOCK_CHECK_DOM(so, PF_INET6)) &&
1073 SOCK_CHECK_TYPE(so, SOCK_STREAM) &&
1074 SOCK_CHECK_PROTO(so, IPPROTO_TCP));
1075
1076 /* Return if the socket is in a terminal state */
1077 if (inp->inp_state == INPCB_STATE_DEAD)
1078 return;
1079
1080 outifp = inp->inp_last_outifp;
1081 uptime = net_uptime();
1082
1083 /*
1084 * If the socket was marked as a background socket or if the
1085 * traffic class is set to background with traffic class socket
1086 * option then make both send and recv side of the stream to be
1087 * background. The variable sotcdb which can be set with sysctl
1088 * is used to disable these settings for testing.
1089 */
1090 if (outifp == NULL || (outifp->if_flags & IFF_LOOPBACK))
1091 is_local = true;
1092
1093 /* Check if there has been recent foreground activity */
1094 if (outifp != NULL) {
1095 /*
1096 * If the traffic source is background, check if
1097 * if it can be switched to foreground. This can
1098 * happen when there is no indication of foreground
1099 * activity.
1100 */
1101 if (soissrcbackground(so) && outifp->if_fg_sendts > 0 &&
1102 (int)(uptime - outifp->if_fg_sendts) <= TCP_BG_SWITCH_TIME)
1103 fg_active = true;
1104
1105 /*
1106 * The traffic source is best-effort -- check if
1107 * the policy to throttle best effort is enabled
1108 * and there was realtime activity on this
1109 * interface recently. If this is true, enable
1110 * algorithms that respond to increased latency
1111 * on best-effort traffic.
1112 */
1113 if (so_throttle_best_effort(so, outifp))
1114 fg_active = true;
1115 }
1116
1117 /*
1118 * System initiated background traffic like cloud uploads should
1119 * always use background delay sensitive algorithms. This will
1120 * make the stream more responsive to other streams on the user's
1121 * network and it will minimize latency induced.
1122 */
1123 if (fg_active || IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
1124 /*
1125 * If the interface that the connection is using is
1126 * loopback, do not use background congestion
1127 * control algorithm.
1128 *
1129 * If there has been recent foreground activity or if
1130 * there was an indication that a foreground application
1131 * is going to use networking (net_io_policy_throttled),
1132 * switch the backgroung streams to use background
1133 * congestion control algorithm. Otherwise, even background
1134 * flows can move into foreground.
1135 */
1136 if ((sotcdb & SOTCDB_NO_SENDTCPBG) != 0 || is_local ||
1137 !IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
1138 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
1139 tcp_set_foreground_cc(so);
1140 } else {
1141 if (old_cc != TCP_CC_ALGO_BACKGROUND_INDEX)
1142 tcp_set_background_cc(so);
1143 }
1144
1145 /* Set receive side background flags */
1146 if ((sotcdb & SOTCDB_NO_RECVTCPBG) != 0 || is_local ||
1147 !IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class)) {
1148 tcp_clear_recv_bg(so);
1149 } else {
1150 tcp_set_recv_bg(so);
1151 }
1152 } else {
1153 tcp_clear_recv_bg(so);
1154 if (old_cc == TCP_CC_ALGO_BACKGROUND_INDEX)
1155 tcp_set_foreground_cc(so);
1156 }
1157
1158 if (old_cc != tp->tcp_cc_index || recvbg != IS_TCP_RECV_BG(so)) {
1159 SOTHROTTLELOG("throttle[%d]: so 0x%llx [%d,%d] TCP %s send; "
1160 "%s recv\n", so->last_pid,
1161 (uint64_t)VM_KERNEL_ADDRPERM(so),
1162 SOCK_DOM(so), SOCK_TYPE(so),
1163 (tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) ?
1164 "background" : "foreground",
1165 IS_TCP_RECV_BG(so) ? "background" : "foreground");
1166 }
1167 }
1168
1169 /*
1170 * Set traffic class to an IPv4 or IPv6 packet
1171 * - mark the mbuf
1172 * - set the DSCP code following the WMM mapping
1173 */
1174 __private_extern__ void
1175 set_packet_service_class(struct mbuf *m, struct socket *so,
1176 int sotc, u_int32_t flags)
1177 {
1178 mbuf_svc_class_t msc = MBUF_SC_BE; /* Best effort by default */
1179 struct inpcb *inp = sotoinpcb(so); /* in6pcb and inpcb are the same */
1180
1181 if (!(m->m_flags & M_PKTHDR))
1182 return;
1183
1184 /*
1185 * Here is the precedence:
1186 * 1) TRAFFIC_MGT_SO_BACKGROUND trumps all
1187 * 2) Traffic class passed via ancillary data to sendmsdg(2)
1188 * 3) Traffic class socket option last
1189 */
1190 if (sotc != SO_TC_UNSPEC) {
1191 VERIFY(SO_VALID_TC(sotc));
1192 msc = so_tc2msc(sotc);
1193 /* Assert because tc must have been valid */
1194 VERIFY(MBUF_VALID_SC(msc));
1195 }
1196
1197 /*
1198 * If TRAFFIC_MGT_SO_BACKGROUND is set or policy to throttle
1199 * best effort is set, depress the priority.
1200 */
1201 if (!IS_MBUF_SC_BACKGROUND(msc) && soisthrottled(so))
1202 msc = MBUF_SC_BK;
1203
1204 if (IS_MBUF_SC_BESTEFFORT(msc) && inp->inp_last_outifp != NULL &&
1205 so_throttle_best_effort(so, inp->inp_last_outifp))
1206 msc = MBUF_SC_BK;
1207
1208 if (soissrcbackground(so))
1209 m->m_pkthdr.pkt_flags |= PKTF_SO_BACKGROUND;
1210
1211 if (soissrcrealtime(so) || IS_MBUF_SC_REALTIME(msc))
1212 m->m_pkthdr.pkt_flags |= PKTF_SO_REALTIME;
1213 /*
1214 * Set the traffic class in the mbuf packet header svc field
1215 */
1216 if (sotcdb & SOTCDB_NO_MTC)
1217 goto no_mbtc;
1218
1219 /*
1220 * Elevate service class if the packet is a pure TCP ACK.
1221 * We can do this only when the flow is not a background
1222 * flow and the outgoing interface supports
1223 * transmit-start model.
1224 */
1225 if (!IS_MBUF_SC_BACKGROUND(msc) &&
1226 (flags & (PKT_SCF_TCP_ACK | PKT_SCF_TCP_SYN)) != 0)
1227 msc = MBUF_SC_CTL;
1228
1229 (void) m_set_service_class(m, msc);
1230
1231 /*
1232 * Set the privileged traffic auxiliary flag if applicable,
1233 * or clear it.
1234 */
1235 if (!(sotcdb & SOTCDB_NO_PRIVILEGED) && soisprivilegedtraffic(so) &&
1236 msc != MBUF_SC_UNSPEC)
1237 m->m_pkthdr.pkt_flags |= PKTF_PRIO_PRIVILEGED;
1238 else
1239 m->m_pkthdr.pkt_flags &= ~PKTF_PRIO_PRIVILEGED;
1240
1241 no_mbtc:
1242 /*
1243 * For TCP with background traffic class switch CC algo based on sysctl
1244 */
1245 if (so->so_type == SOCK_STREAM)
1246 set_tcp_stream_priority(so);
1247
1248 so_tc_update_stats(m, so, msc);
1249 }
1250
1251 __private_extern__ void
1252 so_tc_update_stats(struct mbuf *m, struct socket *so, mbuf_svc_class_t msc)
1253 {
1254 mbuf_traffic_class_t mtc;
1255
1256 /*
1257 * Assume socket and mbuf traffic class values are the same
1258 * Also assume the socket lock is held. Note that the stats
1259 * at the socket layer are reduced down to the legacy traffic
1260 * classes; we could/should potentially expand so_tc_stats[].
1261 */
1262 mtc = MBUF_SC2TC(msc);
1263 VERIFY(mtc < SO_TC_STATS_MAX);
1264 so->so_tc_stats[mtc].txpackets += 1;
1265 so->so_tc_stats[mtc].txbytes += m->m_pkthdr.len;
1266 }
1267
1268 __private_extern__ void
1269 socket_tclass_init(void)
1270 {
1271 _CASSERT(_SO_TC_MAX == SO_TC_STATS_MAX);
1272
1273 tclass_lck_grp_attr = lck_grp_attr_alloc_init();
1274 tclass_lck_grp = lck_grp_alloc_init("tclass", tclass_lck_grp_attr);
1275 tclass_lck_attr = lck_attr_alloc_init();
1276 lck_mtx_init(tclass_lock, tclass_lck_grp, tclass_lck_attr);
1277 }
1278
1279 __private_extern__ mbuf_svc_class_t
1280 so_tc2msc(int tc)
1281 {
1282 mbuf_svc_class_t msc;
1283
1284 switch (tc) {
1285 case SO_TC_BK_SYS:
1286 msc = MBUF_SC_BK_SYS;
1287 break;
1288 case SO_TC_BK:
1289 case _SO_TC_BK:
1290 msc = MBUF_SC_BK;
1291 break;
1292 case SO_TC_BE:
1293 msc = MBUF_SC_BE;
1294 break;
1295 case SO_TC_RD:
1296 msc = MBUF_SC_RD;
1297 break;
1298 case SO_TC_OAM:
1299 msc = MBUF_SC_OAM;
1300 break;
1301 case SO_TC_AV:
1302 msc = MBUF_SC_AV;
1303 break;
1304 case SO_TC_RV:
1305 msc = MBUF_SC_RV;
1306 break;
1307 case SO_TC_VI:
1308 case _SO_TC_VI:
1309 msc = MBUF_SC_VI;
1310 break;
1311 case SO_TC_VO:
1312 case _SO_TC_VO:
1313 msc = MBUF_SC_VO;
1314 break;
1315 case SO_TC_CTL:
1316 msc = MBUF_SC_CTL;
1317 break;
1318 case SO_TC_ALL:
1319 default:
1320 msc = MBUF_SC_UNSPEC;
1321 break;
1322 }
1323
1324 return (msc);
1325 }
1326
1327 __private_extern__ int
1328 so_svc2tc(mbuf_svc_class_t svc)
1329 {
1330 switch (svc) {
1331 case MBUF_SC_BK_SYS:
1332 return (SO_TC_BK_SYS);
1333 case MBUF_SC_BK:
1334 return (SO_TC_BK);
1335 case MBUF_SC_BE:
1336 return (SO_TC_BE);
1337 case MBUF_SC_RD:
1338 return (SO_TC_RD);
1339 case MBUF_SC_OAM:
1340 return (SO_TC_OAM);
1341 case MBUF_SC_AV:
1342 return (SO_TC_AV);
1343 case MBUF_SC_RV:
1344 return (SO_TC_RV);
1345 case MBUF_SC_VI:
1346 return (SO_TC_VI);
1347 case MBUF_SC_VO:
1348 return (SO_TC_VO);
1349 case MBUF_SC_CTL:
1350 return (SO_TC_CTL);
1351 case MBUF_SC_UNSPEC:
1352 default:
1353 return (SO_TC_BE);
1354 }
1355 }
1356
1357 /*
1358 * LRO is turned on for AV streaming class.
1359 */
1360 void
1361 so_set_lro(struct socket *so, int optval)
1362 {
1363 if (optval == SO_TC_AV) {
1364 so->so_flags |= SOF_USELRO;
1365 } else {
1366 if (so->so_flags & SOF_USELRO) {
1367 /* transition to non LRO class */
1368 so->so_flags &= ~SOF_USELRO;
1369 struct inpcb *inp = sotoinpcb(so);
1370 struct tcpcb *tp = NULL;
1371 if (inp) {
1372 tp = intotcpcb(inp);
1373 if (tp && (tp->t_flagsext & TF_LRO_OFFLOADED)) {
1374 tcp_lro_remove_state(inp->inp_laddr,
1375 inp->inp_faddr,
1376 inp->inp_lport,
1377 inp->inp_fport);
1378 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
1379 }
1380 }
1381 }
1382 }
1383 }
1384
1385 static size_t
1386 sotc_index(int sotc)
1387 {
1388 switch (sotc) {
1389 case SO_TC_BK_SYS:
1390 return (SOTCIX_BK_SYS);
1391 case _SO_TC_BK:
1392 case SO_TC_BK:
1393 return (SOTCIX_BK);
1394
1395 case SO_TC_BE:
1396 return (SOTCIX_BE);
1397 case SO_TC_RD:
1398 return (SOTCIX_RD);
1399 case SO_TC_OAM:
1400 return (SOTCIX_OAM);
1401
1402 case SO_TC_AV:
1403 return (SOTCIX_AV);
1404 case SO_TC_RV:
1405 return (SOTCIX_RV);
1406 case _SO_TC_VI:
1407 case SO_TC_VI:
1408 return (SOTCIX_VI);
1409
1410 case _SO_TC_VO:
1411 case SO_TC_VO:
1412 return (SOTCIX_VO);
1413 case SO_TC_CTL:
1414 return (SOTCIX_CTL);
1415
1416 default:
1417 break;
1418 }
1419 /*
1420 * Unknown traffic class value
1421 */
1422 return (SIZE_T_MAX);
1423 }
1424
1425 /*
1426 * Pass NULL ifp for default map
1427 */
1428 static errno_t
1429 set_netsvctype_dscp_map(size_t in_count,
1430 const struct netsvctype_dscp_map *netsvctype_dscp_map)
1431 {
1432 size_t i;
1433 struct net_qos_dscp_map *net_qos_dscp_map = NULL;
1434 int netsvctype;
1435
1436 /*
1437 * Do not accept more that max number of distinct DSCPs
1438 */
1439 if (in_count > _MAX_DSCP || netsvctype_dscp_map == NULL)
1440 return (EINVAL);
1441
1442 /*
1443 * Validate input parameters
1444 */
1445 for (i = 0; i < in_count; i++) {
1446 if (!IS_VALID_NET_SERVICE_TYPE(netsvctype_dscp_map[i].netsvctype))
1447 return (EINVAL);
1448 if (netsvctype_dscp_map[i].dscp > _MAX_DSCP)
1449 return (EINVAL);
1450 }
1451
1452 net_qos_dscp_map = &default_net_qos_dscp_map;
1453
1454 for (i = 0; i < in_count; i++) {
1455 netsvctype = netsvctype_dscp_map[i].netsvctype;
1456
1457 net_qos_dscp_map->netsvctype_to_dscp[netsvctype] =
1458 netsvctype_dscp_map[i].dscp;
1459 }
1460 for (netsvctype = 0; netsvctype < _NET_SERVICE_TYPE_COUNT; netsvctype++) {
1461 switch (netsvctype) {
1462 case NET_SERVICE_TYPE_BE:
1463 case NET_SERVICE_TYPE_BK:
1464 case NET_SERVICE_TYPE_VI:
1465 case NET_SERVICE_TYPE_VO:
1466 case NET_SERVICE_TYPE_RV:
1467 case NET_SERVICE_TYPE_AV:
1468 case NET_SERVICE_TYPE_OAM:
1469 case NET_SERVICE_TYPE_RD: {
1470 size_t sotcix;
1471
1472 sotcix = sotc_index(sotc_by_netservicetype[netsvctype]);
1473 if (sotcix != SIZE_T_MAX) {
1474 net_qos_dscp_map->sotc_to_dscp[sotcix] =
1475 netsvctype_dscp_map[netsvctype].dscp;
1476 }
1477 break;
1478 }
1479 case NET_SERVICE_TYPE_SIG:
1480 /* Signaling does not have its own traffic class */
1481 break;
1482 default:
1483 /* We should not be here */
1484 ASSERT(0);
1485 }
1486 }
1487 /* Network control socket traffic class is always best effort */
1488 net_qos_dscp_map->sotc_to_dscp[SOTCIX_CTL] = _DSCP_DF;
1489
1490 /* Backround socket traffic class DSCP same as backround system */
1491 net_qos_dscp_map->sotc_to_dscp[SOTCIX_BK] =
1492 net_qos_dscp_map->sotc_to_dscp[SOTCIX_BK_SYS];
1493
1494 return (0);
1495 }
1496
1497 /*
1498 * out_count is an input/ouput parameter
1499 */
1500 static errno_t
1501 get_netsvctype_dscp_map(size_t *out_count,
1502 struct netsvctype_dscp_map *netsvctype_dscp_map)
1503 {
1504 size_t i;
1505 struct net_qos_dscp_map *net_qos_dscp_map = NULL;
1506
1507 /*
1508 * Do not accept more that max number of distinct DSCPs
1509 */
1510 if (out_count == NULL || netsvctype_dscp_map == NULL)
1511 return (EINVAL);
1512 if (*out_count > _MAX_DSCP)
1513 return (EINVAL);
1514
1515 net_qos_dscp_map = &default_net_qos_dscp_map;
1516
1517 for (i = 0; i < MIN(_NET_SERVICE_TYPE_COUNT, *out_count); i++) {
1518 netsvctype_dscp_map[i].netsvctype = i;
1519 netsvctype_dscp_map[i].dscp = net_qos_dscp_map->netsvctype_to_dscp[i];
1520
1521 }
1522 *out_count = i;
1523
1524 return (0);
1525 }
1526
1527 void
1528 net_qos_map_init()
1529 {
1530 errno_t error;
1531
1532 /*
1533 * By default use the Fastlane DSCP mappngs
1534 */
1535 error = set_netsvctype_dscp_map(_NET_SERVICE_TYPE_COUNT,
1536 fastlane_netsvctype_dscp_map);
1537 ASSERT(error == 0);
1538
1539 /*
1540 * No DSCP mapping for network control
1541 */
1542 default_net_qos_dscp_map.sotc_to_dscp[SOTCIX_CTL] = _DSCP_DF;
1543
1544 set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1);
1545 }
1546
1547 int
1548 sysctl_default_netsvctype_to_dscp_map SYSCTL_HANDLER_ARGS
1549 {
1550 #pragma unused(oidp, arg1, arg2)
1551 int error = 0;
1552 const size_t max_netsvctype_to_dscp_map_len =
1553 _NET_SERVICE_TYPE_COUNT * sizeof(struct netsvctype_dscp_map);
1554 size_t len;
1555 struct netsvctype_dscp_map netsvctype_dscp_map[_NET_SERVICE_TYPE_COUNT] = {};
1556 size_t count;
1557
1558 if (req->oldptr == USER_ADDR_NULL) {
1559 req->oldidx =
1560 _NET_SERVICE_TYPE_COUNT * sizeof(struct netsvctype_dscp_map);
1561 } else if (req->oldlen > 0) {
1562 count = _NET_SERVICE_TYPE_COUNT;
1563 error = get_netsvctype_dscp_map(&count, netsvctype_dscp_map);
1564 if (error != 0)
1565 goto done;
1566 len = count * sizeof(struct netsvctype_dscp_map);
1567 error = SYSCTL_OUT(req, netsvctype_dscp_map,
1568 MIN(len, req->oldlen));
1569 if (error != 0)
1570 goto done;
1571 }
1572
1573 if (req->newptr == USER_ADDR_NULL)
1574 goto done;
1575
1576 error = proc_suser(current_proc());
1577 if (error != 0)
1578 goto done;
1579
1580 /*
1581 * Check input length
1582 */
1583 if (req->newlen > max_netsvctype_to_dscp_map_len) {
1584 error = EINVAL;
1585 goto done;
1586 }
1587 /*
1588 * Cap the number of entries to copy from input buffer
1589 */
1590 error = SYSCTL_IN(req, netsvctype_dscp_map, req->newlen);
1591 if (error != 0)
1592 goto done;
1593
1594 count = req->newlen / sizeof(struct netsvctype_dscp_map);
1595 error = set_netsvctype_dscp_map(count, netsvctype_dscp_map);
1596 done:
1597 return (error);
1598 }
1599
1600 __private_extern__ errno_t
1601 set_packet_qos(struct mbuf *m, struct ifnet *ifp, boolean_t qos_allowed,
1602 int sotc, int netsvctype, u_int8_t *dscp_inout)
1603 {
1604 if (ifp == NULL || dscp_inout == NULL)
1605 return (EINVAL);
1606
1607 if ((ifp->if_eflags &
1608 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) ==
1609 (IFEF_QOSMARKING_ENABLED | IFEF_QOSMARKING_CAPABLE)) {
1610 u_int8_t dscp;
1611
1612 /*
1613 * When on a Fastlane network, IP_TOS/IPV6_TCLASS are no-ops
1614 */
1615 dscp = _DSCP_DF;
1616
1617 /*
1618 * For DSCP use the network service type is specified, otherwise
1619 * use the socket traffic class
1620 *
1621 * When not whitelisted by the policy, set DSCP only for best
1622 * effort and background, and set the mbuf service class to
1623 * best effort as well so the packet will be queued and
1624 * scheduled at a lower priority.
1625 * We still want to prioritize control traffic on the interface
1626 * so we do not change the mbuf service class for SO_TC_CTL
1627 */
1628 if (IS_VALID_NET_SERVICE_TYPE(netsvctype) &&
1629 netsvctype != NET_SERVICE_TYPE_BE) {
1630 dscp = default_net_qos_dscp_map.netsvctype_to_dscp[netsvctype];
1631
1632 if (qos_allowed == FALSE &&
1633 netsvctype != NET_SERVICE_TYPE_BE &&
1634 netsvctype != NET_SERVICE_TYPE_BK) {
1635 dscp = _DSCP_DF;
1636 if (sotc != SO_TC_CTL)
1637 m_set_service_class(m, MBUF_SC_BE);
1638 }
1639 } else if (sotc != SO_TC_UNSPEC) {
1640 size_t sotcix = sotc_index(sotc);
1641 if (sotcix != SIZE_T_MAX) {
1642 dscp = default_net_qos_dscp_map.sotc_to_dscp[sotcix];
1643
1644 if (qos_allowed == FALSE && sotc != SO_TC_BE &&
1645 sotc != SO_TC_BK && sotc != SO_TC_BK_SYS &&
1646 sotc != SO_TC_CTL) {
1647 dscp = _DSCP_DF;
1648 if (sotc != SO_TC_CTL)
1649 m_set_service_class(m, MBUF_SC_BE);
1650 }
1651 }
1652 }
1653 if (net_qos_verbose != 0)
1654 printf("%s qos_allowed %d sotc %u netsvctype %u dscp %u\n",
1655 __func__, qos_allowed, sotc, netsvctype, dscp);
1656
1657 if (*dscp_inout != dscp) {
1658 *dscp_inout = dscp;
1659 }
1660 } else if (*dscp_inout != _DSCP_DF && IFNET_IS_WIFI_INFRA(ifp)) {
1661 mbuf_svc_class_t msc = m_get_service_class(m);
1662
1663 /*
1664 * For WiFi infra, when the mbuf service class is best effort
1665 * and the DSCP is not default, set the service class based
1666 * on DSCP
1667 */
1668 if (msc == MBUF_SC_BE) {
1669 msc = wifi_dscp_to_msc_array[*dscp_inout];
1670
1671 if (msc != MBUF_SC_BE) {
1672 m_set_service_class(m, msc);
1673
1674 if (net_qos_verbose != 0)
1675 printf("%s set msc %u for dscp %u\n",
1676 __func__, msc, *dscp_inout);
1677 }
1678 }
1679 }
1680
1681 return (0);
1682 }
1683
1684 static void
1685 set_dscp_to_wifi_ac_map(const struct dcsp_msc_map *map, int clear)
1686 {
1687 int i;
1688
1689 if (clear)
1690 bzero(wifi_dscp_to_msc_array, sizeof(wifi_dscp_to_msc_array));
1691
1692 for (i = 0; i < DSCP_ARRAY_SIZE; i++) {
1693 const struct dcsp_msc_map *elem = map + i;
1694
1695 if (elem->dscp > _MAX_DSCP || elem->msc == MBUF_SC_UNSPEC)
1696 break;
1697 switch (elem->msc) {
1698 case MBUF_SC_BK_SYS:
1699 case MBUF_SC_BK:
1700 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_BK;
1701 break;
1702 default:
1703 case MBUF_SC_BE:
1704 case MBUF_SC_RD:
1705 case MBUF_SC_OAM:
1706 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_BE;
1707 break;
1708 case MBUF_SC_AV:
1709 case MBUF_SC_RV:
1710 case MBUF_SC_VI:
1711 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_VI;
1712 break;
1713 case MBUF_SC_VO:
1714 case MBUF_SC_CTL:
1715 wifi_dscp_to_msc_array[elem->dscp] = MBUF_SC_VO;
1716 break;
1717 }
1718 }
1719 }
1720
1721 static errno_t
1722 dscp_msc_map_from_netsvctype_dscp_map(struct netsvctype_dscp_map *netsvctype_dscp_map,
1723 size_t count, struct dcsp_msc_map *dcsp_msc_map)
1724 {
1725 errno_t error = 0;
1726 u_int32_t i;
1727
1728 /*
1729 * Validate input parameters
1730 */
1731 for (i = 0; i < count; i++) {
1732 if (!SO_VALID_TC(netsvctype_dscp_map[i].netsvctype)) {
1733 error = EINVAL;
1734 goto done;
1735 }
1736 if (netsvctype_dscp_map[i].dscp > _MAX_DSCP) {
1737 error = EINVAL;
1738 goto done;
1739 }
1740 }
1741
1742 bzero(dcsp_msc_map, DSCP_ARRAY_SIZE * sizeof(struct dcsp_msc_map));
1743
1744 for (i = 0; i < count; i++) {
1745 dcsp_msc_map[i].dscp = netsvctype_dscp_map[i].dscp;
1746 dcsp_msc_map[i].msc = so_tc2msc(netsvctype_dscp_map[i].netsvctype);
1747 }
1748 done:
1749 return (error);
1750 }
1751
1752 int
1753 sysctl_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS
1754 {
1755 #pragma unused(oidp, arg1, arg2)
1756 int error = 0;
1757 size_t len = DSCP_ARRAY_SIZE * sizeof(struct netsvctype_dscp_map);
1758 struct netsvctype_dscp_map netsvctype_dscp_map[DSCP_ARRAY_SIZE] = {};
1759 struct dcsp_msc_map dcsp_msc_map[DSCP_ARRAY_SIZE];
1760 size_t count;
1761 u_int32_t i;
1762
1763 if (req->oldptr == USER_ADDR_NULL) {
1764 req->oldidx = len;
1765 } else if (req->oldlen > 0) {
1766 for (i = 0; i < DSCP_ARRAY_SIZE; i++) {
1767 netsvctype_dscp_map[i].dscp = i;
1768 netsvctype_dscp_map[i].netsvctype =
1769 so_svc2tc(wifi_dscp_to_msc_array[i]);
1770 }
1771 error = SYSCTL_OUT(req, netsvctype_dscp_map,
1772 MIN(len, req->oldlen));
1773 if (error != 0)
1774 goto done;
1775 }
1776
1777 if (req->newptr == USER_ADDR_NULL)
1778 goto done;
1779
1780 error = proc_suser(current_proc());
1781 if (error != 0)
1782 goto done;
1783
1784 /*
1785 * Check input length
1786 */
1787 if (req->newlen > len) {
1788 error = EINVAL;
1789 goto done;
1790 }
1791 /*
1792 * Cap the number of entries to copy from input buffer
1793 */
1794 if (len > req->newlen)
1795 len = req->newlen;
1796 error = SYSCTL_IN(req, netsvctype_dscp_map, len);
1797 if (error != 0) {
1798 goto done;
1799 }
1800 count = len / sizeof(struct netsvctype_dscp_map);
1801 bzero(dcsp_msc_map, sizeof(dcsp_msc_map));
1802 error = dscp_msc_map_from_netsvctype_dscp_map(netsvctype_dscp_map, count,
1803 dcsp_msc_map);
1804 if (error != 0) {
1805 goto done;
1806 }
1807 set_dscp_to_wifi_ac_map(dcsp_msc_map, 0);
1808 done:
1809 return (error);
1810 }
1811
1812 int
1813 sysctl_reset_dscp_to_wifi_ac_map SYSCTL_HANDLER_ARGS
1814 {
1815 #pragma unused(oidp, arg1, arg2)
1816 int error = 0;
1817 int val = 0;
1818
1819 error = sysctl_handle_int(oidp, &val, 0, req);
1820 if (error || !req->newptr)
1821 return (error);
1822
1823 set_dscp_to_wifi_ac_map(default_dscp_to_wifi_ac_map, 1);
1824
1825 return (0);
1826 }
1827
1828 /*
1829 * Returns whether a large upload or download transfer should be marked as
1830 * BK service type for network activity. This is a system level
1831 * hint/suggestion to classify application traffic based on statistics
1832 * collected from the current network attachment
1833 *
1834 * Returns 1 for BK and 0 for default
1835 */
1836
1837 int
1838 net_qos_guideline(struct proc *p, struct net_qos_guideline_args *arg,
1839 int *retval)
1840 {
1841 #pragma unused(p)
1842 #define RETURN_USE_BK 1
1843 #define RETURN_USE_DEFAULT 0
1844 struct net_qos_param qos_arg;
1845 struct ifnet *ipv4_primary, *ipv6_primary;
1846 int err = 0;
1847
1848 if (arg->param == USER_ADDR_NULL || retval == NULL ||
1849 arg->param_len != sizeof (qos_arg)) {
1850 return (EINVAL);
1851 }
1852 err = copyin(arg->param, (caddr_t) &qos_arg, sizeof (qos_arg));
1853 if (err != 0)
1854 return (err);
1855
1856 *retval = RETURN_USE_DEFAULT;
1857 ipv4_primary = ifindex2ifnet[get_primary_ifscope(AF_INET)];
1858 ipv6_primary = ifindex2ifnet[get_primary_ifscope(AF_INET6)];
1859
1860 /*
1861 * If either of the interfaces is in Low Internet mode, enable
1862 * background delay based algorithms on this transfer
1863 */
1864 if (qos_arg.nq_uplink) {
1865 if ((ipv4_primary != NULL &&
1866 (ipv4_primary->if_xflags & IFXF_LOW_INTERNET_UL)) ||
1867 (ipv6_primary != NULL &&
1868 (ipv6_primary->if_xflags & IFXF_LOW_INTERNET_UL))) {
1869 *retval = RETURN_USE_BK;
1870 return (0);
1871 }
1872 } else {
1873 if ((ipv4_primary != NULL &&
1874 (ipv4_primary->if_xflags & IFXF_LOW_INTERNET_DL)) ||
1875 (ipv6_primary != NULL &&
1876 (ipv6_primary->if_xflags & IFXF_LOW_INTERNET_DL))) {
1877 *retval = RETURN_USE_BK;
1878 return (0);
1879 }
1880 }
1881
1882 /*
1883 * Some times IPv4 and IPv6 primary interfaces can be different.
1884 * In this case, if either of them is non-cellular, we should mark
1885 * the transfer as BK as it can potentially get used based on
1886 * the host name resolution
1887 */
1888 if (ipv4_primary != NULL && IFNET_IS_EXPENSIVE(ipv4_primary) &&
1889 ipv6_primary != NULL && IFNET_IS_EXPENSIVE(ipv6_primary)) {
1890 if (qos_arg.nq_use_expensive) {
1891 return (0);
1892 } else {
1893 *retval = RETURN_USE_BK;
1894 return (0);
1895 }
1896 }
1897 if (qos_arg.nq_transfer_size >= 5 * 1024 * 1024) {
1898 *retval = RETURN_USE_BK;
1899 return (0);
1900 }
1901
1902
1903 #undef RETURN_USE_BK
1904 #undef RETURN_USE_DEFAULT
1905 return (0);
1906 }