1 /*
2 * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125
126 #if CONFIG_MACF
127 #include <security/mac_framework.h>
128 #endif /* MAC */
129
130 #if MULTIPATH
131 #include <netinet/mp_pcb.h>
132 #include <netinet/mptcp_var.h>
133 #endif /* MULTIPATH */
134
135 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
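/*
 * For example, ROUNDUP(10, 8) == 16 and ROUNDUP(16, 8) == 16; the
 * add-and-mask trick assumes (b) is a power of two.
 */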
136
137 #if DEBUG || DEVELOPMENT
138 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
139 #else
140 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
141 #endif
142
143 /* TODO: this should be in a header file somewhere */
144 extern char *proc_name_address(void *p);
145
146 static u_int32_t so_cache_hw; /* High water mark for socache */
147 static u_int32_t so_cache_timeouts; /* number of timeouts */
148 static u_int32_t so_cache_max_freed; /* max freed per timeout */
149 static u_int32_t cached_sock_count = 0;
150 STAILQ_HEAD(, socket) so_cache_head;
151 int max_cached_sock_count = MAX_CACHED_SOCKETS;
152 static u_int32_t so_cache_time;
153 static int socketinit_done;
154 static struct zone *so_cache_zone;
155
156 static lck_grp_t *so_cache_mtx_grp;
157 static lck_attr_t *so_cache_mtx_attr;
158 static lck_grp_attr_t *so_cache_mtx_grp_attr;
159 static lck_mtx_t *so_cache_mtx;
160
161 #include <machine/limits.h>
162
163 static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
164 static void filt_sordetach(struct knote *kn);
165 static int filt_soread(struct knote *kn, long hint);
166 static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
167 static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
168
169 static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
170 static void filt_sowdetach(struct knote *kn);
171 static int filt_sowrite(struct knote *kn, long hint);
172 static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
173 static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
174
175 static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
176 static void filt_sockdetach(struct knote *kn);
177 static int filt_sockev(struct knote *kn, long hint);
178 static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
179 static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
180
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 .f_isfd = 1,
186 .f_attach = filt_sorattach,
187 .f_detach = filt_sordetach,
188 .f_event = filt_soread,
189 .f_touch = filt_sortouch,
190 .f_process = filt_sorprocess,
191 };
192
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 .f_isfd = 1,
195 .f_attach = filt_sowattach,
196 .f_detach = filt_sowdetach,
197 .f_event = filt_sowrite,
198 .f_touch = filt_sowtouch,
199 .f_process = filt_sowprocess,
200 };
201
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 .f_isfd = 1,
204 .f_attach = filt_sockattach,
205 .f_detach = filt_sockdetach,
206 .f_event = filt_sockev,
207 .f_touch = filt_socktouch,
208 .f_process = filt_sockprocess,
209 };
210
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 .f_isfd = 1,
213 .f_attach = filt_sorattach,
214 .f_detach = filt_sordetach,
215 .f_event = filt_soread,
216 .f_touch = filt_sortouch,
217 .f_process = filt_sorprocess,
218 };
219
220 SYSCTL_DECL(_kern_ipc);
221
222 #define EVEN_MORE_LOCKING_DEBUG 0
223
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230 &sodefunct_calls, "");
231
232 static int socket_zone = M_SOCKET;
233 so_gen_t so_gencnt; /* generation count for sockets */
234
235 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
236 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
237
238 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
239 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
240 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
241 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
242 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
243 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
244 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
245 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
246 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
247
248 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
249
250 int somaxconn = SOMAXCONN;
251 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
252 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
253
254 /* Should we get a maximum also ??? */
255 static int sosendmaxchain = 65536;
256 static int sosendminchain = 16384;
257 static int sorecvmincopy = 16384;
258 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
259 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
260 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
261 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
262
263 /*
264 * Set to enable jumbo clusters (if available) for large writes when
265 * the socket is marked with SOF_MULTIPAGES; see below.
266 */
267 int sosendjcl = 1;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
269 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
270
271 /*
272 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
273 * writes on the socket for all protocols on any network interfaces,
274 * depending upon sosendjcl above. Be extra careful when setting this
275 * to 1, because sending packets that cross physical pages down to
276 * broken drivers (those that falsely assume that the physical pages
277 * are contiguous) might lead to system panics or silent data corruption.
278 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
279 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
280 * capable. Set this to 1 only for testing/debugging purposes.
281 */
282 int sosendjcl_ignore_capab = 0;
283 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
284 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
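
/*
 * A minimal sketch (illustrative only; the authoritative check lives in
 * sosend()) of how the two knobs above are expected to combine when
 * deciding whether a large write may use jumbo clusters:
 *
 *	if (sosendjcl &&
 *	    (sosendjcl_ignore_capab || (so->so_flags & SOF_MULTIPAGES))) {
 *		// the write may be copied into jumbo clusters
 *	}
 *
 * With sosendjcl_ignore_capab left at 0, only sockets whose outgoing
 * interface is IFNET_MULTIPAGES capable (and were therefore marked
 * SOF_MULTIPAGES) take the jumbo-cluster path.
 */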
285
286 /*
287 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
288 * writes on the socket for all protocols on any network interfaces.
289 * Be extra careful when setting this to 1, because sending down packets with
290 * clusters larger than 2 KB might lead to system panics or data corruption.
291 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
292 * on the outgoing interface.
293 * Set this to 1 for testing/debugging purposes only.
294 */
295 int sosendbigcl_ignore_capab = 0;
296 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
297 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
298
299 int sodefunctlog = 0;
300 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
301 &sodefunctlog, 0, "");
302
303 int sothrottlelog = 0;
304 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
305 &sothrottlelog, 0, "");
306
307 int sorestrictrecv = 1;
308 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
309 &sorestrictrecv, 0, "Enable inbound interface restrictions");
310
311 int sorestrictsend = 1;
312 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
313 &sorestrictsend, 0, "Enable outbound interface restrictions");
314
315 int soreserveheadroom = 1;
316 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
317 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
318
319 #if (DEBUG || DEVELOPMENT)
320 int so_notsent_lowat_check = 1;
321 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
322 &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
323 #endif /* DEBUG || DEVELOPMENT */
324
325 int so_accept_list_waits = 0;
326 #if (DEBUG || DEVELOPMENT)
327 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
328 &so_accept_list_waits, 0, "number of waits for listener incomp list");
329 #endif /* DEBUG || DEVELOPMENT */
330
331 extern struct inpcbinfo tcbinfo;
332
333 /* TODO: these should be in a header file */
334 extern int get_inpcb_str_size(void);
335 extern int get_tcp_str_size(void);
336
337 vm_size_t so_cache_zone_element_size;
338
339 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
340 user_ssize_t *);
341 static void cached_sock_alloc(struct socket **, int);
342 static void cached_sock_free(struct socket *);
343
344 /*
345 * Maximum number of extended background idle sockets per process
346 * Set to zero to disable further setting of the option
347 */
348
349 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
350 #define SO_IDLE_BK_IDLE_TIME 600
351 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
352
353 struct soextbkidlestat soextbkidlestat;
354
355 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
356 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
357 "Maximum of extended background idle sockets per process");
358
359 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
360 &soextbkidlestat.so_xbkidle_time, 0,
361 "Time in seconds to keep extended background idle sockets");
362
363 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
364 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
365 "High water mark for extended background idle sockets");
366
367 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
368 &soextbkidlestat, soextbkidlestat, "");
369
370 int so_set_extended_bk_idle(struct socket *, int);
371
372
373 /*
374 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
375 * setting the DSCP code on the packet based on the service class; see
376 * <rdar://problem/11277343> for details.
377 */
378 __private_extern__ u_int32_t sotcdb = 0;
379 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
380 &sotcdb, 0, "");
381
382 void
383 socketinit(void)
384 {
385 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
386 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
387
388 #ifdef __LP64__
389 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
391 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
392 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
393 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
394 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
395 #else
396 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
398 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
399 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
400 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
401 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
402 #endif
403
404 if (socketinit_done) {
405 printf("socketinit: already called...\n");
406 return;
407 }
408 socketinit_done = 1;
409
410 PE_parse_boot_argn("socket_debug", &socket_debug,
411 sizeof(socket_debug));
412
413 /*
414 * allocate lock group attribute and group for socket cache mutex
415 */
416 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
417 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
418 so_cache_mtx_grp_attr);
419
420 /*
421 * allocate the lock attribute for socket cache mutex
422 */
423 so_cache_mtx_attr = lck_attr_alloc_init();
424
425 /* cached sockets mutex */
426 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
427 if (so_cache_mtx == NULL) {
428 panic("%s: unable to allocate so_cache_mtx\n", __func__);
429 /* NOTREACHED */
430 }
431 STAILQ_INIT(&so_cache_head);
432
433 so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
434 + get_inpcb_str_size() + 4 + get_tcp_str_size());
435
436 so_cache_zone = zinit(so_cache_zone_element_size,
437 (120000 * so_cache_zone_element_size), 8192, "socache zone");
438 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
439 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
440
441 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
442 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
443 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
444 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
445
446 in_pcbinit();
447 sflt_init();
448 socket_tclass_init();
449 #if MULTIPATH
450 mp_pcbinit();
451 #endif /* MULTIPATH */
452 }
453
454 static void
455 cached_sock_alloc(struct socket **so, int waitok)
456 {
457 caddr_t temp;
458 uintptr_t offset;
459
460 lck_mtx_lock(so_cache_mtx);
461
462 if (!STAILQ_EMPTY(&so_cache_head)) {
463 VERIFY(cached_sock_count > 0);
464
465 *so = STAILQ_FIRST(&so_cache_head);
466 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
467 STAILQ_NEXT((*so), so_cache_ent) = NULL;
468
469 cached_sock_count--;
470 lck_mtx_unlock(so_cache_mtx);
471
472 temp = (*so)->so_saved_pcb;
473 bzero((caddr_t)*so, sizeof(struct socket));
474
475 (*so)->so_saved_pcb = temp;
476 } else {
477 lck_mtx_unlock(so_cache_mtx);
478
479 if (waitok) {
480 *so = (struct socket *)zalloc(so_cache_zone);
481 } else {
482 *so = (struct socket *)zalloc_noblock(so_cache_zone);
483 }
484
485 if (*so == NULL) {
486 return;
487 }
488
489 bzero((caddr_t)*so, sizeof(struct socket));
490
491 /*
492 * Define offsets for extra structures into our
493 * single block of memory. Align extra structures
494 * on longword boundaries.
495 */
496
497 offset = (uintptr_t)*so;
498 offset += sizeof(struct socket);
499
500 offset = ALIGN(offset);
501
502 (*so)->so_saved_pcb = (caddr_t)offset;
503 offset += get_inpcb_str_size();
504
505 offset = ALIGN(offset);
506
507 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
508 (caddr_t)offset;
509 }
510
511 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
512 }
513
514 static void
515 cached_sock_free(struct socket *so)
516 {
517 lck_mtx_lock(so_cache_mtx);
518
519 so_cache_time = net_uptime();
520 if (++cached_sock_count > max_cached_sock_count) {
521 --cached_sock_count;
522 lck_mtx_unlock(so_cache_mtx);
523 zfree(so_cache_zone, so);
524 } else {
525 if (so_cache_hw < cached_sock_count) {
526 so_cache_hw = cached_sock_count;
527 }
528
529 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
530
531 so->cache_timestamp = so_cache_time;
532 lck_mtx_unlock(so_cache_mtx);
533 }
534 }
535
536 void
537 so_update_last_owner_locked(struct socket *so, proc_t self)
538 {
539 if (so->last_pid != 0) {
540 /*
541 * last_pid and last_upid should remain zero for sockets
542 * created using sock_socket. The check above achieves that
543 */
544 if (self == PROC_NULL) {
545 self = current_proc();
546 }
547
548 if (so->last_upid != proc_uniqueid(self) ||
549 so->last_pid != proc_pid(self)) {
550 so->last_upid = proc_uniqueid(self);
551 so->last_pid = proc_pid(self);
552 proc_getexecutableuuid(self, so->last_uuid,
553 sizeof(so->last_uuid));
554 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
555 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
556 }
557 }
558 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
559 }
560 }
561
562 void
563 so_update_policy(struct socket *so)
564 {
565 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
566 (void) inp_update_policy(sotoinpcb(so));
567 }
568 }
569
570 #if NECP
571 static void
572 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
573 struct sockaddr *override_remote_addr)
574 {
575 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
576 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
577 override_remote_addr, 0);
578 }
579 }
580 #endif /* NECP */
581
582 boolean_t
583 so_cache_timer(void)
584 {
585 struct socket *p;
586 int n_freed = 0;
587 boolean_t rc = FALSE;
588
589 lck_mtx_lock(so_cache_mtx);
590 so_cache_timeouts++;
591 so_cache_time = net_uptime();
592
593 while (!STAILQ_EMPTY(&so_cache_head)) {
594 VERIFY(cached_sock_count > 0);
595 p = STAILQ_FIRST(&so_cache_head);
596 if ((so_cache_time - p->cache_timestamp) <
597 SO_CACHE_TIME_LIMIT) {
598 break;
599 }
600
601 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
602 --cached_sock_count;
603
604 zfree(so_cache_zone, p);
605
606 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
607 so_cache_max_freed++;
608 break;
609 }
610 }
611
612 /* Schedule again if there is more to cleanup */
613 if (!STAILQ_EMPTY(&so_cache_head)) {
614 rc = TRUE;
615 }
616
617 lck_mtx_unlock(so_cache_mtx);
618 return rc;
619 }
620
621 /*
622 * Get a socket structure from our zone, and initialize it.
623 * We don't implement `waitok' yet (see comments in uipc_domain.c).
624 * Note that it would probably be better to allocate socket
625 * and PCB at the same time, but I'm not convinced that all
626 * the protocols can be easily modified to do this.
627 */
628 struct socket *
629 soalloc(int waitok, int dom, int type)
630 {
631 struct socket *so;
632
633 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
634 cached_sock_alloc(&so, waitok);
635 } else {
636 MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone,
637 M_WAITOK);
638 if (so != NULL) {
639 bzero(so, sizeof(*so));
640 }
641 }
642 if (so != NULL) {
643 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
644 so->so_zone = socket_zone;
645
646 /*
647 * Increment the socket allocation statistics
648 */
649 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
650
651 #if CONFIG_MACF_SOCKET
652 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
653 if (mac_socket_label_init(so, !waitok) != 0) {
654 sodealloc(so);
655 return NULL;
656 }
657 #endif /* MAC_SOCKET */
658 }
659
660 return so;
661 }
662
663 int
664 socreate_internal(int dom, struct socket **aso, int type, int proto,
665 struct proc *p, uint32_t flags, struct proc *ep)
666 {
667 struct protosw *prp;
668 struct socket *so;
669 int error = 0;
670 #if defined(XNU_TARGET_OS_OSX)
671 pid_t rpid = -1;
672 #endif
673
674 #if TCPDEBUG
675 extern int tcpconsdebug;
676 #endif
677
678 VERIFY(aso != NULL);
679 *aso = NULL;
680
681 if (proto != 0) {
682 prp = pffindproto(dom, proto, type);
683 } else {
684 prp = pffindtype(dom, type);
685 }
686
687 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
688 if (pffinddomain(dom) == NULL) {
689 return EAFNOSUPPORT;
690 }
691 if (proto != 0) {
692 if (pffindprotonotype(dom, proto) != NULL) {
693 return EPROTOTYPE;
694 }
695 }
696 return EPROTONOSUPPORT;
697 }
698 if (prp->pr_type != type) {
699 return EPROTOTYPE;
700 }
701 so = soalloc(1, dom, type);
702 if (so == NULL) {
703 return ENOBUFS;
704 }
705
706 switch (dom) {
707 case PF_LOCAL:
708 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
709 break;
710 case PF_INET:
711 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
712 if (type == SOCK_STREAM) {
713 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
714 } else {
715 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
716 }
717 break;
718 case PF_ROUTE:
719 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
720 break;
721 case PF_NDRV:
722 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
723 break;
724 case PF_KEY:
725 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
726 break;
727 case PF_INET6:
728 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
729 if (type == SOCK_STREAM) {
730 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
731 } else {
732 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
733 }
734 break;
735 case PF_SYSTEM:
736 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
737 break;
738 case PF_MULTIPATH:
739 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
740 break;
741 default:
742 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
743 break;
744 }
745
746 if (flags & SOCF_MPTCP) {
747 so->so_state |= SS_NBIO;
748 }
749
750 TAILQ_INIT(&so->so_incomp);
751 TAILQ_INIT(&so->so_comp);
752 so->so_type = type;
753 so->last_upid = proc_uniqueid(p);
754 so->last_pid = proc_pid(p);
755 proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
756 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
757
758 if (ep != PROC_NULL && ep != p) {
759 so->e_upid = proc_uniqueid(ep);
760 so->e_pid = proc_pid(ep);
761 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
762 so->so_flags |= SOF_DELEGATED;
763 #if defined(XNU_TARGET_OS_OSX)
764 if (ep->p_responsible_pid != so->e_pid) {
765 rpid = ep->p_responsible_pid;
766 }
767 #endif
768 }
769
770 #if defined(XNU_TARGET_OS_OSX)
771 if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
772 rpid = p->p_responsible_pid;
773 }
774
775 so->so_rpid = -1;
776 uuid_clear(so->so_ruuid);
777 if (rpid >= 0) {
778 proc_t rp = proc_find(rpid);
779 if (rp != PROC_NULL) {
780 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
781 so->so_rpid = rpid;
782 proc_rele(rp);
783 }
784 }
785 #endif
786
787 so->so_cred = kauth_cred_proc_ref(p);
788 if (!suser(kauth_cred_get(), NULL)) {
789 so->so_state |= SS_PRIV;
790 }
791
792 so->so_proto = prp;
793 so->so_rcv.sb_flags |= SB_RECV;
794 so->so_rcv.sb_so = so->so_snd.sb_so = so;
795 so->next_lock_lr = 0;
796 so->next_unlock_lr = 0;
797
798 #if CONFIG_MACF_SOCKET
799 mac_socket_label_associate(kauth_cred_get(), so);
800 #endif /* MAC_SOCKET */
801
802 /*
803 * Attachment will create the per-pcb lock if necessary and
804 * increase the refcount for creation; make sure this is done before
805 * the socket is inserted in any lists.
806 */
807 so->so_usecount++;
808
809 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
810 if (error != 0) {
811 /*
812 * Warning:
813 * If so_pcb is not zero, the socket will be leaked,
814 * so protocol attachment handler must be coded carefuly
815 */
816 so->so_state |= SS_NOFDREF;
817 VERIFY(so->so_usecount > 0);
818 so->so_usecount--;
819 sofreelastref(so, 1); /* will deallocate the socket */
820 return error;
821 }
822
823 /*
824 * Note: needs so_pcb to be set after pru_attach
825 */
826 if (prp->pr_update_last_owner != NULL) {
827 (*prp->pr_update_last_owner)(so, p, ep);
828 }
829
830 atomic_add_32(&prp->pr_domain->dom_refs, 1);
831 TAILQ_INIT(&so->so_evlist);
832
833 /* Attach socket filters for this protocol */
834 sflt_initsock(so);
835 #if TCPDEBUG
836 if (tcpconsdebug == 2) {
837 so->so_options |= SO_DEBUG;
838 }
839 #endif
840 so_set_default_traffic_class(so);
841
842 /*
843 * If this thread or task is marked to create backgrounded sockets,
844 * mark the socket as background.
845 */
846 if (!(flags & SOCF_MPTCP) &&
847 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
848 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
849 so->so_background_thread = current_thread();
850 }
851
852 switch (dom) {
853 /*
854 * Don't mark Unix domain or system
855 * eligible for defunct by default.
856 */
857 case PF_LOCAL:
858 case PF_SYSTEM:
859 so->so_flags |= SOF_NODEFUNCT;
860 break;
861 default:
862 break;
863 }
864
865 /*
866 * Entitlements can't be checked at socket creation time except if the
867 * application requested a feature guarded by a privilege (c.f., socket
868 * delegation).
869 * The priv(9) and the Sandboxing APIs are designed with the idea that
870 * a privilege check should only be triggered by a userland request.
871 * A privilege check at socket creation time is time-consuming and
872 * could trigger many authorisation error messages from the security
873 * APIs.
874 */
875
876 *aso = so;
877
878 return 0;
879 }
880
881 /*
882 * Returns: 0 Success
883 * EAFNOSUPPORT
884 * EPROTOTYPE
885 * EPROTONOSUPPORT
886 * ENOBUFS
887 * <pru_attach>:ENOBUFS[AF_UNIX]
888 * <pru_attach>:ENOBUFS[TCP]
889 * <pru_attach>:ENOMEM[TCP]
890 * <pru_attach>:??? [other protocol families, IPSEC]
891 */
892 int
893 socreate(int dom, struct socket **aso, int type, int proto)
894 {
895 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
896 PROC_NULL);
897 }
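
/*
 * Illustrative usage (a sketch, not taken from a real caller): an
 * in-kernel client would typically pair socreate() with soclose():
 *
 *	struct socket *so = NULL;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		... use the socket (sobindlock(), soconnect(), ...) ...
 *		soclose(so);
 *	}
 *
 * Most kernel subsystems go through the sock_socket()/sock_close() KPIs
 * instead of calling these routines directly.
 */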
898
899 int
900 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
901 {
902 int error = 0;
903 struct proc *ep = PROC_NULL;
904
905 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
906 error = ESRCH;
907 goto done;
908 }
909
910 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
911
912 /*
913 * It might not be wise to hold the proc reference when calling
914 * socreate_internal since it calls soalloc with M_WAITOK
915 */
916 done:
917 if (ep != PROC_NULL) {
918 proc_rele(ep);
919 }
920
921 return error;
922 }
923
924 /*
925 * Returns: 0 Success
926 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
927 * <pru_bind>:EAFNOSUPPORT Address family not supported
928 * <pru_bind>:EADDRNOTAVAIL Address not available.
929 * <pru_bind>:EINVAL Invalid argument
930 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
931 * <pru_bind>:EACCES Permission denied
932 * <pru_bind>:EADDRINUSE Address in use
933 * <pru_bind>:EAGAIN Resource unavailable, try again
934 * <pru_bind>:EPERM Operation not permitted
935 * <pru_bind>:???
936 * <sf_bind>:???
937 *
938 * Notes: It's not possible to fully enumerate the return codes above,
939 * since socket filter authors and protocol family authors may
940 * not choose to limit their error returns to those listed, even
941 * though this may result in some software operating incorrectly.
942 *
943 * The error codes which are enumerated above are those known to
944 * be returned by the tcp_usr_bind function supplied.
945 */
946 int
947 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
948 {
949 struct proc *p = current_proc();
950 int error = 0;
951
952 if (dolock) {
953 socket_lock(so, 1);
954 }
955
956 so_update_last_owner_locked(so, p);
957 so_update_policy(so);
958
959 #if NECP
960 so_update_necp_policy(so, nam, NULL);
961 #endif /* NECP */
962
963 /*
964 * If this is a bind request on a socket that has been marked
965 * as inactive, reject it now before we go any further.
966 */
967 if (so->so_flags & SOF_DEFUNCT) {
968 error = EINVAL;
969 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
970 __func__, proc_pid(p), proc_best_name(p),
971 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
972 SOCK_DOM(so), SOCK_TYPE(so), error);
973 goto out;
974 }
975
976 /* Socket filter */
977 error = sflt_bind(so, nam);
978
979 if (error == 0) {
980 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
981 }
982 out:
983 if (dolock) {
984 socket_unlock(so, 1);
985 }
986
987 if (error == EJUSTRETURN) {
988 error = 0;
989 }
990
991 return error;
992 }
993
994 void
995 sodealloc(struct socket *so)
996 {
997 kauth_cred_unref(&so->so_cred);
998
999 /* Remove any filters */
1000 sflt_termsock(so);
1001
1002 #if CONTENT_FILTER
1003 cfil_sock_detach(so);
1004 #endif /* CONTENT_FILTER */
1005
1006 /* Delete the state allocated for msg queues on a socket */
1007 if (so->so_flags & SOF_ENABLE_MSGS) {
1008 FREE(so->so_msg_state, M_TEMP);
1009 so->so_msg_state = NULL;
1010 }
1011 VERIFY(so->so_msg_state == NULL);
1012
1013 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
1014
1015 #if CONFIG_MACF_SOCKET
1016 mac_socket_label_destroy(so);
1017 #endif /* MAC_SOCKET */
1018
1019 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
1020 cached_sock_free(so);
1021 } else {
1022 FREE_ZONE(so, sizeof(*so), so->so_zone);
1023 }
1024 }
1025
1026 /*
1027 * Returns: 0 Success
1028 * EINVAL
1029 * EOPNOTSUPP
1030 * <pru_listen>:EINVAL[AF_UNIX]
1031 * <pru_listen>:EINVAL[TCP]
1032 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
1033 * <pru_listen>:EINVAL[TCP] Invalid argument
1034 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
1035 * <pru_listen>:EACCES[TCP] Permission denied
1036 * <pru_listen>:EADDRINUSE[TCP] Address in use
1037 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
1038 * <pru_listen>:EPERM[TCP] Operation not permitted
1039 * <sf_listen>:???
1040 *
1041 * Notes: Other <pru_listen> returns depend on the protocol family; all
1042 * <sf_listen> returns depend on what the filter author causes
1043 * their filter to return.
1044 */
1045 int
1046 solisten(struct socket *so, int backlog)
1047 {
1048 struct proc *p = current_proc();
1049 int error = 0;
1050
1051 socket_lock(so, 1);
1052
1053 so_update_last_owner_locked(so, p);
1054 so_update_policy(so);
1055
1056 #if NECP
1057 so_update_necp_policy(so, NULL, NULL);
1058 #endif /* NECP */
1059
1060 if (so->so_proto == NULL) {
1061 error = EINVAL;
1062 goto out;
1063 }
1064 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1065 error = EOPNOTSUPP;
1066 goto out;
1067 }
1068
1069 /*
1070 * If the listen request is made on a socket that is not fully
1071 * disconnected, or on a socket that has been marked as inactive,
1072 * reject the request now.
1073 */
1074 if ((so->so_state &
1075 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1076 (so->so_flags & SOF_DEFUNCT)) {
1077 error = EINVAL;
1078 if (so->so_flags & SOF_DEFUNCT) {
1079 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1080 "(%d)\n", __func__, proc_pid(p),
1081 proc_best_name(p),
1082 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1083 SOCK_DOM(so), SOCK_TYPE(so), error);
1084 }
1085 goto out;
1086 }
1087
1088 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1089 error = EPERM;
1090 goto out;
1091 }
1092
1093 error = sflt_listen(so);
1094 if (error == 0) {
1095 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1096 }
1097
1098 if (error) {
1099 if (error == EJUSTRETURN) {
1100 error = 0;
1101 }
1102 goto out;
1103 }
1104
1105 if (TAILQ_EMPTY(&so->so_comp)) {
1106 so->so_options |= SO_ACCEPTCONN;
1107 }
1108 /*
1109 * POSIX: The implementation may have an upper limit on the length of
1110 * the listen queue, either global or per accepting socket. If backlog
1111 * exceeds this limit, the length of the listen queue is set to the
1112 * limit.
1113 *
1114 * If listen() is called with a backlog argument value that is less
1115 * than 0, the function behaves as if it had been called with a backlog
1116 * argument value of 0.
1117 *
1118 * A backlog argument of 0 may allow the socket to accept connections,
1119 * in which case the length of the listen queue may be set to an
1120 * implementation-defined minimum value.
1121 */
1122 if (backlog <= 0 || backlog > somaxconn) {
1123 backlog = somaxconn;
1124 }
1125
1126 so->so_qlimit = backlog;
1127 out:
1128 socket_unlock(so, 1);
1129 return error;
1130 }
1131
1132 /*
1133 * The "accept list lock" protects the fields related to the listener queues
1134 * because we can unlock a socket to respect the lock ordering between
1135 * the listener socket and its client sockets. The lock ordering is to
1136 * acquire the client socket first, before the listener socket.
1137 *
1138 * The accept list lock serializes access to the following fields:
1139 * - of the listener socket:
1140 * - so_comp
1141 * - so_incomp
1142 * - so_qlen
1143 * - so_inqlen
1144 * - of client sockets that are in so_comp or so_incomp:
1145 * - so_head
1146 * - so_list
1147 *
1148 * As one can see, the accept list lock protects the consistency of the
1149 * linkage of the client sockets.
1150 *
1151 * Note that those fields may be read without holding the accept list lock
1152 * for a preflight provided the accept list lock is taken when committing
1153 * to take an action based on the result of the preflight. The preflight
1154 * saves the cost of doing the unlock/lock dance.
1155 */
1156 void
1157 so_acquire_accept_list(struct socket *head, struct socket *so)
1158 {
1159 lck_mtx_t *mutex_held;
1160
1161 if (head->so_proto->pr_getlock == NULL) {
1162 return;
1163 }
1164 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1165 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1166
1167 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1168 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1169 return;
1170 }
1171 if (so != NULL) {
1172 socket_unlock(so, 0);
1173 }
1174 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1175 so_accept_list_waits += 1;
1176 msleep((caddr_t)&head->so_incomp, mutex_held,
1177 PSOCK | PCATCH, __func__, NULL);
1178 }
1179 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1180 if (so != NULL) {
1181 socket_unlock(head, 0);
1182 socket_lock(so, 0);
1183 socket_lock(head, 0);
1184 }
1185 }
1186
1187 void
1188 so_release_accept_list(struct socket *head)
1189 {
1190 if (head->so_proto->pr_getlock != NULL) {
1191 lck_mtx_t *mutex_held;
1192
1193 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1194 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1195
1196 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1197 wakeup((caddr_t)&head->so_incomp);
1198 }
1199 }
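
/*
 * A minimal sketch of the intended acquire/release pattern (the real
 * callers, e.g. sofreelastref() and soclose_locked() below, follow it):
 *
 *	socket_lock(head, 1);
 *	so_acquire_accept_list(head, so);	// may drop and retake locks
 *	... examine or edit head->so_comp / head->so_incomp ...
 *	so_release_accept_list(head);
 *	socket_unlock(head, 1);
 */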
1200
1201 void
1202 sofreelastref(struct socket *so, int dealloc)
1203 {
1204 struct socket *head = so->so_head;
1205
1206 /* Assume socket is locked */
1207
1208 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1209 selthreadclear(&so->so_snd.sb_sel);
1210 selthreadclear(&so->so_rcv.sb_sel);
1211 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1212 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1213 so->so_event = sonullevent;
1214 return;
1215 }
1216 if (head != NULL) {
1217 /*
1218 * Need to lock the listener when the protocol has
1219 * per socket locks
1220 */
1221 if (head->so_proto->pr_getlock != NULL) {
1222 socket_lock(head, 1);
1223 so_acquire_accept_list(head, so);
1224 }
1225 if (so->so_state & SS_INCOMP) {
1226 so->so_state &= ~SS_INCOMP;
1227 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1228 head->so_incqlen--;
1229 head->so_qlen--;
1230 so->so_head = NULL;
1231
1232 if (head->so_proto->pr_getlock != NULL) {
1233 so_release_accept_list(head);
1234 socket_unlock(head, 1);
1235 }
1236 } else if (so->so_state & SS_COMP) {
1237 if (head->so_proto->pr_getlock != NULL) {
1238 so_release_accept_list(head);
1239 socket_unlock(head, 1);
1240 }
1241 /*
1242 * We must not decommission a socket that's
1243 * on the accept(2) queue. If we do, then
1244 * accept(2) may hang after select(2) indicated
1245 * that the listening socket was ready.
1246 */
1247 selthreadclear(&so->so_snd.sb_sel);
1248 selthreadclear(&so->so_rcv.sb_sel);
1249 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1250 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1251 so->so_event = sonullevent;
1252 return;
1253 } else {
1254 if (head->so_proto->pr_getlock != NULL) {
1255 so_release_accept_list(head);
1256 socket_unlock(head, 1);
1257 }
1258 printf("sofree: not queued\n");
1259 }
1260 }
1261 sowflush(so);
1262 sorflush(so);
1263
1264 #if FLOW_DIVERT
1265 if (so->so_flags & SOF_FLOW_DIVERT) {
1266 flow_divert_detach(so);
1267 }
1268 #endif /* FLOW_DIVERT */
1269
1270 /* 3932268: disable upcall */
1271 so->so_rcv.sb_flags &= ~SB_UPCALL;
1272 so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1273 so->so_event = sonullevent;
1274
1275 if (dealloc) {
1276 sodealloc(so);
1277 }
1278 }
1279
1280 void
1281 soclose_wait_locked(struct socket *so)
1282 {
1283 lck_mtx_t *mutex_held;
1284
1285 if (so->so_proto->pr_getlock != NULL) {
1286 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1287 } else {
1288 mutex_held = so->so_proto->pr_domain->dom_mtx;
1289 }
1290 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1291
1292 /*
1293 * Double check here and return if there's no outstanding upcall;
1294 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1295 */
1296 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1297 return;
1298 }
1299 so->so_rcv.sb_flags &= ~SB_UPCALL;
1300 so->so_snd.sb_flags &= ~SB_UPCALL;
1301 so->so_flags |= SOF_CLOSEWAIT;
1302
1303 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1304 "soclose_wait_locked", NULL);
1305 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1306 so->so_flags &= ~SOF_CLOSEWAIT;
1307 }
1308
1309 /*
1310 * Close a socket on last file table reference removal.
1311 * Initiate disconnect if connected.
1312 * Free socket when disconnect complete.
1313 */
1314 int
1315 soclose_locked(struct socket *so)
1316 {
1317 int error = 0;
1318 struct timespec ts;
1319
1320 if (so->so_usecount == 0) {
1321 panic("soclose: so=%p refcount=0\n", so);
1322 /* NOTREACHED */
1323 }
1324
1325 sflt_notify(so, sock_evt_closing, NULL);
1326
1327 if (so->so_upcallusecount) {
1328 soclose_wait_locked(so);
1329 }
1330
1331 #if CONTENT_FILTER
1332 /*
1333 * We have to wait until the content filters are done
1334 */
1335 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1336 cfil_sock_close_wait(so);
1337 cfil_sock_is_closed(so);
1338 cfil_sock_detach(so);
1339 }
1340 #endif /* CONTENT_FILTER */
1341
1342 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1343 soresume(current_proc(), so, 1);
1344 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1345 }
1346
1347 if ((so->so_options & SO_ACCEPTCONN)) {
1348 struct socket *sp, *sonext;
1349 int persocklock = 0;
1350 int incomp_overflow_only;
1351
1352 /*
1353 * We do not want new connections to be added
1354 * to the connection queues.
1355 */
1356 so->so_options &= ~SO_ACCEPTCONN;
1357
1358 /*
1359 * We can drop the lock on the listener once
1360 * we've acquired the incoming list
1361 */
1362 if (so->so_proto->pr_getlock != NULL) {
1363 persocklock = 1;
1364 so_acquire_accept_list(so, NULL);
1365 socket_unlock(so, 0);
1366 }
1367 again:
1368 incomp_overflow_only = 1;
1369
1370 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1371 /*
1372 * Radar 5350314
1373 * Skip sockets thrown away by tcpdropdropblreq;
1374 * they will get cleaned up by garbage collection.
1375 * Otherwise, remove the incomp socket from the queue
1376 * and let soabort trigger the appropriate cleanup.
1377 */
1378 if (sp->so_flags & SOF_OVERFLOW) {
1379 continue;
1380 }
1381
1382 if (persocklock != 0) {
1383 socket_lock(sp, 1);
1384 }
1385
1386 /*
1387 * Radar 27945981
1388 * The extra reference for the list ensures the
1389 * validity of the socket pointer when we perform the
1390 * unlock of the head above.
1391 */
1392 if (sp->so_state & SS_INCOMP) {
1393 sp->so_state &= ~SS_INCOMP;
1394 sp->so_head = NULL;
1395 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1396 so->so_incqlen--;
1397 so->so_qlen--;
1398
1399 (void) soabort(sp);
1400 } else {
1401 panic("%s sp %p in so_incomp but !SS_INCOMP",
1402 __func__, sp);
1403 }
1404
1405 if (persocklock != 0) {
1406 socket_unlock(sp, 1);
1407 }
1408 }
1409
1410 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1411 /* Dequeue from so_comp since sofree() won't do it */
1412 if (persocklock != 0) {
1413 socket_lock(sp, 1);
1414 }
1415
1416 if (sp->so_state & SS_COMP) {
1417 sp->so_state &= ~SS_COMP;
1418 sp->so_head = NULL;
1419 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1420 so->so_qlen--;
1421
1422 (void) soabort(sp);
1423 } else {
1424 panic("%s sp %p in so_comp but !SS_COMP",
1425 __func__, sp);
1426 }
1427
1428 if (persocklock) {
1429 socket_unlock(sp, 1);
1430 }
1431 }
1432
1433 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1434 #if (DEBUG || DEVELOPMENT)
1435 panic("%s head %p so_incomp not empty\n", __func__, so);
1436 #endif /* (DEBUG || DEVELOPMENT) */
1437
1438 goto again;
1439 }
1440
1441 if (!TAILQ_EMPTY(&so->so_comp)) {
1442 #if (DEBUG || DEVELOPMENT)
1443 panic("%s head %p so_comp not empty\n", __func__, so);
1444 #endif /* (DEBUG || DEVELOPMENT) */
1445
1446 goto again;
1447 }
1448
1449 if (persocklock) {
1450 socket_lock(so, 0);
1451 so_release_accept_list(so);
1452 }
1453 }
1454 if (so->so_pcb == NULL) {
1455 /* 3915887: mark the socket as ready for dealloc */
1456 so->so_flags |= SOF_PCBCLEARING;
1457 goto discard;
1458 }
1459 if (so->so_state & SS_ISCONNECTED) {
1460 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1461 error = sodisconnectlocked(so);
1462 if (error) {
1463 goto drop;
1464 }
1465 }
1466 if (so->so_options & SO_LINGER) {
1467 lck_mtx_t *mutex_held;
1468
1469 if ((so->so_state & SS_ISDISCONNECTING) &&
1470 (so->so_state & SS_NBIO)) {
1471 goto drop;
1472 }
1473 if (so->so_proto->pr_getlock != NULL) {
1474 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1475 } else {
1476 mutex_held = so->so_proto->pr_domain->dom_mtx;
1477 }
1478 while (so->so_state & SS_ISCONNECTED) {
1479 ts.tv_sec = (so->so_linger / 100);
1480 ts.tv_nsec = (so->so_linger % 100) *
1481 NSEC_PER_USEC * 1000 * 10;
1482 error = msleep((caddr_t)&so->so_timeo,
1483 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1484 if (error) {
1485 /*
1486 * It's OK when the timer fires;
1487 * don't report an error.
1488 */
1489 if (error == EWOULDBLOCK) {
1490 error = 0;
1491 }
1492 break;
1493 }
1494 }
1495 }
1496 }
1497 drop:
1498 if (so->so_usecount == 0) {
1499 panic("soclose: usecount is zero so=%p\n", so);
1500 /* NOTREACHED */
1501 }
1502 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1503 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1504 if (error == 0) {
1505 error = error2;
1506 }
1507 }
1508 if (so->so_usecount <= 0) {
1509 panic("soclose: usecount is zero so=%p\n", so);
1510 /* NOTREACHED */
1511 }
1512 discard:
1513 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1514 (so->so_state & SS_NOFDREF)) {
1515 panic("soclose: NOFDREF");
1516 /* NOTREACHED */
1517 }
1518 so->so_state |= SS_NOFDREF;
1519
1520 if ((so->so_flags & SOF_KNOTE) != 0) {
1521 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1522 }
1523
1524 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1525 evsofree(so);
1526
1527 VERIFY(so->so_usecount > 0);
1528 so->so_usecount--;
1529 sofree(so);
1530 return error;
1531 }
1532
1533 int
1534 soclose(struct socket *so)
1535 {
1536 int error = 0;
1537 socket_lock(so, 1);
1538
1539 if (so->so_retaincnt == 0) {
1540 error = soclose_locked(so);
1541 } else {
1542 /*
1543 * If the FD is going away but the socket is
1544 * retained in the kernel, remove its reference.
1545 */
1546 so->so_usecount--;
1547 if (so->so_usecount < 2) {
1548 panic("soclose: retaincnt non null and so=%p "
1549 "usecount=%d\n", so, so->so_usecount);
1550 }
1551 }
1552 socket_unlock(so, 1);
1553 return error;
1554 }
1555
1556 /*
1557 * Must be called at splnet...
1558 */
1559 /* Should already be locked */
1560 int
1561 soabort(struct socket *so)
1562 {
1563 int error;
1564
1565 #ifdef MORE_LOCKING_DEBUG
1566 lck_mtx_t *mutex_held;
1567
1568 if (so->so_proto->pr_getlock != NULL) {
1569 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1570 } else {
1571 mutex_held = so->so_proto->pr_domain->dom_mtx;
1572 }
1573 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1574 #endif
1575
1576 if ((so->so_flags & SOF_ABORTED) == 0) {
1577 so->so_flags |= SOF_ABORTED;
1578 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1579 if (error) {
1580 sofree(so);
1581 return error;
1582 }
1583 }
1584 return 0;
1585 }
1586
1587 int
1588 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1589 {
1590 int error;
1591
1592 if (dolock) {
1593 socket_lock(so, 1);
1594 }
1595
1596 so_update_last_owner_locked(so, PROC_NULL);
1597 so_update_policy(so);
1598 #if NECP
1599 so_update_necp_policy(so, NULL, NULL);
1600 #endif /* NECP */
1601
1602 if ((so->so_state & SS_NOFDREF) == 0) {
1603 panic("soaccept: !NOFDREF");
1604 }
1605 so->so_state &= ~SS_NOFDREF;
1606 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1607
1608 if (dolock) {
1609 socket_unlock(so, 1);
1610 }
1611 return error;
1612 }
1613
1614 int
1615 soaccept(struct socket *so, struct sockaddr **nam)
1616 {
1617 return soacceptlock(so, nam, 1);
1618 }
1619
1620 int
1621 soacceptfilter(struct socket *so, struct socket *head)
1622 {
1623 struct sockaddr *local = NULL, *remote = NULL;
1624 int error = 0;
1625
1626 /*
1627 * Hold the lock even if this socket has not been made visible
1628 * to the filter(s). For sockets with global locks, this protects
1629 * against the head or peer going away
1630 */
1631 socket_lock(so, 1);
1632 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1633 sogetaddr_locked(so, &local, 0) != 0) {
1634 so->so_state &= ~SS_NOFDREF;
1635 socket_unlock(so, 1);
1636 soclose(so);
1637 /* Out of resources; try it again next time */
1638 error = ECONNABORTED;
1639 goto done;
1640 }
1641
1642 error = sflt_accept(head, so, local, remote);
1643
1644 /*
1645 * If we get EJUSTRETURN from one of the filters, mark this socket
1646 * as inactive and return it anyway. This newly accepted socket
1647 * will be disconnected later before we hand it off to the caller.
1648 */
1649 if (error == EJUSTRETURN) {
1650 error = 0;
1651 (void) sosetdefunct(current_proc(), so,
1652 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1653 }
1654
1655 if (error != 0) {
1656 /*
1657 * This may seem like a duplication to the above error
1658 * handling part when we return ECONNABORTED, except
1659 * the following is done while holding the lock since
1660 * the socket has been exposed to the filter(s) earlier.
1661 */
1662 so->so_state &= ~SS_NOFDREF;
1663 socket_unlock(so, 1);
1664 soclose(so);
1665 /* Propagate socket filter's error code to the caller */
1666 } else {
1667 socket_unlock(so, 1);
1668 }
1669 done:
1670 /* Callee checks for NULL pointer */
1671 sock_freeaddr(remote);
1672 sock_freeaddr(local);
1673 return error;
1674 }
1675
1676 /*
1677 * Returns: 0 Success
1678 * EOPNOTSUPP Operation not supported on socket
1679 * EISCONN Socket is connected
1680 * <pru_connect>:EADDRNOTAVAIL Address not available.
1681 * <pru_connect>:EINVAL Invalid argument
1682 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1683 * <pru_connect>:EACCES Permission denied
1684 * <pru_connect>:EADDRINUSE Address in use
1685 * <pru_connect>:EAGAIN Resource unavailable, try again
1686 * <pru_connect>:EPERM Operation not permitted
1687 * <sf_connect_out>:??? [anything a filter writer might set]
1688 */
1689 int
1690 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1691 {
1692 int error;
1693 struct proc *p = current_proc();
1694
1695 if (dolock) {
1696 socket_lock(so, 1);
1697 }
1698
1699 so_update_last_owner_locked(so, p);
1700 so_update_policy(so);
1701
1702 #if NECP
1703 so_update_necp_policy(so, NULL, nam);
1704 #endif /* NECP */
1705
1706 /*
1707 * If this is a listening socket or if this is a previously-accepted
1708 * socket that has been marked as inactive, reject the connect request.
1709 */
1710 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1711 error = EOPNOTSUPP;
1712 if (so->so_flags & SOF_DEFUNCT) {
1713 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1714 "(%d)\n", __func__, proc_pid(p),
1715 proc_best_name(p),
1716 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1717 SOCK_DOM(so), SOCK_TYPE(so), error);
1718 }
1719 if (dolock) {
1720 socket_unlock(so, 1);
1721 }
1722 return error;
1723 }
1724
1725 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1726 if (dolock) {
1727 socket_unlock(so, 1);
1728 }
1729 return EPERM;
1730 }
1731
1732 /*
1733 * If protocol is connection-based, can only connect once.
1734 * Otherwise, if connected, try to disconnect first.
1735 * This allows user to disconnect by connecting to, e.g.,
1736 * a null address.
1737 */
1738 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1739 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1740 (error = sodisconnectlocked(so)))) {
1741 error = EISCONN;
1742 } else {
1743 /*
1744 * Run connect filter before calling protocol:
1745 * - non-blocking connect returns before completion;
1746 */
1747 error = sflt_connectout(so, nam);
1748 if (error != 0) {
1749 if (error == EJUSTRETURN) {
1750 error = 0;
1751 }
1752 } else {
1753 error = (*so->so_proto->pr_usrreqs->pru_connect)
1754 (so, nam, p);
1755 if (error != 0) {
1756 so->so_state &= ~SS_ISCONNECTING;
1757 }
1758 }
1759 }
1760 if (dolock) {
1761 socket_unlock(so, 1);
1762 }
1763 return error;
1764 }
1765
1766 int
1767 soconnect(struct socket *so, struct sockaddr *nam)
1768 {
1769 return soconnectlock(so, nam, 1);
1770 }
1771
1772 /*
1773 * Returns: 0 Success
1774 * <pru_connect2>:EINVAL[AF_UNIX]
1775 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1776 * <pru_connect2>:??? [other protocol families]
1777 *
1778 * Notes: <pru_connect2> is not supported by [TCP].
1779 */
1780 int
1781 soconnect2(struct socket *so1, struct socket *so2)
1782 {
1783 int error;
1784
1785 socket_lock(so1, 1);
1786 if (so2->so_proto->pr_lock) {
1787 socket_lock(so2, 1);
1788 }
1789
1790 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1791
1792 socket_unlock(so1, 1);
1793 if (so2->so_proto->pr_lock) {
1794 socket_unlock(so2, 1);
1795 }
1796 return error;
1797 }
1798
1799 int
1800 soconnectxlocked(struct socket *so, struct sockaddr *src,
1801 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1802 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1803 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1804 {
1805 int error;
1806
1807 so_update_last_owner_locked(so, p);
1808 so_update_policy(so);
1809
1810 /*
1811 * If this is a listening socket or if this is a previously-accepted
1812 * socket that has been marked as inactive, reject the connect request.
1813 */
1814 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1815 error = EOPNOTSUPP;
1816 if (so->so_flags & SOF_DEFUNCT) {
1817 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1818 "(%d)\n", __func__, proc_pid(p),
1819 proc_best_name(p),
1820 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1821 SOCK_DOM(so), SOCK_TYPE(so), error);
1822 }
1823 return error;
1824 }
1825
1826 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1827 return EPERM;
1828 }
1829
1830 /*
1831 * If protocol is connection-based, can only connect once
1832 * unless PR_MULTICONN is set. Otherwise, if connected,
1833 * try to disconnect first. This allows user to disconnect
1834 * by connecting to, e.g., a null address.
1835 */
1836 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1837 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1838 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1839 (error = sodisconnectlocked(so)) != 0)) {
1840 error = EISCONN;
1841 } else {
1842 /*
1843 * Run connect filter before calling protocol:
1844 * - non-blocking connect returns before completion;
1845 */
1846 error = sflt_connectout(so, dst);
1847 if (error != 0) {
1848 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1849 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1850 if (error == EJUSTRETURN) {
1851 error = 0;
1852 }
1853 } else {
1854 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1855 (so, src, dst, p, ifscope, aid, pcid,
1856 flags, arg, arglen, auio, bytes_written);
1857 if (error != 0) {
1858 so->so_state &= ~SS_ISCONNECTING;
1859 }
1860 }
1861 }
1862
1863 return error;
1864 }
1865
1866 int
1867 sodisconnectlocked(struct socket *so)
1868 {
1869 int error;
1870
1871 if ((so->so_state & SS_ISCONNECTED) == 0) {
1872 error = ENOTCONN;
1873 goto bad;
1874 }
1875 if (so->so_state & SS_ISDISCONNECTING) {
1876 error = EALREADY;
1877 goto bad;
1878 }
1879
1880 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1881 if (error == 0) {
1882 sflt_notify(so, sock_evt_disconnected, NULL);
1883 }
1884
1885 bad:
1886 return error;
1887 }
1888
1889 /* Locking version */
1890 int
1891 sodisconnect(struct socket *so)
1892 {
1893 int error;
1894
1895 socket_lock(so, 1);
1896 error = sodisconnectlocked(so);
1897 socket_unlock(so, 1);
1898 return error;
1899 }
1900
1901 int
1902 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1903 {
1904 int error;
1905
1906 /*
1907 * Call the protocol disconnectx handler; let it handle all
1908 * matters related to the connection state of this session.
1909 */
1910 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1911 if (error == 0) {
1912 /*
1913 * The event applies only for the session, not for
1914 * the disconnection of individual subflows.
1915 */
1916 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1917 sflt_notify(so, sock_evt_disconnected, NULL);
1918 }
1919 }
1920 return error;
1921 }
1922
1923 int
1924 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1925 {
1926 int error;
1927
1928 socket_lock(so, 1);
1929 error = sodisconnectxlocked(so, aid, cid);
1930 socket_unlock(so, 1);
1931 return error;
1932 }
1933
1934 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
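/*
 * SBLOCKWAIT() maps MSG_DONTWAIT onto the sblock() wait flag:
 * SBLOCKWAIT(MSG_DONTWAIT) == 0 (fail rather than sleep for the lock),
 * while SBLOCKWAIT(0) == SBL_WAIT (sleep until the lock is acquired).
 */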
1935
1936 /*
1937 * sosendcheck will lock the socket buffer if it isn't locked and
1938 * verify that there is space for the data being inserted.
1939 *
1940 * Returns: 0 Success
1941 * EPIPE
1942 * sblock:EWOULDBLOCK
1943 * sblock:EINTR
1944 * sbwait:EBADF
1945 * sbwait:EINTR
1946 * [so_error]:???
1947 */
1948 int
1949 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1950 int32_t clen, int32_t atomic, int flags, int *sblocked,
1951 struct mbuf *control)
1952 {
1953 int error = 0;
1954 int32_t space;
1955 int assumelock = 0;
1956
1957 restart:
1958 if (*sblocked == 0) {
1959 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1960 so->so_send_filt_thread != 0 &&
1961 so->so_send_filt_thread == current_thread()) {
1962 /*
1963 * We're being called recursively from a filter,
1964 * allow this to continue. Radar 4150520.
1965 * Don't set sblocked because we don't want
1966 * to perform an unlock later.
1967 */
1968 assumelock = 1;
1969 } else {
1970 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1971 if (error) {
1972 if (so->so_flags & SOF_DEFUNCT) {
1973 goto defunct;
1974 }
1975 return error;
1976 }
1977 *sblocked = 1;
1978 }
1979 }
1980
1981 /*
1982 * If a send attempt is made on a socket that has been marked
1983 * as inactive (disconnected), reject the request.
1984 */
1985 if (so->so_flags & SOF_DEFUNCT) {
1986 defunct:
1987 error = EPIPE;
1988 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1989 __func__, proc_selfpid(), proc_best_name(current_proc()),
1990 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1991 SOCK_DOM(so), SOCK_TYPE(so), error);
1992 return error;
1993 }
1994
1995 if (so->so_state & SS_CANTSENDMORE) {
1996 #if CONTENT_FILTER
1997 /*
1998 * Can re-inject data of half closed connections
1999 */
2000 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2001 so->so_snd.sb_cfil_thread == current_thread() &&
2002 cfil_sock_data_pending(&so->so_snd) != 0) {
2003 CFIL_LOG(LOG_INFO,
2004 "so %llx ignore SS_CANTSENDMORE",
2005 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2006 } else
2007 #endif /* CONTENT_FILTER */
2008 return EPIPE;
2009 }
2010 if (so->so_error) {
2011 error = so->so_error;
2012 so->so_error = 0;
2013 return error;
2014 }
2015
2016 if ((so->so_state & SS_ISCONNECTED) == 0) {
2017 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2018 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2019 (resid != 0 || clen == 0) &&
2020 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2021 return ENOTCONN;
2022 }
2023 } else if (addr == 0) {
2024 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2025 ENOTCONN : EDESTADDRREQ;
2026 }
2027 }
2028
2029 if (so->so_flags & SOF_ENABLE_MSGS) {
2030 space = msgq_sbspace(so, control);
2031 } else {
2032 space = sbspace(&so->so_snd);
2033 }
2034
2035 if (flags & MSG_OOB) {
2036 space += 1024;
2037 }
2038 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2039 clen > so->so_snd.sb_hiwat) {
2040 return EMSGSIZE;
2041 }
2042
2043 if ((space < resid + clen &&
2044 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2045 space < clen)) ||
2046 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2047 /*
2048 * don't block the connectx call when there's more data
2049 * than can be copied.
2050 */
2051 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2052 if (space == 0) {
2053 return EWOULDBLOCK;
2054 }
2055 if (space < (int32_t)so->so_snd.sb_lowat) {
2056 return 0;
2057 }
2058 }
2059 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2060 assumelock) {
2061 return EWOULDBLOCK;
2062 }
2063 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2064 *sblocked = 0;
2065 error = sbwait(&so->so_snd);
2066 if (error) {
2067 if (so->so_flags & SOF_DEFUNCT) {
2068 goto defunct;
2069 }
2070 return error;
2071 }
2072 goto restart;
2073 }
2074 return 0;
2075 }
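
/*
 * A minimal sketch of the sosendcheck() caller contract, mirroring what
 * sosend() below does: on success the send buffer is locked unless the
 * recursive-filter case applied, and *sblocked selects the unlock path.
 * The snippet is simplified; see sosend() for the real sequence.
 */
#if 0	/* example only -- not compiled */
	int sblocked = 0;
	int error;

	socket_lock(so, 1);
	error = sosendcheck(so, NULL, resid, 0, sosendallatonce(so),
	    flags, &sblocked, NULL);
	if (error == 0) {
		/* ... build the mbuf chain and call pru_send ... */
	}
	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);	/* also unlocks the socket */
	} else {
		socket_unlock(so, 1);
	}
#endif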
2076
2077 /*
2078 * Send on a socket.
2079 * If send must go all at once and message is larger than
2080 * send buffering, then hard error.
2081 * Lock against other senders.
2082 * If must go all at once and not enough room now, then
2083 * inform user that this would block and do nothing.
2084 * Otherwise, if nonblocking, send as much as possible.
2085 * The data to be sent is described by "uio" if nonzero,
2086 * otherwise by the mbuf chain "top" (which must be null
2087 * if uio is not). Data provided in mbuf chain must be small
2088 * enough to send all at once.
2089 *
2090 * Returns nonzero on error, timeout or signal; callers
2091 * must check for short counts if EINTR/ERESTART are returned.
2092 * Data and control buffers are freed on return.
2093 *
2094 * Returns: 0 Success
2095 * EOPNOTSUPP
2096 * EINVAL
2097 * ENOBUFS
2098 * uiomove:EFAULT
2099 * sosendcheck:EPIPE
2100 * sosendcheck:EWOULDBLOCK
2101 * sosendcheck:EINTR
2102 * sosendcheck:EBADF
2103 * sosendcheck:EINTR
2104 * sosendcheck:??? [value from so_error]
2105 * <pru_send>:ECONNRESET[TCP]
2106 * <pru_send>:EINVAL[TCP]
2107 * <pru_send>:ENOBUFS[TCP]
2108 * <pru_send>:EADDRINUSE[TCP]
2109 * <pru_send>:EADDRNOTAVAIL[TCP]
2110 * <pru_send>:EAFNOSUPPORT[TCP]
2111 * <pru_send>:EACCES[TCP]
2112 * <pru_send>:EAGAIN[TCP]
2113 * <pru_send>:EPERM[TCP]
2114 * <pru_send>:EMSGSIZE[TCP]
2115 * <pru_send>:EHOSTUNREACH[TCP]
2116 * <pru_send>:ENETUNREACH[TCP]
2117 * <pru_send>:ENETDOWN[TCP]
2118 * <pru_send>:ENOMEM[TCP]
2119 * <pru_send>:ENOBUFS[TCP]
2120 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2121 * <pru_send>:EINVAL[AF_UNIX]
2122 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2123 * <pru_send>:EPIPE[AF_UNIX]
2124 * <pru_send>:ENOTCONN[AF_UNIX]
2125 * <pru_send>:EISCONN[AF_UNIX]
2126 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2127 * <sf_data_out>:??? [whatever a filter author chooses]
2128 *
2129 * Notes: Other <pru_send> returns depend on the protocol family; all
2130 * <sf_data_out> returns depend on what the filter author causes
2131 * their filter to return.
2132 */
2133 int
2134 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2135 struct mbuf *top, struct mbuf *control, int flags)
2136 {
2137 struct mbuf **mp;
2138 struct mbuf *m, *freelist = NULL;
2139 user_ssize_t space, len, resid, orig_resid;
2140 int clen = 0, error, dontroute, mlen, sendflags;
2141 int atomic = sosendallatonce(so) || top;
2142 int sblocked = 0;
2143 struct proc *p = current_proc();
2144 struct mbuf *control_copy = NULL;
2145 uint16_t headroom = 0;
2146 boolean_t en_tracing = FALSE;
2147
2148 if (uio != NULL) {
2149 resid = uio_resid(uio);
2150 } else {
2151 resid = top->m_pkthdr.len;
2152 }
2153
2154 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2155 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2156
2157 socket_lock(so, 1);
2158
2159 /*
2160 * Trace only if tracing is enabled, this is a network (vs. unix)
2161 * socket, and it is non-loopback.
2162 */
2163 if (ENTR_SHOULDTRACE &&
2164 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2165 struct inpcb *inp = sotoinpcb(so);
2166 if (inp->inp_last_outifp != NULL &&
2167 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2168 en_tracing = TRUE;
2169 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2170 VM_KERNEL_ADDRPERM(so),
2171 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2172 (int64_t)resid);
2173 orig_resid = resid;
2174 }
2175 }
2176
2177 /*
2178 * Re-injection should not affect process accounting
2179 */
2180 if ((flags & MSG_SKIPCFIL) == 0) {
2181 so_update_last_owner_locked(so, p);
2182 so_update_policy(so);
2183
2184 #if NECP
2185 so_update_necp_policy(so, NULL, addr);
2186 #endif /* NECP */
2187 }
2188
2189 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2190 error = EOPNOTSUPP;
2191 goto out_locked;
2192 }
2193
2194 /*
2195 * In theory resid should be unsigned.
2196 * However, space must be signed, as it might be less than 0
2197 * if we over-committed, and we must use a signed comparison
2198 * of space and resid. On the other hand, a negative resid
2199 * causes us to loop sending 0-length segments to the protocol.
2200 *
2201 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2202 * But it will be used by sockets doing message delivery.
2203 *
2204 * Note: We limit resid to be a positive int value as we use
2205 * imin() to set bytes_to_copy -- radr://14558484
2206 */
2207 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2208 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2209 error = EINVAL;
2210 goto out_locked;
2211 }
2212
2213 dontroute = (flags & MSG_DONTROUTE) &&
2214 (so->so_options & SO_DONTROUTE) == 0 &&
2215 (so->so_proto->pr_flags & PR_ATOMIC);
2216 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2217
2218 if (control != NULL) {
2219 clen = control->m_len;
2220 }
2221
2222 if (soreserveheadroom != 0) {
2223 headroom = so->so_pktheadroom;
2224 }
2225
2226 do {
2227 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2228 &sblocked, control);
2229 if (error) {
2230 goto out_locked;
2231 }
2232
2233 mp = &top;
2234 if (so->so_flags & SOF_ENABLE_MSGS) {
2235 space = msgq_sbspace(so, control);
2236 } else {
2237 space = sbspace(&so->so_snd) - clen;
2238 }
2239 space += ((flags & MSG_OOB) ? 1024 : 0);
2240
2241 do {
2242 if (uio == NULL) {
2243 /*
2244 * Data is prepackaged in "top".
2245 */
2246 resid = 0;
2247 if (flags & MSG_EOR) {
2248 top->m_flags |= M_EOR;
2249 }
2250 } else {
2251 int chainlength;
2252 int bytes_to_copy;
2253 boolean_t jumbocl;
2254 boolean_t bigcl;
2255 int bytes_to_alloc;
2256
2257 bytes_to_copy = imin(resid, space);
2258
2259 bytes_to_alloc = bytes_to_copy;
2260 if (top == NULL) {
2261 bytes_to_alloc += headroom;
2262 }
2263
2264 if (sosendminchain > 0) {
2265 chainlength = 0;
2266 } else {
2267 chainlength = sosendmaxchain;
2268 }
2269
2270 /*
2271 * Use big 4 KB clusters when the outgoing interface
2272 * does not prefer 2 KB clusters
2273 */
2274 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2275 sosendbigcl_ignore_capab;
2276
2277 /*
2278 * Attempt to use larger than system page-size
2279 * clusters for large writes only if there is
2280 * a jumbo cluster pool and if the socket is
2281 * marked accordingly.
2282 */
2283 jumbocl = sosendjcl && njcl > 0 &&
2284 ((so->so_flags & SOF_MULTIPAGES) ||
2285 sosendjcl_ignore_capab) &&
2286 bigcl;
2287
2288 socket_unlock(so, 0);
2289
2290 do {
2291 int num_needed;
2292 int hdrs_needed = (top == NULL) ? 1 : 0;
2293
2294 /*
2295 * Try to maintain a local cache of the mbuf
2296 * clusters needed to complete this
2297 * write; the list is further limited to
2298 * the number that are currently needed
2299 * to fill the socket.  This mechanism
2300 * allows a large number of mbufs/
2301 * clusters to be grabbed under a single
2302 * mbuf lock... if we can't get any
2303 * clusters, then fall back to trying
2304 * for mbufs.  If we fail early (or
2305 * miscalculate the number needed), make
2306 * sure to release any clusters we
2307 * haven't yet consumed.
2308 */
2309 if (freelist == NULL &&
2310 bytes_to_alloc > MBIGCLBYTES &&
2311 jumbocl) {
2312 num_needed =
2313 bytes_to_alloc / M16KCLBYTES;
2314
2315 if ((bytes_to_alloc -
2316 (num_needed * M16KCLBYTES))
2317 >= MINCLSIZE) {
2318 num_needed++;
2319 }
2320
2321 freelist =
2322 m_getpackets_internal(
2323 (unsigned int *)&num_needed,
2324 hdrs_needed, M_WAIT, 0,
2325 M16KCLBYTES);
2326 /*
2327 * Fall back to 4K cluster size
2328 * if allocation failed
2329 */
2330 }
2331
2332 if (freelist == NULL &&
2333 bytes_to_alloc > MCLBYTES &&
2334 bigcl) {
2335 num_needed =
2336 bytes_to_alloc / MBIGCLBYTES;
2337
2338 if ((bytes_to_alloc -
2339 (num_needed * MBIGCLBYTES)) >=
2340 MINCLSIZE) {
2341 num_needed++;
2342 }
2343
2344 freelist =
2345 m_getpackets_internal(
2346 (unsigned int *)&num_needed,
2347 hdrs_needed, M_WAIT, 0,
2348 MBIGCLBYTES);
2349 /*
2350 * Fall back to cluster size
2351 * if allocation failed
2352 */
2353 }
2354
2355 /*
2356 * Allocate a cluster as we want to
2357 * avoid splitting the data into more
2358 * than one segment; using MINCLSIZE
2359 * would lead us to allocate two mbufs
2360 */
2361 if (soreserveheadroom != 0 &&
2362 freelist == NULL &&
2363 ((top == NULL &&
2364 bytes_to_alloc > _MHLEN) ||
2365 bytes_to_alloc > _MLEN)) {
2366 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2367 MCLBYTES;
2368 freelist =
2369 m_getpackets_internal(
2370 (unsigned int *)&num_needed,
2371 hdrs_needed, M_WAIT, 0,
2372 MCLBYTES);
2373 /*
2374 * Fall back to a single mbuf
2375 * if allocation failed
2376 */
2377 } else if (freelist == NULL &&
2378 bytes_to_alloc > MINCLSIZE) {
2379 num_needed =
2380 bytes_to_alloc / MCLBYTES;
2381
2382 if ((bytes_to_alloc -
2383 (num_needed * MCLBYTES)) >=
2384 MINCLSIZE) {
2385 num_needed++;
2386 }
2387
2388 freelist =
2389 m_getpackets_internal(
2390 (unsigned int *)&num_needed,
2391 hdrs_needed, M_WAIT, 0,
2392 MCLBYTES);
2393 /*
2394 * Fall back to a single mbuf
2395 * if allocation failed
2396 */
2397 }
2398 /*
2399 * For datagram protocols, leave
2400 * headroom for protocol headers
2401 * in the first cluster of the chain
2402 */
2403 if (freelist != NULL && atomic &&
2404 top == NULL && headroom > 0) {
2405 freelist->m_data += headroom;
2406 }
2407
2408 /*
2409 * Fall back to regular mbufs without
2410 * reserving the socket headroom
2411 */
2412 if (freelist == NULL) {
2413 if (top == NULL) {
2414 MGETHDR(freelist,
2415 M_WAIT, MT_DATA);
2416 } else {
2417 MGET(freelist,
2418 M_WAIT, MT_DATA);
2419 }
2420
2421 if (freelist == NULL) {
2422 error = ENOBUFS;
2423 socket_lock(so, 0);
2424 goto out_locked;
2425 }
2426 /*
2427 * For datagram protocols,
2428 * leave room for protocol
2429 * headers in first mbuf.
2430 */
2431 if (atomic && top == NULL &&
2432 bytes_to_copy < MHLEN) {
2433 MH_ALIGN(freelist,
2434 bytes_to_copy);
2435 }
2436 }
2437 m = freelist;
2438 freelist = m->m_next;
2439 m->m_next = NULL;
2440
2441 if ((m->m_flags & M_EXT)) {
2442 mlen = m->m_ext.ext_size -
2443 M_LEADINGSPACE(m);
2444 } else if ((m->m_flags & M_PKTHDR)) {
2445 mlen =
2446 MHLEN - M_LEADINGSPACE(m);
2447 } else {
2448 mlen = MLEN - M_LEADINGSPACE(m);
2449 }
2450 len = imin(mlen, bytes_to_copy);
2451
2452 chainlength += len;
2453
2454 space -= len;
2455
2456 error = uiomove(mtod(m, caddr_t),
2457 len, uio);
2458
2459 resid = uio_resid(uio);
2460
2461 m->m_len = len;
2462 *mp = m;
2463 top->m_pkthdr.len += len;
2464 if (error) {
2465 break;
2466 }
2467 mp = &m->m_next;
2468 if (resid <= 0) {
2469 if (flags & MSG_EOR) {
2470 top->m_flags |= M_EOR;
2471 }
2472 break;
2473 }
2474 bytes_to_copy = min(resid, space);
2475 } while (space > 0 &&
2476 (chainlength < sosendmaxchain || atomic ||
2477 resid < MINCLSIZE));
2478
2479 socket_lock(so, 0);
2480
2481 if (error) {
2482 goto out_locked;
2483 }
2484 }
2485
2486 if (dontroute) {
2487 so->so_options |= SO_DONTROUTE;
2488 }
2489
2490 /*
2491 * Compute flags here, for pru_send and NKEs
2492 *
2493 * If the user set MSG_EOF, the protocol
2494 * understands this flag, and there is nothing left
2495 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2496 */
2497 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2498 ((flags & MSG_EOF) &&
2499 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2500 (resid <= 0)) ? PRUS_EOF :
2501 /* If there is more to send set PRUS_MORETOCOME */
2502 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2503
2504 if ((flags & MSG_SKIPCFIL) == 0) {
2505 /*
2506 * Socket filter processing
2507 */
2508 error = sflt_data_out(so, addr, &top,
2509 &control, (sendflags & MSG_OOB) ?
2510 sock_data_filt_flag_oob : 0);
2511 if (error) {
2512 if (error == EJUSTRETURN) {
2513 error = 0;
2514 clen = 0;
2515 control = NULL;
2516 top = NULL;
2517 }
2518 goto out_locked;
2519 }
2520 #if CONTENT_FILTER
2521 /*
2522 * Content filter processing
2523 */
2524 error = cfil_sock_data_out(so, addr, top,
2525 control, sendflags);
2526 if (error) {
2527 if (error == EJUSTRETURN) {
2528 error = 0;
2529 clen = 0;
2530 control = NULL;
2531 top = NULL;
2532 }
2533 goto out_locked;
2534 }
2535 #endif /* CONTENT_FILTER */
2536 }
2537 if (so->so_flags & SOF_ENABLE_MSGS) {
2538 /*
2539 * Make a copy of control mbuf,
2540 * so that msg priority can be
2541 * passed to subsequent mbufs.
2542 */
2543 control_copy = m_dup(control, M_NOWAIT);
2544 }
2545 error = (*so->so_proto->pr_usrreqs->pru_send)
2546 (so, sendflags, top, addr, control, p);
2547
2548 if (dontroute) {
2549 so->so_options &= ~SO_DONTROUTE;
2550 }
2551
2552 clen = 0;
2553 control = control_copy;
2554 control_copy = NULL;
2555 top = NULL;
2556 mp = &top;
2557 if (error) {
2558 goto out_locked;
2559 }
2560 } while (resid && space > 0);
2561 } while (resid);
2562
2563 out_locked:
2564 if (sblocked) {
2565 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2566 } else {
2567 socket_unlock(so, 1);
2568 }
2569 if (top != NULL) {
2570 m_freem(top);
2571 }
2572 if (control != NULL) {
2573 m_freem(control);
2574 }
2575 if (freelist != NULL) {
2576 m_freem_list(freelist);
2577 }
2578 if (control_copy != NULL) {
2579 m_freem(control_copy);
2580 }
2581
2582 soclearfastopen(so);
2583
2584 if (en_tracing) {
2585 /* resid passed here is the bytes left in uio */
2586 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2587 VM_KERNEL_ADDRPERM(so),
2588 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2589 (int64_t)(orig_resid - resid));
2590 }
2591 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2592 so->so_snd.sb_cc, space, error);
2593
2594 return error;
2595 }
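
/*
 * A hedged user-level view of two of the error returns documented
 * above: an atomic (datagram) send larger than the high-water mark
 * fails with EMSGSIZE, while a non-blocking send on a full stream
 * socket fails with EAGAIN/EWOULDBLOCK and must be retried once the
 * socket is writable. Variable names are hypothetical.
 */
#if 0	/* example only -- not compiled */
	ssize_t n = send(s, buf, len, MSG_DONTWAIT);	/* s, buf, len: hypothetical */
	if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK)) {
		/* wait for writability (select/poll/kqueue) and retry */
	} else if (n >= 0 && (size_t)n < len) {
		/* short write on a stream socket: send the remainder later */
	}
#endif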
2596
2597 int
2598 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2599 {
2600 struct mbuf *m0 = NULL, *control_end = NULL;
2601
2602 socket_lock_assert_owned(so);
2603
2604 /*
2605 * top must point to the mbuf chain to be sent.
2606 * If control is not NULL, top must be a packet header.
2607 */
2608 VERIFY(top != NULL &&
2609 (control == NULL || top->m_flags & M_PKTHDR));
2610
2611 /*
2612 * If control is not passed in, see if we can get it
2613 * from top.
2614 */
2615 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2616 // Locate start of control if present and start of data
2617 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2618 if (m0->m_flags & M_PKTHDR) {
2619 top = m0;
2620 break;
2621 } else if (m0->m_type == MT_CONTROL) {
2622 if (control == NULL) {
2623 // Found start of control
2624 control = m0;
2625 }
2626 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2627 // Found end of control
2628 control_end = m0;
2629 }
2630 }
2631 }
2632 if (control_end != NULL) {
2633 control_end->m_next = NULL;
2634 }
2635 }
2636
2637 int error = (*so->so_proto->pr_usrreqs->pru_send)
2638 (so, sendflags, top, addr, control, current_proc());
2639
2640 return error;
2641 }
2642
2643 /*
2644 * Supports only connected sockets (no address) without ancillary data
2645 * (control mbuf), for atomic protocols
2646 */
2647 int
2648 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2649 {
2650 struct mbuf *m, *freelist = NULL;
2651 user_ssize_t len, resid;
2652 int error, dontroute, mlen;
2653 int atomic = sosendallatonce(so);
2654 int sblocked = 0;
2655 struct proc *p = current_proc();
2656 u_int uiofirst = 0;
2657 u_int uiolast = 0;
2658 struct mbuf *top = NULL;
2659 uint16_t headroom = 0;
2660 boolean_t bigcl;
2661
2662 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2663 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2664
2665 if (so->so_type != SOCK_DGRAM) {
2666 error = EINVAL;
2667 goto out;
2668 }
2669 if (atomic == 0) {
2670 error = EINVAL;
2671 goto out;
2672 }
2673 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2674 error = EPROTONOSUPPORT;
2675 goto out;
2676 }
2677 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2678 error = EINVAL;
2679 goto out;
2680 }
2681 resid = uio_array_resid(uioarray, uiocnt);
2682
2683 /*
2684 * In theory resid should be unsigned.
2685 * However, space must be signed, as it might be less than 0
2686 * if we over-committed, and we must use a signed comparison
2687 * of space and resid. On the other hand, a negative resid
2688 * causes us to loop sending 0-length segments to the protocol.
2689 *
2690 * Note: We limit resid to be a positive int value as we use
2691 * imin() to set bytes_to_copy -- radr://14558484
2692 */
2693 if (resid < 0 || resid > INT_MAX) {
2694 error = EINVAL;
2695 goto out;
2696 }
2697
2698 socket_lock(so, 1);
2699 so_update_last_owner_locked(so, p);
2700 so_update_policy(so);
2701
2702 #if NECP
2703 so_update_necp_policy(so, NULL, NULL);
2704 #endif /* NECP */
2705
2706 dontroute = (flags & MSG_DONTROUTE) &&
2707 (so->so_options & SO_DONTROUTE) == 0 &&
2708 (so->so_proto->pr_flags & PR_ATOMIC);
2709 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2710
2711 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2712 &sblocked, NULL);
2713 if (error) {
2714 goto release;
2715 }
2716
2717 /*
2718 * Use big 4 KB clusters when the outgoing interface does not prefer
2719 * 2 KB clusters
2720 */
2721 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2722
2723 if (soreserveheadroom != 0) {
2724 headroom = so->so_pktheadroom;
2725 }
2726
2727 do {
2728 int i;
2729 int num_needed = 0;
2730 int chainlength;
2731 size_t maxpktlen = 0;
2732 int bytes_to_alloc;
2733
2734 if (sosendminchain > 0) {
2735 chainlength = 0;
2736 } else {
2737 chainlength = sosendmaxchain;
2738 }
2739
2740 socket_unlock(so, 0);
2741
2742 /*
2743 * Find a set of uios that fits in a reasonable number
2744 * of mbuf packets
2745 */
2746 for (i = uiofirst; i < uiocnt; i++) {
2747 struct uio *auio = uioarray[i];
2748
2749 len = uio_resid(auio);
2750
2751 /* Do nothing for empty messages */
2752 if (len == 0) {
2753 continue;
2754 }
2755
2756 num_needed += 1;
2757 uiolast += 1;
2758
2759 if (len > maxpktlen) {
2760 maxpktlen = len;
2761 }
2762
2763 chainlength += len;
2764 if (chainlength > sosendmaxchain) {
2765 break;
2766 }
2767 }
2768 /*
2769 * Nothing left to send
2770 */
2771 if (num_needed == 0) {
2772 socket_lock(so, 0);
2773 break;
2774 }
2775 /*
2776 * Allocate a buffer large enough to include headroom space for
2777 * the network and link headers
2778 *
2779 */
2780 bytes_to_alloc = maxpktlen + headroom;
2781
2782 /*
2783 * Allocate a single contiguous buffer of the smallest available
2784 * size when possible
2785 */
2786 if (bytes_to_alloc > MCLBYTES &&
2787 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2788 freelist = m_getpackets_internal(
2789 (unsigned int *)&num_needed,
2790 num_needed, M_WAIT, 1,
2791 MBIGCLBYTES);
2792 } else if (bytes_to_alloc > _MHLEN &&
2793 bytes_to_alloc <= MCLBYTES) {
2794 freelist = m_getpackets_internal(
2795 (unsigned int *)&num_needed,
2796 num_needed, M_WAIT, 1,
2797 MCLBYTES);
2798 } else {
2799 freelist = m_allocpacket_internal(
2800 (unsigned int *)&num_needed,
2801 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2802 }
2803
2804 if (freelist == NULL) {
2805 socket_lock(so, 0);
2806 error = ENOMEM;
2807 goto release;
2808 }
2809 /*
2810 * Copy each uio of the set into its own mbuf packet
2811 */
2812 for (i = uiofirst, m = freelist;
2813 i < uiolast && m != NULL;
2814 i++) {
2815 int bytes_to_copy;
2816 struct mbuf *n;
2817 struct uio *auio = uioarray[i];
2818
2819 bytes_to_copy = uio_resid(auio);
2820
2821 /* Do nothing for empty messages */
2822 if (bytes_to_copy == 0) {
2823 continue;
2824 }
2825 /*
2826 * Leave headroom for protocol headers
2827 * in the first mbuf of the chain
2828 */
2829 m->m_data += headroom;
2830
2831 for (n = m; n != NULL; n = n->m_next) {
2832 if ((m->m_flags & M_EXT)) {
2833 mlen = m->m_ext.ext_size -
2834 M_LEADINGSPACE(m);
2835 } else if ((m->m_flags & M_PKTHDR)) {
2836 mlen =
2837 MHLEN - M_LEADINGSPACE(m);
2838 } else {
2839 mlen = MLEN - M_LEADINGSPACE(m);
2840 }
2841 len = imin(mlen, bytes_to_copy);
2842
2843 /*
2844 * Note: uiomove() decrements the iovec
2845 * length
2846 */
2847 error = uiomove(mtod(n, caddr_t),
2848 len, auio);
2849 if (error != 0) {
2850 break;
2851 }
2852 n->m_len = len;
2853 m->m_pkthdr.len += len;
2854
2855 VERIFY(m->m_pkthdr.len <= maxpktlen);
2856
2857 bytes_to_copy -= len;
2858 resid -= len;
2859 }
2860 if (m->m_pkthdr.len == 0) {
2861 printf(
2862 "%s:%d so %llx pkt %llx type %u len null\n",
2863 __func__, __LINE__,
2864 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2865 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2866 m->m_type);
2867 }
2868 if (error != 0) {
2869 break;
2870 }
2871 m = m->m_nextpkt;
2872 }
2873
2874 socket_lock(so, 0);
2875
2876 if (error) {
2877 goto release;
2878 }
2879 top = freelist;
2880 freelist = NULL;
2881
2882 if (dontroute) {
2883 so->so_options |= SO_DONTROUTE;
2884 }
2885
2886 if ((flags & MSG_SKIPCFIL) == 0) {
2887 struct mbuf **prevnextp = NULL;
2888
2889 for (i = uiofirst, m = top;
2890 i < uiolast && m != NULL;
2891 i++) {
2892 struct mbuf *nextpkt = m->m_nextpkt;
2893
2894 /*
2895 * Socket filter processing
2896 */
2897 error = sflt_data_out(so, NULL, &m,
2898 NULL, 0);
2899 if (error != 0 && error != EJUSTRETURN) {
2900 goto release;
2901 }
2902
2903 #if CONTENT_FILTER
2904 if (error == 0) {
2905 /*
2906 * Content filter processing
2907 */
2908 error = cfil_sock_data_out(so, NULL, m,
2909 NULL, 0);
2910 if (error != 0 && error != EJUSTRETURN) {
2911 goto release;
2912 }
2913 }
2914 #endif /* CONTENT_FILTER */
2915 /*
2916 * Remove packet from the list when
2917 * swallowed by a filter
2918 */
2919 if (error == EJUSTRETURN) {
2920 error = 0;
2921 if (prevnextp != NULL) {
2922 *prevnextp = nextpkt;
2923 } else {
2924 top = nextpkt;
2925 }
2926 }
2927
2928 m = nextpkt;
2929 if (m != NULL) {
2930 prevnextp = &m->m_nextpkt;
2931 }
2932 }
2933 }
2934 if (top != NULL) {
2935 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2936 (so, 0, top, NULL, NULL, p);
2937 }
2938
2939 if (dontroute) {
2940 so->so_options &= ~SO_DONTROUTE;
2941 }
2942
2943 top = NULL;
2944 uiofirst = uiolast;
2945 } while (resid > 0 && error == 0);
2946 release:
2947 if (sblocked) {
2948 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2949 } else {
2950 socket_unlock(so, 1);
2951 }
2952 out:
2953 if (top != NULL) {
2954 m_freem(top);
2955 }
2956 if (freelist != NULL) {
2957 m_freem_list(freelist);
2958 }
2959
2960 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2961 so->so_snd.sb_cc, 0, error);
2962
2963 return error;
2964 }
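
/*
 * A minimal sketch of a hypothetical in-kernel caller of sosend_list():
 * each uio in the array describes exactly one datagram, the socket must
 * already be connected, and the protocol must provide pru_send_list.
 */
#if 0	/* example only -- not compiled */
	uio_t uios[2];	/* pkt0/pkt1 and their lengths are hypothetical */
	int error;

	uios[0] = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
	uio_addiov(uios[0], (user_addr_t)(uintptr_t)pkt0, pkt0_len);
	uios[1] = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
	uio_addiov(uios[1], (user_addr_t)(uintptr_t)pkt1, pkt1_len);

	error = sosend_list(so, uios, 2, MSG_DONTWAIT);

	uio_free(uios[0]);
	uio_free(uios[1]);
#endif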
2965
2966 /*
2967 * May return ERESTART when a packet is dropped by the MAC policy check
2968 */
2969 static int
2970 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2971 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2972 {
2973 int error = 0;
2974 struct mbuf *m = *mp;
2975 struct mbuf *nextrecord = *nextrecordp;
2976
2977 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2978 #if CONFIG_MACF_SOCKET_SUBSET
2979 /*
2980 * Call the MAC framework for policy checking if we're in
2981 * the user process context and the socket isn't connected.
2982 */
2983 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2984 struct mbuf *m0 = m;
2985 /*
2986 * Dequeue this record (temporarily) from the receive
2987 * list since we're about to drop the socket's lock
2988 * where a new record may arrive and be appended to
2989 * the list. Upon MAC policy failure, the record
2990 * will be freed. Otherwise, we'll add it back to
2991 * the head of the list. We cannot rely on SB_LOCK
2992 * because append operation uses the socket's lock.
2993 */
2994 do {
2995 m->m_nextpkt = NULL;
2996 sbfree(&so->so_rcv, m);
2997 m = m->m_next;
2998 } while (m != NULL);
2999 m = m0;
3000 so->so_rcv.sb_mb = nextrecord;
3001 SB_EMPTY_FIXUP(&so->so_rcv);
3002 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
3003 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
3004 socket_unlock(so, 0);
3005
3006 if (mac_socket_check_received(proc_ucred(p), so,
3007 mtod(m, struct sockaddr *)) != 0) {
3008 /*
3009 * MAC policy failure; free this record and
3010 * process the next record (or block until
3011 * one is available). We have adjusted sb_cc
3012 * and sb_mbcnt above so there is no need to
3013 * call sbfree() again.
3014 */
3015 m_freem(m);
3016 /*
3017 * Clear SB_LOCK but don't unlock the socket.
3018 * Process the next record or wait for one.
3019 */
3020 socket_lock(so, 0);
3021 sbunlock(&so->so_rcv, TRUE); /* stay locked */
3022 error = ERESTART;
3023 goto done;
3024 }
3025 socket_lock(so, 0);
3026 /*
3027 * If the socket has been defunct'd, drop it.
3028 */
3029 if (so->so_flags & SOF_DEFUNCT) {
3030 m_freem(m);
3031 error = ENOTCONN;
3032 goto done;
3033 }
3034 /*
3035 * Re-adjust the socket receive list and re-enqueue
3036 * the record in front of any packets which may have
3037 * been appended while we dropped the lock.
3038 */
3039 for (m = m0; m->m_next != NULL; m = m->m_next) {
3040 sballoc(&so->so_rcv, m);
3041 }
3042 sballoc(&so->so_rcv, m);
3043 if (so->so_rcv.sb_mb == NULL) {
3044 so->so_rcv.sb_lastrecord = m0;
3045 so->so_rcv.sb_mbtail = m;
3046 }
3047 m = m0;
3048 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3049 so->so_rcv.sb_mb = m;
3050 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3051 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3052 }
3053 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3054 if (psa != NULL) {
3055 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3056 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3057 error = EWOULDBLOCK;
3058 goto done;
3059 }
3060 }
3061 if (flags & MSG_PEEK) {
3062 m = m->m_next;
3063 } else {
3064 sbfree(&so->so_rcv, m);
3065 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3066 panic("%s: about to create invalid socketbuf",
3067 __func__);
3068 /* NOTREACHED */
3069 }
3070 MFREE(m, so->so_rcv.sb_mb);
3071 m = so->so_rcv.sb_mb;
3072 if (m != NULL) {
3073 m->m_nextpkt = nextrecord;
3074 } else {
3075 so->so_rcv.sb_mb = nextrecord;
3076 SB_EMPTY_FIXUP(&so->so_rcv);
3077 }
3078 }
3079 done:
3080 *mp = m;
3081 *nextrecordp = nextrecord;
3082
3083 return error;
3084 }
3085
3086 /*
3087 * Process one or more MT_CONTROL mbufs present before any data mbufs
3088 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3089 * just copy the data; if !MSG_PEEK, we call into the protocol to
3090 * perform externalization.
3091 */
3092 static int
3093 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3094 struct mbuf **mp, struct mbuf **nextrecordp)
3095 {
3096 int error = 0;
3097 struct mbuf *cm = NULL, *cmn;
3098 struct mbuf **cme = &cm;
3099 struct sockbuf *sb_rcv = &so->so_rcv;
3100 struct mbuf **msgpcm = NULL;
3101 struct mbuf *m = *mp;
3102 struct mbuf *nextrecord = *nextrecordp;
3103 struct protosw *pr = so->so_proto;
3104
3105 /*
3106 * Externalizing the control messages would require us to
3107 * drop the socket's lock below. Once we re-acquire the
3108 * lock, the mbuf chain might change. In order to preserve
3109 * consistency, we unlink all control messages from the
3110 * first mbuf chain in one shot and link them separately
3111 * onto a different chain.
3112 */
3113 do {
3114 if (flags & MSG_PEEK) {
3115 if (controlp != NULL) {
3116 if (*controlp == NULL) {
3117 msgpcm = controlp;
3118 }
3119 *controlp = m_copy(m, 0, m->m_len);
3120
3121 /*
3122 * If we failed to allocate an mbuf,
3123 * release any previously allocated
3124 * mbufs for control data. Return
3125 * an error. Keep the mbufs in the
3126 * socket as this is using
3127 * MSG_PEEK flag.
3128 */
3129 if (*controlp == NULL) {
3130 m_freem(*msgpcm);
3131 error = ENOBUFS;
3132 goto done;
3133 }
3134 controlp = &(*controlp)->m_next;
3135 }
3136 m = m->m_next;
3137 } else {
3138 m->m_nextpkt = NULL;
3139 sbfree(sb_rcv, m);
3140 sb_rcv->sb_mb = m->m_next;
3141 m->m_next = NULL;
3142 *cme = m;
3143 cme = &(*cme)->m_next;
3144 m = sb_rcv->sb_mb;
3145 }
3146 } while (m != NULL && m->m_type == MT_CONTROL);
3147
3148 if (!(flags & MSG_PEEK)) {
3149 if (sb_rcv->sb_mb != NULL) {
3150 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3151 } else {
3152 sb_rcv->sb_mb = nextrecord;
3153 SB_EMPTY_FIXUP(sb_rcv);
3154 }
3155 if (nextrecord == NULL) {
3156 sb_rcv->sb_lastrecord = m;
3157 }
3158 }
3159
3160 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3161 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3162
3163 while (cm != NULL) {
3164 int cmsg_type;
3165
3166 cmn = cm->m_next;
3167 cm->m_next = NULL;
3168 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3169
3170 /*
3171 * Call the protocol to externalize SCM_RIGHTS message
3172 * and return the modified message to the caller upon
3173 * success. Otherwise, all other control messages are
3174 * returned unmodified to the caller. Note that we
3175 * only get into this loop if MSG_PEEK is not set.
3176 */
3177 if (pr->pr_domain->dom_externalize != NULL &&
3178 cmsg_type == SCM_RIGHTS) {
3179 /*
3180 * Release socket lock: see 3903171. This
3181 * would also allow more records to be appended
3182 * to the socket buffer. We still have SB_LOCK
3183 * set on it, so we can be sure that the head
3184 * of the mbuf chain won't change.
3185 */
3186 socket_unlock(so, 0);
3187 error = (*pr->pr_domain->dom_externalize)(cm);
3188 socket_lock(so, 0);
3189 } else {
3190 error = 0;
3191 }
3192
3193 if (controlp != NULL && error == 0) {
3194 *controlp = cm;
3195 controlp = &(*controlp)->m_next;
3196 } else {
3197 (void) m_free(cm);
3198 }
3199 cm = cmn;
3200 }
3201 /*
3202 * Update the value of nextrecord in case we received new
3203 * records when the socket was unlocked above for
3204 * externalizing SCM_RIGHTS.
3205 */
3206 if (m != NULL) {
3207 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3208 } else {
3209 nextrecord = sb_rcv->sb_mb;
3210 }
3211
3212 done:
3213 *mp = m;
3214 *nextrecordp = nextrecord;
3215
3216 return error;
3217 }
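
/*
 * A hedged user-level view of the SCM_RIGHTS externalization performed
 * above: after dom_externalize runs, the control message delivered by
 * recvmsg(2) carries descriptors valid in the receiving process.
 * Variable names are hypothetical.
 */
#if 0	/* example only -- not compiled */
	struct msghdr msg;
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };	/* buf: hypothetical */
	char cbuf[CMSG_SPACE(sizeof(int))];

	bzero(&msg, sizeof(msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	if (recvmsg(s, &msg, 0) >= 0) {
		struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
		if (cm != NULL && cm->cmsg_level == SOL_SOCKET &&
		    cm->cmsg_type == SCM_RIGHTS) {
			int fd;
			memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
			/* fd is now a live descriptor in this process */
		}
	}
#endif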
3218
3219 /*
3220 * Implement receive operations on a socket.
3221 * We depend on the way that records are added to the sockbuf
3222 * by sbappend*. In particular, each record (mbufs linked through m_next)
3223 * must begin with an address if the protocol so specifies,
3224 * followed by an optional mbuf or mbufs containing ancillary data,
3225 * and then zero or more mbufs of data.
3226 * In order to avoid blocking network interrupts for the entire time here,
3227 * we splx() while doing the actual copy to user space.
3228 * Although the sockbuf is locked, new data may still be appended,
3229 * and thus we must maintain consistency of the sockbuf during that time.
3230 *
3231 * The caller may receive the data as a single mbuf chain by supplying
3232 * an mbuf **mp0 for use in returning the chain. The uio is then used
3233 * only for the count in uio_resid.
3234 *
3235 * Returns: 0 Success
3236 * ENOBUFS
3237 * ENOTCONN
3238 * EWOULDBLOCK
3239 * uiomove:EFAULT
3240 * sblock:EWOULDBLOCK
3241 * sblock:EINTR
3242 * sbwait:EBADF
3243 * sbwait:EINTR
3244 * sodelayed_copy:EFAULT
3245 * <pru_rcvoob>:EINVAL[TCP]
3246 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3247 * <pru_rcvoob>:???
3248 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3249 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3250 * <pr_domain->dom_externalize>:???
3251 *
3252 * Notes: Additional return values from calls through <pru_rcvoob> and
3253 * <pr_domain->dom_externalize> depend on protocols other than
3254 * TCP or AF_UNIX, which are documented above.
3255 */
3256 int
3257 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3258 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3259 {
3260 struct mbuf *m, **mp, *ml = NULL;
3261 struct mbuf *nextrecord, *free_list;
3262 int flags, error, offset;
3263 user_ssize_t len;
3264 struct protosw *pr = so->so_proto;
3265 int moff, type = 0;
3266 user_ssize_t orig_resid = uio_resid(uio);
3267 user_ssize_t delayed_copy_len;
3268 int can_delay;
3269 int need_event;
3270 struct proc *p = current_proc();
3271 boolean_t en_tracing = FALSE;
3272
3273 /*
3274 * Sanity check on the length passed by caller as we are making 'int'
3275 * comparisons
3276 */
3277 if (orig_resid < 0 || orig_resid > INT_MAX) {
3278 return EINVAL;
3279 }
3280
3281 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3282 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3283 so->so_rcv.sb_hiwat);
3284
3285 socket_lock(so, 1);
3286 so_update_last_owner_locked(so, p);
3287 so_update_policy(so);
3288
3289 #ifdef MORE_LOCKING_DEBUG
3290 if (so->so_usecount == 1) {
3291 panic("%s: so=%x no other reference on socket\n", __func__, so);
3292 /* NOTREACHED */
3293 }
3294 #endif
3295 mp = mp0;
3296 if (psa != NULL) {
3297 *psa = NULL;
3298 }
3299 if (controlp != NULL) {
3300 *controlp = NULL;
3301 }
3302 if (flagsp != NULL) {
3303 flags = *flagsp & ~MSG_EOR;
3304 } else {
3305 flags = 0;
3306 }
3307
3308 /*
3309 * If a recv attempt is made on a previously-accepted socket
3310 * that has been marked as inactive (disconnected), reject
3311 * the request.
3312 */
3313 if (so->so_flags & SOF_DEFUNCT) {
3314 struct sockbuf *sb = &so->so_rcv;
3315
3316 error = ENOTCONN;
3317 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3318 __func__, proc_pid(p), proc_best_name(p),
3319 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3320 SOCK_DOM(so), SOCK_TYPE(so), error);
3321 /*
3322 * This socket should have been disconnected and flushed
3323 * prior to being returned from sodefunct(); there should
3324 * be no data on its receive list, so panic otherwise.
3325 */
3326 if (so->so_state & SS_DEFUNCT) {
3327 sb_empty_assert(sb, __func__);
3328 }
3329 socket_unlock(so, 1);
3330 return error;
3331 }
3332
3333 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3334 pr->pr_usrreqs->pru_preconnect) {
3335 /*
3336 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3337 * call write() right after this. *If* the app calls a read,
3338 * we do not want to block this read indefinitely. Thus,
3339 * we trigger a connect so that the session gets initiated.
3340 */
3341 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3342
3343 if (error) {
3344 socket_unlock(so, 1);
3345 return error;
3346 }
3347 }
3348
3349 if (ENTR_SHOULDTRACE &&
3350 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3351 /*
3352 * enable energy tracing for inet sockets that go over
3353 * non-loopback interfaces only.
3354 */
3355 struct inpcb *inp = sotoinpcb(so);
3356 if (inp->inp_last_outifp != NULL &&
3357 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3358 en_tracing = TRUE;
3359 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3360 VM_KERNEL_ADDRPERM(so),
3361 ((so->so_state & SS_NBIO) ?
3362 kEnTrFlagNonBlocking : 0),
3363 (int64_t)orig_resid);
3364 }
3365 }
3366
3367 /*
3368 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3369 * regardless of the flags argument. Here is the case where
3370 * out-of-band data is not inline.
3371 */
3372 if ((flags & MSG_OOB) ||
3373 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3374 (so->so_options & SO_OOBINLINE) == 0 &&
3375 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3376 m = m_get(M_WAIT, MT_DATA);
3377 if (m == NULL) {
3378 socket_unlock(so, 1);
3379 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3380 ENOBUFS, 0, 0, 0, 0);
3381 return ENOBUFS;
3382 }
3383 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3384 if (error) {
3385 goto bad;
3386 }
3387 socket_unlock(so, 0);
3388 do {
3389 error = uiomove(mtod(m, caddr_t),
3390 imin(uio_resid(uio), m->m_len), uio);
3391 m = m_free(m);
3392 } while (uio_resid(uio) && error == 0 && m != NULL);
3393 socket_lock(so, 0);
3394 bad:
3395 if (m != NULL) {
3396 m_freem(m);
3397 }
3398
3399 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3400 if (error == EWOULDBLOCK || error == EINVAL) {
3401 /*
3402 * Let's try to get normal data:
3403 * EWOULDBLOCK: out-of-band data not
3404 * received yet. EINVAL: out-of-band data
3405 * already read.
3406 */
3407 error = 0;
3408 goto nooob;
3409 } else if (error == 0 && flagsp != NULL) {
3410 *flagsp |= MSG_OOB;
3411 }
3412 }
3413 socket_unlock(so, 1);
3414 if (en_tracing) {
3415 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3416 VM_KERNEL_ADDRPERM(so), 0,
3417 (int64_t)(orig_resid - uio_resid(uio)));
3418 }
3419 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3420 0, 0, 0, 0);
3421
3422 return error;
3423 }
3424 nooob:
3425 if (mp != NULL) {
3426 *mp = NULL;
3427 }
3428
3429 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3430 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3431 }
3432
3433 free_list = NULL;
3434 delayed_copy_len = 0;
3435 restart:
3436 #ifdef MORE_LOCKING_DEBUG
3437 if (so->so_usecount <= 1) {
3438 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3439 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3440 }
3441 #endif
3442 /*
3443 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3444 * and if so just return to the caller. This could happen when
3445 * soreceive() is called by a socket upcall function during the
3446 * time the socket is freed. The socket buffer would have been
3447 * locked across the upcall, therefore we cannot put this thread
3448 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3449 * we may livelock), because the lock on the socket buffer will
3450 * only be released when the upcall routine returns to its caller.
3451 * Because the socket has been officially closed, there can be
3452 * no further read on it.
3453 *
3454 * A multipath subflow socket would have its SS_NOFDREF set by
3455 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3456 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3457 */
3458 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3459 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3460 socket_unlock(so, 1);
3461 return 0;
3462 }
3463
3464 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3465 if (error) {
3466 socket_unlock(so, 1);
3467 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3468 0, 0, 0, 0);
3469 if (en_tracing) {
3470 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3471 VM_KERNEL_ADDRPERM(so), 0,
3472 (int64_t)(orig_resid - uio_resid(uio)));
3473 }
3474 return error;
3475 }
3476
3477 m = so->so_rcv.sb_mb;
3478 /*
3479 * If we have less data than requested, block awaiting more
3480 * (subject to any timeout) if:
3481 * 1. the current count is less than the low water mark, or
3482 * 2. MSG_WAITALL is set, and it is possible to do the entire
3483 * receive operation at once if we block (resid <= hiwat).
3484 * 3. MSG_DONTWAIT is not set
3485 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3486 * we have to do the receive in sections, and thus risk returning
3487 * a short count if a timeout or signal occurs after we start.
3488 */
3489 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3490 so->so_rcv.sb_cc < uio_resid(uio)) &&
3491 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3492 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3493 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3494 /*
3495 * Panic if we notice inconsistencies in the socket's
3496 * receive list; both sb_mb and sb_cc should correctly
3497 * reflect the contents of the list, otherwise we may
3498 * end up with false positives during select() or poll()
3499 * which could put the application in a bad state.
3500 */
3501 SB_MB_CHECK(&so->so_rcv);
3502
3503 if (so->so_error) {
3504 if (m != NULL) {
3505 goto dontblock;
3506 }
3507 error = so->so_error;
3508 if ((flags & MSG_PEEK) == 0) {
3509 so->so_error = 0;
3510 }
3511 goto release;
3512 }
3513 if (so->so_state & SS_CANTRCVMORE) {
3514 #if CONTENT_FILTER
3515 /*
3516 * Deal with half closed connections
3517 */
3518 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3519 cfil_sock_data_pending(&so->so_rcv) != 0) {
3520 CFIL_LOG(LOG_INFO,
3521 "so %llx ignore SS_CANTRCVMORE",
3522 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3523 } else
3524 #endif /* CONTENT_FILTER */
3525 if (m != NULL) {
3526 goto dontblock;
3527 } else {
3528 goto release;
3529 }
3530 }
3531 for (; m != NULL; m = m->m_next) {
3532 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3533 m = so->so_rcv.sb_mb;
3534 goto dontblock;
3535 }
3536 }
3537 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3538 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3539 error = ENOTCONN;
3540 goto release;
3541 }
3542 if (uio_resid(uio) == 0) {
3543 goto release;
3544 }
3545
3546 if ((so->so_state & SS_NBIO) ||
3547 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3548 error = EWOULDBLOCK;
3549 goto release;
3550 }
3551 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3552 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3553 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3554 #if EVEN_MORE_LOCKING_DEBUG
3555 if (socket_debug) {
3556 printf("Waiting for socket data\n");
3557 }
3558 #endif
3559
3560 error = sbwait(&so->so_rcv);
3561 #if EVEN_MORE_LOCKING_DEBUG
3562 if (socket_debug) {
3563 printf("SORECEIVE - sbwait returned %d\n", error);
3564 }
3565 #endif
3566 if (so->so_usecount < 1) {
3567 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3568 __func__, so, so->so_usecount);
3569 /* NOTREACHED */
3570 }
3571 if (error) {
3572 socket_unlock(so, 1);
3573 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3574 0, 0, 0, 0);
3575 if (en_tracing) {
3576 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3577 VM_KERNEL_ADDRPERM(so), 0,
3578 (int64_t)(orig_resid - uio_resid(uio)));
3579 }
3580 return error;
3581 }
3582 goto restart;
3583 }
3584 dontblock:
3585 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3586 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3587 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3588 nextrecord = m->m_nextpkt;
3589
3590 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3591 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3592 mp0 == NULL);
3593 if (error == ERESTART) {
3594 goto restart;
3595 } else if (error != 0) {
3596 goto release;
3597 }
3598 orig_resid = 0;
3599 }
3600
3601 /*
3602 * Process one or more MT_CONTROL mbufs present before any data mbufs
3603 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3604 * just copy the data; if !MSG_PEEK, we call into the protocol to
3605 * perform externalization.
3606 */
3607 if (m != NULL && m->m_type == MT_CONTROL) {
3608 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3609 if (error != 0) {
3610 goto release;
3611 }
3612 orig_resid = 0;
3613 }
3614
3615 /*
3616 * If the socket is a TCP socket with message delivery
3617 * enabled, then create a control msg to deliver the
3618 * relative TCP sequence number for this data. Waiting
3619 * until this point will protect against failures to
3620 * allocate an mbuf for control msgs.
3621 */
3622 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3623 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3624 struct mbuf *seq_cm;
3625
3626 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3627 sizeof(uint32_t), SCM_SEQNUM, SOL_SOCKET);
3628 if (seq_cm == NULL) {
3629 /* unable to allocate a control mbuf */
3630 error = ENOBUFS;
3631 goto release;
3632 }
3633 *controlp = seq_cm;
3634 controlp = &seq_cm->m_next;
3635 }
3636
3637 if (m != NULL) {
3638 if (!(flags & MSG_PEEK)) {
3639 /*
3640 * We get here because m points to an mbuf following
3641 * any MT_SONAME or MT_CONTROL mbufs which have been
3642 * processed above. In any case, m should be pointing
3643 * to the head of the mbuf chain, and the nextrecord
3644 * should be either NULL or equal to m->m_nextpkt.
3645 * See comments above about SB_LOCK.
3646 */
3647 if (m != so->so_rcv.sb_mb ||
3648 m->m_nextpkt != nextrecord) {
3649 panic("%s: post-control !sync so=%p m=%p "
3650 "nextrecord=%p\n", __func__, so, m,
3651 nextrecord);
3652 /* NOTREACHED */
3653 }
3654 if (nextrecord == NULL) {
3655 so->so_rcv.sb_lastrecord = m;
3656 }
3657 }
3658 type = m->m_type;
3659 if (type == MT_OOBDATA) {
3660 flags |= MSG_OOB;
3661 }
3662 } else {
3663 if (!(flags & MSG_PEEK)) {
3664 SB_EMPTY_FIXUP(&so->so_rcv);
3665 }
3666 }
3667 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3668 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3669
3670 moff = 0;
3671 offset = 0;
3672
3673 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3674 can_delay = 1;
3675 } else {
3676 can_delay = 0;
3677 }
3678
3679 need_event = 0;
3680
3681 while (m != NULL &&
3682 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3683 if (m->m_type == MT_OOBDATA) {
3684 if (type != MT_OOBDATA) {
3685 break;
3686 }
3687 } else if (type == MT_OOBDATA) {
3688 break;
3689 }
3690 /*
3691 * Make sure to always set the MSG_OOB flag when getting
3692 * out-of-band data inline.
3693 */
3694 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3695 (so->so_options & SO_OOBINLINE) != 0 &&
3696 (so->so_state & SS_RCVATMARK) != 0) {
3697 flags |= MSG_OOB;
3698 }
3699 so->so_state &= ~SS_RCVATMARK;
3700 len = uio_resid(uio) - delayed_copy_len;
3701 if (so->so_oobmark && len > so->so_oobmark - offset) {
3702 len = so->so_oobmark - offset;
3703 }
3704 if (len > m->m_len - moff) {
3705 len = m->m_len - moff;
3706 }
3707 /*
3708 * If mp is set, just pass back the mbufs.
3709 * Otherwise copy them out via the uio, then free.
3710 * Sockbuf must be consistent here (sb_mb points to the current mbuf,
3711 * and the record's m_nextpkt points to the next record) when we drop priority;
3712 * we must note any additions to the sockbuf when we
3713 * block interrupts again.
3714 */
3715 if (mp == NULL) {
3716 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3717 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3718 if (can_delay && len == m->m_len) {
3719 /*
3720 * Only delay the copy if we're consuming the
3721 * mbuf and we're NOT in MSG_PEEK mode
3722 * and we have enough data to make it worthwhile
3723 * to drop and retake the lock... can_delay
3724 * reflects the state of the 2 latter
3725 * constraints; moff should always be zero
3726 * in these cases.
3727 */
3728 delayed_copy_len += len;
3729 } else {
3730 if (delayed_copy_len) {
3731 error = sodelayed_copy(so, uio,
3732 &free_list, &delayed_copy_len);
3733
3734 if (error) {
3735 goto release;
3736 }
3737 /*
3738 * We can only get here if MSG_PEEK is not
3739 * set; therefore, m should point at the
3740 * head of the rcv queue.  If it doesn't,
3741 * it means something drastically
3742 * changed while we were out from behind
3743 * the lock in sodelayed_copy, perhaps
3744 * a RST on the stream.  In any event,
3745 * the stream has been interrupted.  It's
3746 * probably best just to return whatever
3747 * data we've moved and let the caller
3748 * sort it out...
3749 */
3750 if (m != so->so_rcv.sb_mb) {
3751 break;
3752 }
3753 }
3754 socket_unlock(so, 0);
3755 error = uiomove(mtod(m, caddr_t) + moff,
3756 (int)len, uio);
3757 socket_lock(so, 0);
3758
3759 if (error) {
3760 goto release;
3761 }
3762 }
3763 } else {
3764 uio_setresid(uio, (uio_resid(uio) - len));
3765 }
3766 if (len == m->m_len - moff) {
3767 if (m->m_flags & M_EOR) {
3768 flags |= MSG_EOR;
3769 }
3770 if (flags & MSG_PEEK) {
3771 m = m->m_next;
3772 moff = 0;
3773 } else {
3774 nextrecord = m->m_nextpkt;
3775 sbfree(&so->so_rcv, m);
3776 m->m_nextpkt = NULL;
3777
3778 /*
3779 * If this packet is an unordered packet
3780 * (indicated by M_UNORDERED_DATA flag), remove
3781 * the additional bytes added to the
3782 * receive socket buffer size.
3783 */
3784 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3785 m->m_len &&
3786 (m->m_flags & M_UNORDERED_DATA) &&
3787 sbreserve(&so->so_rcv,
3788 so->so_rcv.sb_hiwat - m->m_len)) {
3789 if (so->so_msg_state->msg_uno_bytes >
3790 m->m_len) {
3791 so->so_msg_state->
3792 msg_uno_bytes -= m->m_len;
3793 } else {
3794 so->so_msg_state->
3795 msg_uno_bytes = 0;
3796 }
3797 m->m_flags &= ~M_UNORDERED_DATA;
3798 }
3799
3800 if (mp != NULL) {
3801 *mp = m;
3802 mp = &m->m_next;
3803 so->so_rcv.sb_mb = m = m->m_next;
3804 *mp = NULL;
3805 } else {
3806 if (free_list == NULL) {
3807 free_list = m;
3808 } else {
3809 ml->m_next = m;
3810 }
3811 ml = m;
3812 so->so_rcv.sb_mb = m = m->m_next;
3813 ml->m_next = NULL;
3814 }
3815 if (m != NULL) {
3816 m->m_nextpkt = nextrecord;
3817 if (nextrecord == NULL) {
3818 so->so_rcv.sb_lastrecord = m;
3819 }
3820 } else {
3821 so->so_rcv.sb_mb = nextrecord;
3822 SB_EMPTY_FIXUP(&so->so_rcv);
3823 }
3824 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3825 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3826 }
3827 } else {
3828 if (flags & MSG_PEEK) {
3829 moff += len;
3830 } else {
3831 if (mp != NULL) {
3832 int copy_flag;
3833
3834 if (flags & MSG_DONTWAIT) {
3835 copy_flag = M_DONTWAIT;
3836 } else {
3837 copy_flag = M_WAIT;
3838 }
3839 *mp = m_copym(m, 0, len, copy_flag);
3840 /*
3841 * Failed to allocate an mbuf?
3842 * Adjust uio_resid back, it was
3843 * adjusted down by len bytes which
3844 * we didn't copy over.
3845 */
3846 if (*mp == NULL) {
3847 uio_setresid(uio,
3848 (uio_resid(uio) + len));
3849 break;
3850 }
3851 }
3852 m->m_data += len;
3853 m->m_len -= len;
3854 so->so_rcv.sb_cc -= len;
3855 }
3856 }
3857 if (so->so_oobmark) {
3858 if ((flags & MSG_PEEK) == 0) {
3859 so->so_oobmark -= len;
3860 if (so->so_oobmark == 0) {
3861 so->so_state |= SS_RCVATMARK;
3862 /*
3863 * delay posting the actual event until
3864 * after any delayed copy processing
3865 * has finished
3866 */
3867 need_event = 1;
3868 break;
3869 }
3870 } else {
3871 offset += len;
3872 if (offset == so->so_oobmark) {
3873 break;
3874 }
3875 }
3876 }
3877 if (flags & MSG_EOR) {
3878 break;
3879 }
3880 /*
3881 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3882 * (for non-atomic socket), we must not quit until
3883 * "uio->uio_resid == 0" or an error termination.
3884 * If a signal/timeout occurs, return with a short
3885 * count but without error. Keep sockbuf locked
3886 * against other readers.
3887 */
3888 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3889 (uio_resid(uio) - delayed_copy_len) > 0 &&
3890 !sosendallatonce(so) && !nextrecord) {
3891 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3892 #if CONTENT_FILTER
3893 && cfil_sock_data_pending(&so->so_rcv) == 0
3894 #endif /* CONTENT_FILTER */
3895 )) {
3896 goto release;
3897 }
3898
3899 /*
3900 * Depending on the protocol (e.g. TCP), the following
3901 * might cause the socket lock to be dropped and later
3902 * be reacquired, and more data could have arrived and
3903 * have been appended to the receive socket buffer by
3904 * the time it returns. Therefore, we sleep in
3905 * sbwait() below if and only if the socket buffer is
3906 * empty, in order to avoid a false sleep.
3907 */
3908 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3909 (((struct inpcb *)so->so_pcb)->inp_state !=
3910 INPCB_STATE_DEAD)) {
3911 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3912 }
3913
3914 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3915 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3916
3917 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3918 error = 0;
3919 goto release;
3920 }
3921 /*
3922 * We have to wait until after we get back from the sbwait
3923 * to do the copy, because we will drop the lock if we
3924 * have enough data that has been delayed... by dropping
3925 * the lock we open up a window allowing the netisr
3926 * thread to process the incoming packets and to change
3927 * the state of this socket... we're issuing the sbwait
3928 * because the socket is empty and we're expecting the
3929 * netisr thread to wake us up when more packets arrive;
3930 * if we allow that processing to happen and then sbwait,
3931 * we could stall forever with packets sitting in the
3932 * socket if no further packets arrive from the remote
3933 * side.
3934 *
3935 * We want to copy before we've collected all the data
3936 * to satisfy this request, to allow the copy to overlap
3937 * the incoming packet processing on an MP system.
3938 */
3939 if (delayed_copy_len > sorecvmincopy &&
3940 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3941 error = sodelayed_copy(so, uio,
3942 &free_list, &delayed_copy_len);
3943
3944 if (error) {
3945 goto release;
3946 }
3947 }
3948 m = so->so_rcv.sb_mb;
3949 if (m != NULL) {
3950 nextrecord = m->m_nextpkt;
3951 }
3952 SB_MB_CHECK(&so->so_rcv);
3953 }
3954 }
3955 #ifdef MORE_LOCKING_DEBUG
3956 if (so->so_usecount <= 1) {
3957 panic("%s: after big while so=%p ref=%d on socket\n",
3958 __func__, so, so->so_usecount);
3959 /* NOTREACHED */
3960 }
3961 #endif
3962
3963 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3964 if (so->so_options & SO_DONTTRUNC) {
3965 flags |= MSG_RCVMORE;
3966 } else {
3967 flags |= MSG_TRUNC;
3968 if ((flags & MSG_PEEK) == 0) {
3969 (void) sbdroprecord(&so->so_rcv);
3970 }
3971 }
3972 }
3973
3974 /*
3975 * pru_rcvd below (for TCP) may cause more data to be received
3976 * if the socket lock is dropped prior to sending the ACK; some
3977 * legacy OpenTransport applications don't handle this well
3978 * (if they receive less data than requested while MSG_HAVEMORE
3979 * is set), and so we set the flag now based on what we know
3980 * prior to calling pru_rcvd.
3981 */
3982 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3983 flags |= MSG_HAVEMORE;
3984 }
3985
3986 if ((flags & MSG_PEEK) == 0) {
3987 if (m == NULL) {
3988 so->so_rcv.sb_mb = nextrecord;
3989 /*
3990 * First part is an inline SB_EMPTY_FIXUP(). Second
3991 * part makes sure sb_lastrecord is up-to-date if
3992 * there is still data in the socket buffer.
3993 */
3994 if (so->so_rcv.sb_mb == NULL) {
3995 so->so_rcv.sb_mbtail = NULL;
3996 so->so_rcv.sb_lastrecord = NULL;
3997 } else if (nextrecord->m_nextpkt == NULL) {
3998 so->so_rcv.sb_lastrecord = nextrecord;
3999 }
4000 SB_MB_CHECK(&so->so_rcv);
4001 }
4002 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4003 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4004 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4005 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4006 }
4007 }
4008
4009 if (delayed_copy_len) {
4010 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4011 if (error) {
4012 goto release;
4013 }
4014 }
4015 if (free_list != NULL) {
4016 m_freem_list(free_list);
4017 free_list = NULL;
4018 }
4019 if (need_event) {
4020 postevent(so, 0, EV_OOB);
4021 }
4022
4023 if (orig_resid == uio_resid(uio) && orig_resid &&
4024 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4025 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4026 goto restart;
4027 }
4028
4029 if (flagsp != NULL) {
4030 *flagsp |= flags;
4031 }
4032 release:
4033 #ifdef MORE_LOCKING_DEBUG
4034 if (so->so_usecount <= 1) {
4035 panic("%s: release so=%p ref=%d on socket\n", __func__,
4036 so, so->so_usecount);
4037 /* NOTREACHED */
4038 }
4039 #endif
4040 if (delayed_copy_len) {
4041 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4042 }
4043
4044 if (free_list != NULL) {
4045 m_freem_list(free_list);
4046 }
4047
4048 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4049
4050 if (en_tracing) {
4051 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4052 VM_KERNEL_ADDRPERM(so),
4053 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4054 (int64_t)(orig_resid - uio_resid(uio)));
4055 }
4056 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4057 so->so_rcv.sb_cc, 0, error);
4058
4059 return error;
4060 }
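
/*
 * Illustrative sketch, for reference only (userspace, not compiled here):
 * a minimal view of the MSG_WAITALL behavior described in the
 * MSG_WAITALL/MSG_WAITSTREAM comment inside soreceive() above.  With
 * MSG_WAITALL on a stream socket, soreceive() keeps the receive sockbuf
 * locked and loops until the request is satisfied; a signal, a receive
 * timeout, or connection teardown can still return a short count without
 * an error.  The descriptor `fd` is an assumption of the example.
 *
 *	#include <sys/socket.h>
 *
 *	static ssize_t
 *	read_exactly(int fd, void *buf, size_t len)
 *	{
 *		ssize_t n = recv(fd, buf, len, MSG_WAITALL);
 *		if (n < 0)
 *			return -1;	// so_error, ENOTCONN, EINTR, ...
 *		// n < len means a signal/timeout or EOF cut the wait short
 *		return n;
 *	}
 */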
4061
4062 /*
4063 * Returns: 0 Success
4064 * uiomove:EFAULT
4065 */
4066 static int
4067 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4068 user_ssize_t *resid)
4069 {
4070 int error = 0;
4071 struct mbuf *m;
4072
4073 m = *free_list;
4074
4075 socket_unlock(so, 0);
4076
4077 while (m != NULL && error == 0) {
4078 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4079 m = m->m_next;
4080 }
4081 m_freem_list(*free_list);
4082
4083 *free_list = NULL;
4084 *resid = 0;
4085
4086 socket_lock(so, 0);
4087
4088 return error;
4089 }
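
/*
 * Illustrative sketch, for reference only: the delayed-copy pattern that
 * soreceive() uses with the helper above.  Fully-consumed mbufs are
 * unlinked onto a local free_list and delayed_copy_len is accumulated;
 * once enough data is pending, the socket lock is dropped exactly once,
 * the whole chain is moved out with uiomove(), freed, and the lock is
 * retaken.  The trigger used by soreceive() above is:
 *
 *	if (delayed_copy_len > sorecvmincopy &&
 *	    delayed_copy_len > (so->so_rcv.sb_hiwat / 2)) {
 *		error = sodelayed_copy(so, uio, &free_list,
 *		    &delayed_copy_len);
 *	}
 */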
4090
4091 static int
4092 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4093 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4094 {
4095 #pragma unused(so)
4096 int error = 0;
4097 struct mbuf *ml, *m;
4098 int i = 0;
4099 struct uio *auio;
4100
4101 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4102 ml = ml->m_nextpkt, i++) {
4103 auio = msgarray[i].uio;
4104 for (m = ml; m != NULL; m = m->m_next) {
4105 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4106 if (error != 0) {
4107 goto out;
4108 }
4109 }
4110 }
4111 out:
4112 m_freem_list(*free_list);
4113
4114 *free_list = NULL;
4115 *resid = 0;
4116
4117 return error;
4118 }
4119
4120 int
4121 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4122 int *flagsp)
4123 {
4124 struct mbuf *m;
4125 struct mbuf *nextrecord;
4126 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4127 int error;
4128 user_ssize_t len, pktlen, delayed_copy_len = 0;
4129 struct protosw *pr = so->so_proto;
4130 user_ssize_t resid;
4131 struct proc *p = current_proc();
4132 struct uio *auio = NULL;
4133 int npkts = 0;
4134 int sblocked = 0;
4135 struct sockaddr **psa = NULL;
4136 struct mbuf **controlp = NULL;
4137 int can_delay;
4138 int flags;
4139 struct mbuf *free_others = NULL;
4140
4141 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4142 so, uiocnt,
4143 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4144
4145 /*
4146 * Sanity checks:
4147 * - Only supports don't-wait flags
4148 * - Only supports datagram sockets (could be extended to raw)
4149 * - Must be atomic
4150 * - Protocol must support packet chains
4151 * - The uio array must not be NULL (should we panic?)
4152 */
4153 if (flagsp != NULL) {
4154 flags = *flagsp;
4155 } else {
4156 flags = 0;
4157 }
4158 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4159 MSG_NBIO)) {
4160 printf("%s invalid flags 0x%x\n", __func__, flags);
4161 error = EINVAL;
4162 goto out;
4163 }
4164 if (so->so_type != SOCK_DGRAM) {
4165 error = EINVAL;
4166 goto out;
4167 }
4168 if (sosendallatonce(so) == 0) {
4169 error = EINVAL;
4170 goto out;
4171 }
4172 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4173 error = EPROTONOSUPPORT;
4174 goto out;
4175 }
4176 if (msgarray == NULL) {
4177 printf("%s uioarray is NULL\n", __func__);
4178 error = EINVAL;
4179 goto out;
4180 }
4181 if (uiocnt == 0) {
4182 printf("%s uiocnt is 0\n", __func__);
4183 error = EINVAL;
4184 goto out;
4185 }
4186 /*
4187 * Sanity check on the length passed by caller as we are making 'int'
4188 * comparisons
4189 */
4190 resid = recv_msg_array_resid(msgarray, uiocnt);
4191 if (resid < 0 || resid > INT_MAX) {
4192 error = EINVAL;
4193 goto out;
4194 }
4195
4196 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4197 can_delay = 1;
4198 } else {
4199 can_delay = 0;
4200 }
4201
4202 socket_lock(so, 1);
4203 so_update_last_owner_locked(so, p);
4204 so_update_policy(so);
4205
4206 #if NECP
4207 so_update_necp_policy(so, NULL, NULL);
4208 #endif /* NECP */
4209
4210 /*
4211 * If a recv attempt is made on a previously-accepted socket
4212 * that has been marked as inactive (disconnected), reject
4213 * the request.
4214 */
4215 if (so->so_flags & SOF_DEFUNCT) {
4216 struct sockbuf *sb = &so->so_rcv;
4217
4218 error = ENOTCONN;
4219 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4220 __func__, proc_pid(p), proc_best_name(p),
4221 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4222 SOCK_DOM(so), SOCK_TYPE(so), error);
4223 /*
4224 * This socket should have been disconnected and flushed
4225 * prior to being returned from sodefunct(); there should
4226 * be no data on its receive list, so panic otherwise.
4227 */
4228 if (so->so_state & SS_DEFUNCT) {
4229 sb_empty_assert(sb, __func__);
4230 }
4231 goto release;
4232 }
4233
4234 next:
4235 /*
4236 * The uio may be empty
4237 */
4238 if (npkts >= uiocnt) {
4239 error = 0;
4240 goto release;
4241 }
4242 restart:
4243 /*
4244 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4245 * and if so just return to the caller. This could happen when
4246 * soreceive() is called by a socket upcall function during the
4247 * time the socket is freed. The socket buffer would have been
4248 * locked across the upcall, therefore we cannot put this thread
4249 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4250 * we may livelock), because the lock on the socket buffer will
4251 * only be released when the upcall routine returns to its caller.
4252 * Because the socket has been officially closed, there can be
4253 * no further read on it.
4254 */
4255 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4256 (SS_NOFDREF | SS_CANTRCVMORE)) {
4257 error = 0;
4258 goto release;
4259 }
4260
4261 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4262 if (error) {
4263 goto release;
4264 }
4265 sblocked = 1;
4266
4267 m = so->so_rcv.sb_mb;
4268 /*
4269 * Block awaiting more datagrams if needed
4270 */
4271 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4272 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4273 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4274 /*
4275 * Panic if we notice inconsistencies in the socket's
4276 * receive list; both sb_mb and sb_cc should correctly
4277 * reflect the contents of the list, otherwise we may
4278 * end up with false positives during select() or poll()
4279 * which could put the application in a bad state.
4280 */
4281 SB_MB_CHECK(&so->so_rcv);
4282
4283 if (so->so_error) {
4284 error = so->so_error;
4285 if ((flags & MSG_PEEK) == 0) {
4286 so->so_error = 0;
4287 }
4288 goto release;
4289 }
4290 if (so->so_state & SS_CANTRCVMORE) {
4291 goto release;
4292 }
4293 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4294 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4295 error = ENOTCONN;
4296 goto release;
4297 }
4298 if ((so->so_state & SS_NBIO) ||
4299 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4300 error = EWOULDBLOCK;
4301 goto release;
4302 }
4303 /*
4304 * Do not block if we got some data
4305 */
4306 if (free_list != NULL) {
4307 error = 0;
4308 goto release;
4309 }
4310
4311 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4312 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4313
4314 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4315 sblocked = 0;
4316
4317 error = sbwait(&so->so_rcv);
4318 if (error) {
4319 goto release;
4320 }
4321 goto restart;
4322 }
4323
4324 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4325 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4326 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4327
4328 /*
4329 * Consume the current uio index as we have a datagram
4330 */
4331 auio = msgarray[npkts].uio;
4332 resid = uio_resid(auio);
4333 msgarray[npkts].which |= SOCK_MSG_DATA;
4334 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4335 &msgarray[npkts].psa : NULL;
4336 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4337 &msgarray[npkts].controlp : NULL;
4338 npkts += 1;
4339 nextrecord = m->m_nextpkt;
4340
4341 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4342 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4343 if (error == ERESTART) {
4344 goto restart;
4345 } else if (error != 0) {
4346 goto release;
4347 }
4348 }
4349
4350 if (m != NULL && m->m_type == MT_CONTROL) {
4351 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4352 if (error != 0) {
4353 goto release;
4354 }
4355 }
4356
4357 if (m->m_pkthdr.len == 0) {
4358 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4359 __func__, __LINE__,
4360 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4361 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4362 m->m_type);
4363 }
4364
4365 /*
4366 * Loop to copy the mbufs of the current record
4367 * Support zero length packets
4368 */
4369 ml = NULL;
4370 pktlen = 0;
4371 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4372 if (m->m_len == 0) {
4373 panic("%p m_len zero", m);
4374 }
4375 if (m->m_type == 0) {
4376 panic("%p m_type zero", m);
4377 }
4378 /*
4379 * Clip to the residual length
4380 */
4381 if (len > m->m_len) {
4382 len = m->m_len;
4383 }
4384 pktlen += len;
4385 /*
4386 * Copy the mbufs via the uio or delay the copy
4387 * The sockbuf must be consistent here (sb_mb points to the current
4388 * mbuf, nextrecord to the next record) when we drop priority;
4389 * we must note any additions to the sockbuf when we
4390 * block interrupts again.
4391 */
4392 if (len > 0 && can_delay == 0) {
4393 socket_unlock(so, 0);
4394 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4395 socket_lock(so, 0);
4396 if (error) {
4397 goto release;
4398 }
4399 } else {
4400 delayed_copy_len += len;
4401 }
4402
4403 if (len == m->m_len) {
4404 /*
4405 * m was entirely copied
4406 */
4407 sbfree(&so->so_rcv, m);
4408 nextrecord = m->m_nextpkt;
4409 m->m_nextpkt = NULL;
4410
4411 /*
4412 * Set the first packet to the head of the free list
4413 */
4414 if (free_list == NULL) {
4415 free_list = m;
4416 }
4417 /*
4418 * Link current packet to tail of free list
4419 */
4420 if (ml == NULL) {
4421 if (free_tail != NULL) {
4422 free_tail->m_nextpkt = m;
4423 }
4424 free_tail = m;
4425 }
4426 /*
4427 * Link current mbuf to last mbuf of current packet
4428 */
4429 if (ml != NULL) {
4430 ml->m_next = m;
4431 }
4432 ml = m;
4433
4434 /*
4435 * Move next buf to head of socket buffer
4436 */
4437 so->so_rcv.sb_mb = m = ml->m_next;
4438 ml->m_next = NULL;
4439
4440 if (m != NULL) {
4441 m->m_nextpkt = nextrecord;
4442 if (nextrecord == NULL) {
4443 so->so_rcv.sb_lastrecord = m;
4444 }
4445 } else {
4446 so->so_rcv.sb_mb = nextrecord;
4447 SB_EMPTY_FIXUP(&so->so_rcv);
4448 }
4449 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4450 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4451 } else {
4452 /*
4453 * Stop the loop on partial copy
4454 */
4455 break;
4456 }
4457 }
4458 #ifdef MORE_LOCKING_DEBUG
4459 if (so->so_usecount <= 1) {
4460 panic("%s: after big while so=%llx ref=%d on socket\n",
4461 __func__,
4462 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4463 /* NOTREACHED */
4464 }
4465 #endif
4466 /*
4467 * Tell the caller we made a partial copy
4468 */
4469 if (m != NULL) {
4470 if (so->so_options & SO_DONTTRUNC) {
4471 /*
4472 * Copy out the free list first, then the partial mbuf
4473 */
4474 socket_unlock(so, 0);
4475 if (delayed_copy_len) {
4476 error = sodelayed_copy_list(so, msgarray,
4477 uiocnt, &free_list, &delayed_copy_len);
4478 }
4479
4480 if (error == 0) {
4481 error = uiomove(mtod(m, caddr_t), (int)len,
4482 auio);
4483 }
4484 socket_lock(so, 0);
4485 if (error) {
4486 goto release;
4487 }
4488
4489 m->m_data += len;
4490 m->m_len -= len;
4491 so->so_rcv.sb_cc -= len;
4492 flags |= MSG_RCVMORE;
4493 } else {
4494 (void) sbdroprecord(&so->so_rcv);
4495 nextrecord = so->so_rcv.sb_mb;
4496 m = NULL;
4497 flags |= MSG_TRUNC;
4498 }
4499 }
4500
4501 if (m == NULL) {
4502 so->so_rcv.sb_mb = nextrecord;
4503 /*
4504 * First part is an inline SB_EMPTY_FIXUP(). Second
4505 * part makes sure sb_lastrecord is up-to-date if
4506 * there is still data in the socket buffer.
4507 */
4508 if (so->so_rcv.sb_mb == NULL) {
4509 so->so_rcv.sb_mbtail = NULL;
4510 so->so_rcv.sb_lastrecord = NULL;
4511 } else if (nextrecord->m_nextpkt == NULL) {
4512 so->so_rcv.sb_lastrecord = nextrecord;
4513 }
4514 SB_MB_CHECK(&so->so_rcv);
4515 }
4516 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4517 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4518
4519 /*
4520 * We can continue to the next packet as long as:
4521 * - We haven't exhausted the uio array
4522 * - There was no error
4523 * - A packet was not truncated
4524 * - We can still receive more data
4525 */
4526 if (npkts < uiocnt && error == 0 &&
4527 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4528 (so->so_state & SS_CANTRCVMORE) == 0) {
4529 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4530 sblocked = 0;
4531
4532 goto next;
4533 }
4534 if (flagsp != NULL) {
4535 *flagsp |= flags;
4536 }
4537
4538 release:
4539 /*
4540 * pru_rcvd may cause more data to be received if the socket lock
4541 * is dropped, so we set MSG_HAVEMORE now based on what we know.
4542 * That way the caller won't be surprised if it receives less data
4543 * than requested.
4544 */
4545 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4546 flags |= MSG_HAVEMORE;
4547 }
4548
4549 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4550 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4551 }
4552
4553 if (sblocked) {
4554 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4555 } else {
4556 socket_unlock(so, 1);
4557 }
4558
4559 if (delayed_copy_len) {
4560 error = sodelayed_copy_list(so, msgarray, uiocnt,
4561 &free_list, &delayed_copy_len);
4562 }
4563 out:
4564 /*
4565 * Amortize the cost of freeing the mbufs
4566 */
4567 if (free_list != NULL) {
4568 m_freem_list(free_list);
4569 }
4570 if (free_others != NULL) {
4571 m_freem_list(free_others);
4572 }
4573
4574 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4575 0, 0, 0, 0);
4576 return error;
4577 }
4578
4579 static int
4580 so_statistics_event_to_nstat_event(int64_t *input_options,
4581 uint64_t *nstat_event)
4582 {
4583 int error = 0;
4584 switch (*input_options) {
4585 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4586 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4587 break;
4588 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4589 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4590 break;
4591 #if (DEBUG || DEVELOPMENT)
4592 case SO_STATISTICS_EVENT_RESERVED_1:
4593 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4594 break;
4595 case SO_STATISTICS_EVENT_RESERVED_2:
4596 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4597 break;
4598 #endif /* (DEBUG || DEVELOPMENT) */
4599 default:
4600 error = EINVAL;
4601 break;
4602 }
4603 return error;
4604 }
4605
4606 /*
4607 * Returns: 0 Success
4608 * EINVAL
4609 * ENOTCONN
4610 * <pru_shutdown>:EINVAL
4611 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4612 * <pru_shutdown>:ENOBUFS[TCP]
4613 * <pru_shutdown>:EMSGSIZE[TCP]
4614 * <pru_shutdown>:EHOSTUNREACH[TCP]
4615 * <pru_shutdown>:ENETUNREACH[TCP]
4616 * <pru_shutdown>:ENETDOWN[TCP]
4617 * <pru_shutdown>:ENOMEM[TCP]
4618 * <pru_shutdown>:EACCES[TCP]
4619 * <pru_shutdown>:EMSGSIZE[TCP]
4620 * <pru_shutdown>:ENOBUFS[TCP]
4621 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4622 * <pru_shutdown>:??? [other protocol families]
4623 */
4624 int
4625 soshutdown(struct socket *so, int how)
4626 {
4627 int error;
4628
4629 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4630
4631 switch (how) {
4632 case SHUT_RD:
4633 case SHUT_WR:
4634 case SHUT_RDWR:
4635 socket_lock(so, 1);
4636 if ((so->so_state &
4637 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4638 error = ENOTCONN;
4639 } else {
4640 error = soshutdownlock(so, how);
4641 }
4642 socket_unlock(so, 1);
4643 break;
4644 default:
4645 error = EINVAL;
4646 break;
4647 }
4648
4649 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4650
4651 return error;
4652 }
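
/*
 * Illustrative sketch, for reference only (userspace, not compiled here):
 * the `how` values validated above correspond to shutdown(2).  Any other
 * value yields EINVAL, and shutting down a socket that is not connected,
 * connecting, or disconnecting yields ENOTCONN, as implemented in
 * soshutdown() above.  The descriptor `fd` is an assumption.
 *
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	// Stop sending but keep reading until the peer closes:
 *	if (shutdown(fd, SHUT_WR) == -1)
 *		perror("shutdown");	// e.g. ENOTCONN or EINVAL
 */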
4653
4654 int
4655 soshutdownlock_final(struct socket *so, int how)
4656 {
4657 struct protosw *pr = so->so_proto;
4658 int error = 0;
4659
4660 sflt_notify(so, sock_evt_shutdown, &how);
4661
4662 if (how != SHUT_WR) {
4663 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4664 /* read already shut down */
4665 error = ENOTCONN;
4666 goto done;
4667 }
4668 sorflush(so);
4669 postevent(so, 0, EV_RCLOSED);
4670 }
4671 if (how != SHUT_RD) {
4672 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4673 /* write already shut down */
4674 error = ENOTCONN;
4675 goto done;
4676 }
4677 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4678 postevent(so, 0, EV_WCLOSED);
4679 }
4680 done:
4681 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4682 return error;
4683 }
4684
4685 int
4686 soshutdownlock(struct socket *so, int how)
4687 {
4688 int error = 0;
4689
4690 #if CONTENT_FILTER
4691 /*
4692 * A content filter may delay the actual shutdown until it
4693 * has processed the pending data
4694 */
4695 if (so->so_flags & SOF_CONTENT_FILTER) {
4696 error = cfil_sock_shutdown(so, &how);
4697 if (error == EJUSTRETURN) {
4698 error = 0;
4699 goto done;
4700 } else if (error != 0) {
4701 goto done;
4702 }
4703 }
4704 #endif /* CONTENT_FILTER */
4705
4706 error = soshutdownlock_final(so, how);
4707
4708 done:
4709 return error;
4710 }
4711
4712 void
4713 sowflush(struct socket *so)
4714 {
4715 struct sockbuf *sb = &so->so_snd;
4716
4717 /*
4718 * Obtain lock on the socket buffer (SB_LOCK). This is required
4719 * to prevent the socket buffer from being unexpectedly altered
4720 * while it is used by another thread in socket send/receive.
4721 *
4722 * sblock() must not fail here, hence the assertion.
4723 */
4724 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4725 VERIFY(sb->sb_flags & SB_LOCK);
4726
4727 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4728 sb->sb_flags |= SB_DROP;
4729 sb->sb_upcall = NULL;
4730 sb->sb_upcallarg = NULL;
4731
4732 sbunlock(sb, TRUE); /* keep socket locked */
4733
4734 selthreadclear(&sb->sb_sel);
4735 sbrelease(sb);
4736 }
4737
4738 void
4739 sorflush(struct socket *so)
4740 {
4741 struct sockbuf *sb = &so->so_rcv;
4742 struct protosw *pr = so->so_proto;
4743 struct sockbuf asb;
4744 #ifdef notyet
4745 lck_mtx_t *mutex_held;
4746 /*
4747 * XXX: This code is currently commented out, because we may get here
4748 * as part of sofreelastref(), and at that time, pr_getlock() may no
4749 * longer be able to return us the lock; this will be fixed in future.
4750 */
4751 if (so->so_proto->pr_getlock != NULL) {
4752 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4753 } else {
4754 mutex_held = so->so_proto->pr_domain->dom_mtx;
4755 }
4756
4757 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4758 #endif /* notyet */
4759
4760 sflt_notify(so, sock_evt_flush_read, NULL);
4761
4762 socantrcvmore(so);
4763
4764 /*
4765 * Obtain lock on the socket buffer (SB_LOCK). This is required
4766 * to prevent the socket buffer from being unexpectedly altered
4767 * while it is used by another thread in socket send/receive.
4768 *
4769 * sblock() must not fail here, hence the assertion.
4770 */
4771 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4772 VERIFY(sb->sb_flags & SB_LOCK);
4773
4774 /*
4775 * Copy only the relevant fields from "sb" to "asb" which we
4776 * need for sbrelease() to function. In particular, skip
4777 * sb_sel as it contains the wait queue linkage, which would
4778 * wreak havoc if we were to issue selthreadclear() on "asb".
4779 * Make sure to not carry over SB_LOCK in "asb", as we need
4780 * to acquire it later as part of sbrelease().
4781 */
4782 bzero(&asb, sizeof(asb));
4783 asb.sb_cc = sb->sb_cc;
4784 asb.sb_hiwat = sb->sb_hiwat;
4785 asb.sb_mbcnt = sb->sb_mbcnt;
4786 asb.sb_mbmax = sb->sb_mbmax;
4787 asb.sb_ctl = sb->sb_ctl;
4788 asb.sb_lowat = sb->sb_lowat;
4789 asb.sb_mb = sb->sb_mb;
4790 asb.sb_mbtail = sb->sb_mbtail;
4791 asb.sb_lastrecord = sb->sb_lastrecord;
4792 asb.sb_so = sb->sb_so;
4793 asb.sb_flags = sb->sb_flags;
4794 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4795 asb.sb_flags |= SB_DROP;
4796
4797 /*
4798 * Ideally we'd bzero() these and preserve the ones we need;
4799 * but to do that we'd need to shuffle things around in the
4800 * sockbuf, and we can't do it now because there are KEXTS
4801 * that are directly referring to the socket structure.
4802 *
4803 * Setting SB_DROP acts as a barrier to prevent further appends.
4804 * Clearing SB_SEL is done for selthreadclear() below.
4805 */
4806 sb->sb_cc = 0;
4807 sb->sb_hiwat = 0;
4808 sb->sb_mbcnt = 0;
4809 sb->sb_mbmax = 0;
4810 sb->sb_ctl = 0;
4811 sb->sb_lowat = 0;
4812 sb->sb_mb = NULL;
4813 sb->sb_mbtail = NULL;
4814 sb->sb_lastrecord = NULL;
4815 sb->sb_timeo.tv_sec = 0;
4816 sb->sb_timeo.tv_usec = 0;
4817 sb->sb_upcall = NULL;
4818 sb->sb_upcallarg = NULL;
4819 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4820 sb->sb_flags |= SB_DROP;
4821
4822 sbunlock(sb, TRUE); /* keep socket locked */
4823
4824 /*
4825 * Note that selthreadclear() is called on the original "sb" and
4826 * not the local "asb" because of the way wait queue linkage is
4827 * implemented. Given that selwakeup() may be triggered, SB_SEL
4828 * should no longer be set (cleared above.)
4829 */
4830 selthreadclear(&sb->sb_sel);
4831
4832 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4833 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4834 }
4835
4836 sbrelease(&asb);
4837 }
4838
4839 /*
4840 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4841 * an additional variant to handle the case where the option value needs
4842 * to be some kind of integer, but not a specific size.
4843 * In addition to their use here, these functions are also called by the
4844 * protocol-level pr_ctloutput() routines.
4845 *
4846 * Returns: 0 Success
4847 * EINVAL
4848 * copyin:EFAULT
4849 */
4850 int
4851 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4852 {
4853 size_t valsize;
4854
4855 /*
4856 * If the user gives us more than we wanted, we ignore it,
4857 * but if we don't get the minimum length the caller
4858 * wants, we return EINVAL. On success, sopt->sopt_valsize
4859 * is set to however much we actually retrieved.
4860 */
4861 if ((valsize = sopt->sopt_valsize) < minlen) {
4862 return EINVAL;
4863 }
4864 if (valsize > len) {
4865 sopt->sopt_valsize = valsize = len;
4866 }
4867
4868 if (sopt->sopt_p != kernproc) {
4869 return copyin(sopt->sopt_val, buf, valsize);
4870 }
4871
4872 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4873 return 0;
4874 }
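
/*
 * Illustrative sketch, for reference only (hypothetical, not compiled):
 * how a protocol-level pr_ctloutput() SET handler typically consumes an
 * integer option with sooptcopyin(), per the comment above.  EXAMPLE_OPT
 * and pcb->example_flag are placeholders, not real kernel symbols.
 *
 *	case EXAMPLE_OPT: {
 *		int optval;
 *
 *		error = sooptcopyin(sopt, &optval, sizeof(optval),
 *		    sizeof(optval));
 *		if (error != 0)
 *			break;
 *		pcb->example_flag = (optval != 0);
 *		break;
 *	}
 */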
4875
4876 /*
4877 * sooptcopyin_timeval
4878 * Copy in a timeval value into tv_p, taking into account whether
4879 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4880 * code here so that we can verify the 64-bit tv_sec value before we lose
4881 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4882 */
4883 static int
4884 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4885 {
4886 int error;
4887
4888 if (proc_is64bit(sopt->sopt_p)) {
4889 struct user64_timeval tv64;
4890
4891 if (sopt->sopt_valsize < sizeof(tv64)) {
4892 return EINVAL;
4893 }
4894
4895 sopt->sopt_valsize = sizeof(tv64);
4896 if (sopt->sopt_p != kernproc) {
4897 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4898 if (error != 0) {
4899 return error;
4900 }
4901 } else {
4902 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4903 sizeof(tv64));
4904 }
4905 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4906 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4907 return EDOM;
4908 }
4909
4910 tv_p->tv_sec = tv64.tv_sec;
4911 tv_p->tv_usec = tv64.tv_usec;
4912 } else {
4913 struct user32_timeval tv32;
4914
4915 if (sopt->sopt_valsize < sizeof(tv32)) {
4916 return EINVAL;
4917 }
4918
4919 sopt->sopt_valsize = sizeof(tv32);
4920 if (sopt->sopt_p != kernproc) {
4921 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4922 if (error != 0) {
4923 return error;
4924 }
4925 } else {
4926 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4927 sizeof(tv32));
4928 }
4929 #ifndef __LP64__
4930 /*
4931 * K64todo "comparison is always false due to
4932 * limited range of data type"
4933 */
4934 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4935 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4936 return EDOM;
4937 }
4938 #endif
4939 tv_p->tv_sec = tv32.tv_sec;
4940 tv_p->tv_usec = tv32.tv_usec;
4941 }
4942 return 0;
4943 }
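
/*
 * Illustrative sketch, for reference only (userspace): the timeval
 * consumed above arrives via an ordinary setsockopt(2) call; a negative
 * tv_sec or a tv_usec outside [0, 1000000) is rejected with EDOM by the
 * checks above.  The descriptor `fd` is an assumption.
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *	#include <stdio.h>
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1)
 *		perror("setsockopt SO_RCVTIMEO");
 */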
4944
4945 int
4946 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4947 boolean_t ignore_delegate)
4948 {
4949 kauth_cred_t cred = NULL;
4950 proc_t ep = PROC_NULL;
4951 uid_t uid;
4952 int error = 0;
4953
4954 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4955 ep = proc_find(so->e_pid);
4956 if (ep) {
4957 cred = kauth_cred_proc_ref(ep);
4958 }
4959 }
4960
4961 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4962
4963 /* uid is 0 for root */
4964 if (uid != 0 || !allow_root) {
4965 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4966 }
4967 if (cred) {
4968 kauth_cred_unref(&cred);
4969 }
4970 if (ep != PROC_NULL) {
4971 proc_rele(ep);
4972 }
4973
4974 return error;
4975 }
4976
4977 /*
4978 * Returns: 0 Success
4979 * EINVAL
4980 * ENOPROTOOPT
4981 * ENOBUFS
4982 * EDOM
4983 * sooptcopyin:EINVAL
4984 * sooptcopyin:EFAULT
4985 * sooptcopyin_timeval:EINVAL
4986 * sooptcopyin_timeval:EFAULT
4987 * sooptcopyin_timeval:EDOM
4988 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4989 * <pr_ctloutput>:???
4990 * sflt_attach_private:??? [whatever a filter author chooses]
4991 * <sf_setoption>:??? [whatever a filter author chooses]
4992 *
4993 * Notes: Other <pr_ctloutput> returns depend on the protocol family;
4994 * all <sf_setoption> returns depend on what the filter author
4995 * causes their filter to return.
4996 */
4997 int
4998 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4999 {
5000 int error, optval;
5001 int64_t long_optval;
5002 struct linger l;
5003 struct timeval tv;
5004 #if CONFIG_MACF_SOCKET
5005 struct mac extmac;
5006 #endif /* MAC_SOCKET */
5007
5008 if (sopt->sopt_dir != SOPT_SET) {
5009 sopt->sopt_dir = SOPT_SET;
5010 }
5011
5012 if (dolock) {
5013 socket_lock(so, 1);
5014 }
5015
5016 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
5017 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
5018 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
5019 /* the socket has been shutdown, no more sockopt's */
5020 error = EINVAL;
5021 goto out;
5022 }
5023
5024 error = sflt_setsockopt(so, sopt);
5025 if (error != 0) {
5026 if (error == EJUSTRETURN) {
5027 error = 0;
5028 }
5029 goto out;
5030 }
5031
5032 if (sopt->sopt_level != SOL_SOCKET) {
5033 if (so->so_proto != NULL &&
5034 so->so_proto->pr_ctloutput != NULL) {
5035 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5036 goto out;
5037 }
5038 error = ENOPROTOOPT;
5039 } else {
5040 /*
5041 * Allow socket-level (SOL_SOCKET) options to be filtered by
5042 * the protocol layer, if needed. A zero value returned from
5043 * the handler means use default socket-level processing as
5044 * done by the rest of this routine. Otherwise, any other
5045 * return value indicates that the option is unsupported.
5046 */
5047 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5048 pru_socheckopt(so, sopt)) != 0) {
5049 goto out;
5050 }
5051
5052 error = 0;
5053 switch (sopt->sopt_name) {
5054 case SO_LINGER:
5055 case SO_LINGER_SEC:
5056 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5057 if (error != 0) {
5058 goto out;
5059 }
5060
5061 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5062 l.l_linger : l.l_linger * hz;
5063 if (l.l_onoff != 0) {
5064 so->so_options |= SO_LINGER;
5065 } else {
5066 so->so_options &= ~SO_LINGER;
5067 }
5068 break;
5069
5070 case SO_DEBUG:
5071 case SO_KEEPALIVE:
5072 case SO_DONTROUTE:
5073 case SO_USELOOPBACK:
5074 case SO_BROADCAST:
5075 case SO_REUSEADDR:
5076 case SO_REUSEPORT:
5077 case SO_OOBINLINE:
5078 case SO_TIMESTAMP:
5079 case SO_TIMESTAMP_MONOTONIC:
5080 case SO_TIMESTAMP_CONTINUOUS:
5081 case SO_DONTTRUNC:
5082 case SO_WANTMORE:
5083 case SO_WANTOOBFLAG:
5084 case SO_NOWAKEFROMSLEEP:
5085 case SO_NOAPNFALLBK:
5086 error = sooptcopyin(sopt, &optval, sizeof(optval),
5087 sizeof(optval));
5088 if (error != 0) {
5089 goto out;
5090 }
5091 if (optval) {
5092 so->so_options |= sopt->sopt_name;
5093 } else {
5094 so->so_options &= ~sopt->sopt_name;
5095 }
5096 break;
5097
5098 case SO_SNDBUF:
5099 case SO_RCVBUF:
5100 case SO_SNDLOWAT:
5101 case SO_RCVLOWAT:
5102 error = sooptcopyin(sopt, &optval, sizeof(optval),
5103 sizeof(optval));
5104 if (error != 0) {
5105 goto out;
5106 }
5107
5108 /*
5109 * Values < 1 make no sense for any of these
5110 * options, so disallow them.
5111 */
5112 if (optval < 1) {
5113 error = EINVAL;
5114 goto out;
5115 }
5116
5117 switch (sopt->sopt_name) {
5118 case SO_SNDBUF:
5119 case SO_RCVBUF: {
5120 struct sockbuf *sb =
5121 (sopt->sopt_name == SO_SNDBUF) ?
5122 &so->so_snd : &so->so_rcv;
5123 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5124 error = ENOBUFS;
5125 goto out;
5126 }
5127 sb->sb_flags |= SB_USRSIZE;
5128 sb->sb_flags &= ~SB_AUTOSIZE;
5129 sb->sb_idealsize = (u_int32_t)optval;
5130 break;
5131 }
5132 /*
5133 * Make sure the low-water is never greater than
5134 * the high-water.
5135 */
5136 case SO_SNDLOWAT: {
5137 int space = sbspace(&so->so_snd);
5138 u_int32_t hiwat = so->so_snd.sb_hiwat;
5139
5140 if (so->so_snd.sb_flags & SB_UNIX) {
5141 struct unpcb *unp =
5142 (struct unpcb *)(so->so_pcb);
5143 if (unp != NULL &&
5144 unp->unp_conn != NULL) {
5145 hiwat += unp->unp_conn->unp_cc;
5146 }
5147 }
5148
5149 so->so_snd.sb_lowat =
5150 (optval > hiwat) ?
5151 hiwat : optval;
5152
5153 if (space >= so->so_snd.sb_lowat) {
5154 sowwakeup(so);
5155 }
5156 break;
5157 }
5158 case SO_RCVLOWAT: {
5159 int64_t data_len;
5160 so->so_rcv.sb_lowat =
5161 (optval > so->so_rcv.sb_hiwat) ?
5162 so->so_rcv.sb_hiwat : optval;
5163 data_len = so->so_rcv.sb_cc
5164 - so->so_rcv.sb_ctl;
5165 if (data_len >= so->so_rcv.sb_lowat) {
5166 sorwakeup(so);
5167 }
5168 break;
5169 }
5170 }
5171 break;
5172
5173 case SO_SNDTIMEO:
5174 case SO_RCVTIMEO:
5175 error = sooptcopyin_timeval(sopt, &tv);
5176 if (error != 0) {
5177 goto out;
5178 }
5179
5180 switch (sopt->sopt_name) {
5181 case SO_SNDTIMEO:
5182 so->so_snd.sb_timeo = tv;
5183 break;
5184 case SO_RCVTIMEO:
5185 so->so_rcv.sb_timeo = tv;
5186 break;
5187 }
5188 break;
5189
5190 case SO_NKE: {
5191 struct so_nke nke;
5192
5193 error = sooptcopyin(sopt, &nke, sizeof(nke),
5194 sizeof(nke));
5195 if (error != 0) {
5196 goto out;
5197 }
5198
5199 error = sflt_attach_internal(so, nke.nke_handle);
5200 break;
5201 }
5202
5203 case SO_NOSIGPIPE:
5204 error = sooptcopyin(sopt, &optval, sizeof(optval),
5205 sizeof(optval));
5206 if (error != 0) {
5207 goto out;
5208 }
5209 if (optval != 0) {
5210 so->so_flags |= SOF_NOSIGPIPE;
5211 } else {
5212 so->so_flags &= ~SOF_NOSIGPIPE;
5213 }
5214 break;
5215
5216 case SO_NOADDRERR:
5217 error = sooptcopyin(sopt, &optval, sizeof(optval),
5218 sizeof(optval));
5219 if (error != 0) {
5220 goto out;
5221 }
5222 if (optval != 0) {
5223 so->so_flags |= SOF_NOADDRAVAIL;
5224 } else {
5225 so->so_flags &= ~SOF_NOADDRAVAIL;
5226 }
5227 break;
5228
5229 case SO_REUSESHAREUID:
5230 error = sooptcopyin(sopt, &optval, sizeof(optval),
5231 sizeof(optval));
5232 if (error != 0) {
5233 goto out;
5234 }
5235 if (optval != 0) {
5236 so->so_flags |= SOF_REUSESHAREUID;
5237 } else {
5238 so->so_flags &= ~SOF_REUSESHAREUID;
5239 }
5240 break;
5241
5242 case SO_NOTIFYCONFLICT:
5243 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5244 error = EPERM;
5245 goto out;
5246 }
5247 error = sooptcopyin(sopt, &optval, sizeof(optval),
5248 sizeof(optval));
5249 if (error != 0) {
5250 goto out;
5251 }
5252 if (optval != 0) {
5253 so->so_flags |= SOF_NOTIFYCONFLICT;
5254 } else {
5255 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5256 }
5257 break;
5258
5259 case SO_RESTRICTIONS:
5260 error = sooptcopyin(sopt, &optval, sizeof(optval),
5261 sizeof(optval));
5262 if (error != 0) {
5263 goto out;
5264 }
5265
5266 error = so_set_restrictions(so, optval);
5267 break;
5268
5269 case SO_AWDL_UNRESTRICTED:
5270 if (SOCK_DOM(so) != PF_INET &&
5271 SOCK_DOM(so) != PF_INET6) {
5272 error = EOPNOTSUPP;
5273 goto out;
5274 }
5275 error = sooptcopyin(sopt, &optval, sizeof(optval),
5276 sizeof(optval));
5277 if (error != 0) {
5278 goto out;
5279 }
5280 if (optval != 0) {
5281 error = soopt_cred_check(so,
5282 PRIV_NET_RESTRICTED_AWDL, false, false);
5283 if (error == 0) {
5284 inp_set_awdl_unrestricted(
5285 sotoinpcb(so));
5286 }
5287 } else {
5288 inp_clear_awdl_unrestricted(sotoinpcb(so));
5289 }
5290 break;
5291 case SO_INTCOPROC_ALLOW:
5292 if (SOCK_DOM(so) != PF_INET6) {
5293 error = EOPNOTSUPP;
5294 goto out;
5295 }
5296 error = sooptcopyin(sopt, &optval, sizeof(optval),
5297 sizeof(optval));
5298 if (error != 0) {
5299 goto out;
5300 }
5301 if (optval != 0 &&
5302 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5303 error = soopt_cred_check(so,
5304 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5305 if (error == 0) {
5306 inp_set_intcoproc_allowed(
5307 sotoinpcb(so));
5308 }
5309 } else if (optval == 0) {
5310 inp_clear_intcoproc_allowed(sotoinpcb(so));
5311 }
5312 break;
5313
5314 case SO_LABEL:
5315 #if CONFIG_MACF_SOCKET
5316 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5317 sizeof(extmac))) != 0) {
5318 goto out;
5319 }
5320
5321 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5322 so, &extmac);
5323 #else
5324 error = EOPNOTSUPP;
5325 #endif /* MAC_SOCKET */
5326 break;
5327
5328 case SO_UPCALLCLOSEWAIT:
5329 error = sooptcopyin(sopt, &optval, sizeof(optval),
5330 sizeof(optval));
5331 if (error != 0) {
5332 goto out;
5333 }
5334 if (optval != 0) {
5335 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5336 } else {
5337 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5338 }
5339 break;
5340
5341 case SO_RANDOMPORT:
5342 error = sooptcopyin(sopt, &optval, sizeof(optval),
5343 sizeof(optval));
5344 if (error != 0) {
5345 goto out;
5346 }
5347 if (optval != 0) {
5348 so->so_flags |= SOF_BINDRANDOMPORT;
5349 } else {
5350 so->so_flags &= ~SOF_BINDRANDOMPORT;
5351 }
5352 break;
5353
5354 case SO_NP_EXTENSIONS: {
5355 struct so_np_extensions sonpx;
5356
5357 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5358 sizeof(sonpx));
5359 if (error != 0) {
5360 goto out;
5361 }
5362 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5363 error = EINVAL;
5364 goto out;
5365 }
5366 /*
5367 * Only one bit defined for now
5368 */
5369 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5370 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5371 so->so_flags |= SOF_NPX_SETOPTSHUT;
5372 } else {
5373 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5374 }
5375 }
5376 break;
5377 }
5378
5379 case SO_TRAFFIC_CLASS: {
5380 error = sooptcopyin(sopt, &optval, sizeof(optval),
5381 sizeof(optval));
5382 if (error != 0) {
5383 goto out;
5384 }
5385 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5386 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5387 error = so_set_net_service_type(so, netsvc);
5388 goto out;
5389 }
5390 error = so_set_traffic_class(so, optval);
5391 if (error != 0) {
5392 goto out;
5393 }
5394 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5395 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5396 break;
5397 }
5398
5399 case SO_RECV_TRAFFIC_CLASS: {
5400 error = sooptcopyin(sopt, &optval, sizeof(optval),
5401 sizeof(optval));
5402 if (error != 0) {
5403 goto out;
5404 }
5405 if (optval == 0) {
5406 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5407 } else {
5408 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5409 }
5410 break;
5411 }
5412
5413 #if (DEVELOPMENT || DEBUG)
5414 case SO_TRAFFIC_CLASS_DBG: {
5415 struct so_tcdbg so_tcdbg;
5416
5417 error = sooptcopyin(sopt, &so_tcdbg,
5418 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5419 if (error != 0) {
5420 goto out;
5421 }
5422 error = so_set_tcdbg(so, &so_tcdbg);
5423 if (error != 0) {
5424 goto out;
5425 }
5426 break;
5427 }
5428 #endif /* (DEVELOPMENT || DEBUG) */
5429
5430 case SO_PRIVILEGED_TRAFFIC_CLASS:
5431 error = priv_check_cred(kauth_cred_get(),
5432 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5433 if (error != 0) {
5434 goto out;
5435 }
5436 error = sooptcopyin(sopt, &optval, sizeof(optval),
5437 sizeof(optval));
5438 if (error != 0) {
5439 goto out;
5440 }
5441 if (optval == 0) {
5442 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5443 } else {
5444 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5445 }
5446 break;
5447
5448 #if (DEVELOPMENT || DEBUG)
5449 case SO_DEFUNCTIT:
5450 error = sosetdefunct(current_proc(), so, 0, FALSE);
5451 if (error == 0) {
5452 error = sodefunct(current_proc(), so, 0);
5453 }
5454
5455 break;
5456 #endif /* (DEVELOPMENT || DEBUG) */
5457
5458 case SO_DEFUNCTOK:
5459 error = sooptcopyin(sopt, &optval, sizeof(optval),
5460 sizeof(optval));
5461 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5462 if (error == 0) {
5463 error = EBADF;
5464 }
5465 goto out;
5466 }
5467 /*
5468 * Any process can set SO_DEFUNCTOK (clear
5469 * SOF_NODEFUNCT), but only root can clear
5470 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5471 */
5472 if (optval == 0 &&
5473 kauth_cred_issuser(kauth_cred_get()) == 0) {
5474 error = EPERM;
5475 goto out;
5476 }
5477 if (optval) {
5478 so->so_flags &= ~SOF_NODEFUNCT;
5479 } else {
5480 so->so_flags |= SOF_NODEFUNCT;
5481 }
5482
5483 if (SOCK_DOM(so) == PF_INET ||
5484 SOCK_DOM(so) == PF_INET6) {
5485 char s[MAX_IPv6_STR_LEN];
5486 char d[MAX_IPv6_STR_LEN];
5487 struct inpcb *inp = sotoinpcb(so);
5488
5489 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5490 "[%s %s:%d -> %s:%d] is now marked "
5491 "as %seligible for "
5492 "defunct\n", __func__, proc_selfpid(),
5493 proc_best_name(current_proc()),
5494 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5495 (SOCK_TYPE(so) == SOCK_STREAM) ?
5496 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5497 ((SOCK_DOM(so) == PF_INET) ?
5498 (void *)&inp->inp_laddr.s_addr :
5499 (void *)&inp->in6p_laddr), s, sizeof(s)),
5500 ntohs(inp->in6p_lport),
5501 inet_ntop(SOCK_DOM(so),
5502 (SOCK_DOM(so) == PF_INET) ?
5503 (void *)&inp->inp_faddr.s_addr :
5504 (void *)&inp->in6p_faddr, d, sizeof(d)),
5505 ntohs(inp->in6p_fport),
5506 (so->so_flags & SOF_NODEFUNCT) ?
5507 "not " : "");
5508 } else {
5509 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5510 "is now marked as %seligible for "
5511 "defunct\n",
5512 __func__, proc_selfpid(),
5513 proc_best_name(current_proc()),
5514 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5515 SOCK_DOM(so), SOCK_TYPE(so),
5516 (so->so_flags & SOF_NODEFUNCT) ?
5517 "not " : "");
5518 }
5519 break;
5520
5521 case SO_ISDEFUNCT:
5522 /* This option is not settable */
5523 error = EINVAL;
5524 break;
5525
5526 case SO_OPPORTUNISTIC:
5527 error = sooptcopyin(sopt, &optval, sizeof(optval),
5528 sizeof(optval));
5529 if (error == 0) {
5530 error = so_set_opportunistic(so, optval);
5531 }
5532 break;
5533
5534 case SO_FLUSH:
5535 /* This option is handled by lower layer(s) */
5536 error = 0;
5537 break;
5538
5539 case SO_RECV_ANYIF:
5540 error = sooptcopyin(sopt, &optval, sizeof(optval),
5541 sizeof(optval));
5542 if (error == 0) {
5543 error = so_set_recv_anyif(so, optval);
5544 }
5545 break;
5546
5547 case SO_TRAFFIC_MGT_BACKGROUND: {
5548 /* This option is handled by lower layer(s) */
5549 error = 0;
5550 break;
5551 }
5552
5553 #if FLOW_DIVERT
5554 case SO_FLOW_DIVERT_TOKEN:
5555 error = flow_divert_token_set(so, sopt);
5556 break;
5557 #endif /* FLOW_DIVERT */
5558
5559
5560 case SO_DELEGATED:
5561 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5562 sizeof(optval))) != 0) {
5563 break;
5564 }
5565
5566 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5567 break;
5568
5569 case SO_DELEGATED_UUID: {
5570 uuid_t euuid;
5571
5572 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5573 sizeof(euuid))) != 0) {
5574 break;
5575 }
5576
5577 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5578 break;
5579 }
5580
5581 #if NECP
5582 case SO_NECP_ATTRIBUTES:
5583 error = necp_set_socket_attributes(so, sopt);
5584 break;
5585
5586 case SO_NECP_CLIENTUUID: {
5587 if (SOCK_DOM(so) == PF_MULTIPATH) {
5588 /* Handled by MPTCP itself */
5589 break;
5590 }
5591
5592 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5593 error = EINVAL;
5594 goto out;
5595 }
5596
5597 struct inpcb *inp = sotoinpcb(so);
5598 if (!uuid_is_null(inp->necp_client_uuid)) {
5599 // Clear out the old client UUID if present
5600 necp_inpcb_remove_cb(inp);
5601 }
5602
5603 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5604 sizeof(uuid_t), sizeof(uuid_t));
5605 if (error != 0) {
5606 goto out;
5607 }
5608
5609 if (uuid_is_null(inp->necp_client_uuid)) {
5610 error = EINVAL;
5611 goto out;
5612 }
5613
5614 pid_t current_pid = proc_pid(current_proc());
5615 error = necp_client_register_socket_flow(current_pid,
5616 inp->necp_client_uuid, inp);
5617 if (error != 0) {
5618 uuid_clear(inp->necp_client_uuid);
5619 goto out;
5620 }
5621
5622 if (inp->inp_lport != 0) {
5623 // There is a bound local port, so this is not
5624 // a fresh socket. Assign to the client.
5625 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5626 }
5627
5628 break;
5629 }
5630 case SO_NECP_LISTENUUID: {
5631 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5632 error = EINVAL;
5633 goto out;
5634 }
5635
5636 struct inpcb *inp = sotoinpcb(so);
5637 if (!uuid_is_null(inp->necp_client_uuid)) {
5638 error = EINVAL;
5639 goto out;
5640 }
5641
5642 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5643 sizeof(uuid_t), sizeof(uuid_t));
5644 if (error != 0) {
5645 goto out;
5646 }
5647
5648 if (uuid_is_null(inp->necp_client_uuid)) {
5649 error = EINVAL;
5650 goto out;
5651 }
5652
5653 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5654 inp->necp_client_uuid, inp);
5655 if (error != 0) {
5656 uuid_clear(inp->necp_client_uuid);
5657 goto out;
5658 }
5659
5660 // Mark that the port registration is held by NECP
5661 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5662
5663 break;
5664 }
5665 #endif /* NECP */
5666
5667 case SO_EXTENDED_BK_IDLE:
5668 error = sooptcopyin(sopt, &optval, sizeof(optval),
5669 sizeof(optval));
5670 if (error == 0) {
5671 error = so_set_extended_bk_idle(so, optval);
5672 }
5673 break;
5674
5675 case SO_MARK_CELLFALLBACK:
5676 error = sooptcopyin(sopt, &optval, sizeof(optval),
5677 sizeof(optval));
5678 if (error != 0) {
5679 goto out;
5680 }
5681 if (optval < 0) {
5682 error = EINVAL;
5683 goto out;
5684 }
5685 if (optval == 0) {
5686 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5687 } else {
5688 so->so_flags1 |= SOF1_CELLFALLBACK;
5689 }
5690 break;
5691
5692 case SO_STATISTICS_EVENT:
5693 error = sooptcopyin(sopt, &long_optval,
5694 sizeof(long_optval), sizeof(long_optval));
5695 if (error != 0) {
5696 goto out;
5697 }
5698 u_int64_t nstat_event = 0;
5699 error = so_statistics_event_to_nstat_event(
5700 &long_optval, &nstat_event);
5701 if (error != 0) {
5702 goto out;
5703 }
5704 nstat_pcb_event(sotoinpcb(so), nstat_event);
5705 break;
5706
5707 case SO_NET_SERVICE_TYPE: {
5708 error = sooptcopyin(sopt, &optval, sizeof(optval),
5709 sizeof(optval));
5710 if (error != 0) {
5711 goto out;
5712 }
5713 error = so_set_net_service_type(so, optval);
5714 break;
5715 }
5716
5717 case SO_QOSMARKING_POLICY_OVERRIDE:
5718 error = priv_check_cred(kauth_cred_get(),
5719 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5720 if (error != 0) {
5721 goto out;
5722 }
5723 error = sooptcopyin(sopt, &optval, sizeof(optval),
5724 sizeof(optval));
5725 if (error != 0) {
5726 goto out;
5727 }
5728 if (optval == 0) {
5729 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5730 } else {
5731 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5732 }
5733 break;
5734
5735 case SO_MPKL_SEND_INFO: {
5736 struct so_mpkl_send_info so_mpkl_send_info;
5737
5738 error = sooptcopyin(sopt, &so_mpkl_send_info,
5739 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5740 if (error != 0) {
5741 goto out;
5742 }
5743 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5744 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5745
5746 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5747 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5748 } else {
5749 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5750 }
5751 break;
5752 }
5753 default:
5754 error = ENOPROTOOPT;
5755 break;
5756 }
5757 if (error == 0 && so->so_proto != NULL &&
5758 so->so_proto->pr_ctloutput != NULL) {
5759 (void) so->so_proto->pr_ctloutput(so, sopt);
5760 }
5761 }
5762 out:
5763 if (dolock) {
5764 socket_unlock(so, 1);
5765 }
5766 return error;
5767 }
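
/*
 * Illustrative sketch, for reference only (userspace): the SOL_SOCKET
 * switch above is driven by ordinary setsockopt(2) calls.  For example,
 * SO_NOSIGPIPE takes an int and sets or clears SOF_NOSIGPIPE as handled
 * above.  The descriptor `fd` is an assumption.
 *
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	int one = 1;
 *	if (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof(one)) == -1)
 *		perror("setsockopt SO_NOSIGPIPE");
 */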
5768
5769 /* Helper routines for getsockopt */
5770 int
5771 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5772 {
5773 int error;
5774 size_t valsize;
5775
5776 error = 0;
5777
5778 /*
5779 * Documented get behavior is that we always return a value,
5780 * possibly truncated to fit in the user's buffer.
5781 * Traditional behavior is that we always tell the user
5782 * precisely how much we copied, rather than something useful
5783 * like the total amount we had available for her.
5784 * Note that this interface is not idempotent; the entire answer must
5785 * be generated ahead of time.
5786 */
5787 valsize = min(len, sopt->sopt_valsize);
5788 sopt->sopt_valsize = valsize;
5789 if (sopt->sopt_val != USER_ADDR_NULL) {
5790 if (sopt->sopt_p != kernproc) {
5791 error = copyout(buf, sopt->sopt_val, valsize);
5792 } else {
5793 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5794 }
5795 }
5796 return error;
5797 }
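
/*
 * Illustrative sketch, for reference only (hypothetical, not compiled):
 * the GET-side mirror of the sooptcopyin() sketch earlier; a protocol
 * pr_ctloutput() handler returns an integer option with sooptcopyout().
 * EXAMPLE_OPT and pcb->example_flag are placeholders, not real kernel
 * symbols.
 *
 *	case EXAMPLE_OPT: {
 *		int optval = pcb->example_flag ? 1 : 0;
 *
 *		error = sooptcopyout(sopt, &optval, sizeof(optval));
 *		break;
 *	}
 */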
5798
5799 static int
5800 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5801 {
5802 int error;
5803 size_t len;
5804 struct user64_timeval tv64 = {};
5805 struct user32_timeval tv32 = {};
5806 const void * val;
5807 size_t valsize;
5808
5809 error = 0;
5810 if (proc_is64bit(sopt->sopt_p)) {
5811 len = sizeof(tv64);
5812 tv64.tv_sec = tv_p->tv_sec;
5813 tv64.tv_usec = tv_p->tv_usec;
5814 val = &tv64;
5815 } else {
5816 len = sizeof(tv32);
5817 tv32.tv_sec = tv_p->tv_sec;
5818 tv32.tv_usec = tv_p->tv_usec;
5819 val = &tv32;
5820 }
5821 valsize = min(len, sopt->sopt_valsize);
5822 sopt->sopt_valsize = valsize;
5823 if (sopt->sopt_val != USER_ADDR_NULL) {
5824 if (sopt->sopt_p != kernproc) {
5825 error = copyout(val, sopt->sopt_val, valsize);
5826 } else {
5827 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5828 }
5829 }
5830 return error;
5831 }
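
/*
 * Illustrative sketch, for reference only (userspace): the counterpart
 * of the SO_RCVTIMEO set example earlier.  The routine above sizes the
 * copied-out value to the caller's ABI (user32_timeval vs
 * user64_timeval), so userspace simply reads back a struct timeval.
 * The descriptor `fd` is an assumption.
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *	#include <stdio.h>
 *
 *	struct timeval tv;
 *	socklen_t len = sizeof(tv);
 *	if (getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len) == -1)
 *		perror("getsockopt SO_RCVTIMEO");
 */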
5832
5833 /*
5834 * Return: 0 Success
5835 * ENOPROTOOPT
5836 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5837 * <pr_ctloutput>:???
5838 * <sf_getoption>:???
5839 */
5840 int
5841 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5842 {
5843 int error, optval;
5844 struct linger l;
5845 struct timeval tv;
5846 #if CONFIG_MACF_SOCKET
5847 struct mac extmac;
5848 #endif /* MAC_SOCKET */
5849
5850 if (sopt->sopt_dir != SOPT_GET) {
5851 sopt->sopt_dir = SOPT_GET;
5852 }
5853
5854 if (dolock) {
5855 socket_lock(so, 1);
5856 }
5857
5858 error = sflt_getsockopt(so, sopt);
5859 if (error != 0) {
5860 if (error == EJUSTRETURN) {
5861 error = 0;
5862 }
5863 goto out;
5864 }
5865
5866 if (sopt->sopt_level != SOL_SOCKET) {
5867 if (so->so_proto != NULL &&
5868 so->so_proto->pr_ctloutput != NULL) {
5869 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5870 goto out;
5871 }
5872 error = ENOPROTOOPT;
5873 } else {
5874 /*
5875 * Allow socket-level (SOL_SOCKET) options to be filtered by
5876 * the protocol layer, if needed. A zero value returned from
5877 * the handler means use default socket-level processing as
5878 * done by the rest of this routine. Otherwise, any other
5879 * return value indicates that the option is unsupported.
5880 */
5881 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5882 pru_socheckopt(so, sopt)) != 0) {
5883 goto out;
5884 }
5885
5886 error = 0;
5887 switch (sopt->sopt_name) {
5888 case SO_LINGER:
5889 case SO_LINGER_SEC:
5890 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5891 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5892 so->so_linger : so->so_linger / hz;
5893 error = sooptcopyout(sopt, &l, sizeof(l));
5894 break;
5895
5896 case SO_USELOOPBACK:
5897 case SO_DONTROUTE:
5898 case SO_DEBUG:
5899 case SO_KEEPALIVE:
5900 case SO_REUSEADDR:
5901 case SO_REUSEPORT:
5902 case SO_BROADCAST:
5903 case SO_OOBINLINE:
5904 case SO_TIMESTAMP:
5905 case SO_TIMESTAMP_MONOTONIC:
5906 case SO_TIMESTAMP_CONTINUOUS:
5907 case SO_DONTTRUNC:
5908 case SO_WANTMORE:
5909 case SO_WANTOOBFLAG:
5910 case SO_NOWAKEFROMSLEEP:
5911 case SO_NOAPNFALLBK:
5912 optval = so->so_options & sopt->sopt_name;
5913 integer:
5914 error = sooptcopyout(sopt, &optval, sizeof(optval));
5915 break;
5916
5917 case SO_TYPE:
5918 optval = so->so_type;
5919 goto integer;
5920
5921 case SO_NREAD:
5922 if (so->so_proto->pr_flags & PR_ATOMIC) {
5923 int pkt_total;
5924 struct mbuf *m1;
5925
5926 pkt_total = 0;
5927 m1 = so->so_rcv.sb_mb;
5928 while (m1 != NULL) {
5929 if (m1->m_type == MT_DATA ||
5930 m1->m_type == MT_HEADER ||
5931 m1->m_type == MT_OOBDATA) {
5932 pkt_total += m1->m_len;
5933 }
5934 m1 = m1->m_next;
5935 }
5936 optval = pkt_total;
5937 } else {
5938 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5939 }
5940 goto integer;
5941
5942 case SO_NUMRCVPKT:
5943 if (so->so_proto->pr_flags & PR_ATOMIC) {
5944 int cnt = 0;
5945 struct mbuf *m1;
5946
5947 m1 = so->so_rcv.sb_mb;
5948 while (m1 != NULL) {
5949 cnt += 1;
5950 m1 = m1->m_nextpkt;
5951 }
5952 optval = cnt;
5953 goto integer;
5954 } else {
5955 error = ENOPROTOOPT;
5956 break;
5957 }
5958
5959 case SO_NWRITE:
5960 optval = so->so_snd.sb_cc;
5961 goto integer;
5962
5963 case SO_ERROR:
5964 optval = so->so_error;
5965 so->so_error = 0;
5966 goto integer;
5967
5968 case SO_SNDBUF: {
5969 u_int32_t hiwat = so->so_snd.sb_hiwat;
5970
5971 if (so->so_snd.sb_flags & SB_UNIX) {
5972 struct unpcb *unp =
5973 (struct unpcb *)(so->so_pcb);
5974 if (unp != NULL && unp->unp_conn != NULL) {
5975 hiwat += unp->unp_conn->unp_cc;
5976 }
5977 }
5978
5979 optval = hiwat;
5980 goto integer;
5981 }
5982 case SO_RCVBUF:
5983 optval = so->so_rcv.sb_hiwat;
5984 goto integer;
5985
5986 case SO_SNDLOWAT:
5987 optval = so->so_snd.sb_lowat;
5988 goto integer;
5989
5990 case SO_RCVLOWAT:
5991 optval = so->so_rcv.sb_lowat;
5992 goto integer;
5993
5994 case SO_SNDTIMEO:
5995 case SO_RCVTIMEO:
5996 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5997 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5998
5999 error = sooptcopyout_timeval(sopt, &tv);
6000 break;
6001
6002 case SO_NOSIGPIPE:
6003 optval = (so->so_flags & SOF_NOSIGPIPE);
6004 goto integer;
6005
6006 case SO_NOADDRERR:
6007 optval = (so->so_flags & SOF_NOADDRAVAIL);
6008 goto integer;
6009
6010 case SO_REUSESHAREUID:
6011 optval = (so->so_flags & SOF_REUSESHAREUID);
6012 goto integer;
6013
6014
6015 case SO_NOTIFYCONFLICT:
6016 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
6017 goto integer;
6018
6019 case SO_RESTRICTIONS:
6020 optval = so_get_restrictions(so);
6021 goto integer;
6022
6023 case SO_AWDL_UNRESTRICTED:
6024 if (SOCK_DOM(so) == PF_INET ||
6025 SOCK_DOM(so) == PF_INET6) {
6026 optval = inp_get_awdl_unrestricted(
6027 sotoinpcb(so));
6028 goto integer;
6029 } else {
6030 error = EOPNOTSUPP;
6031 }
6032 break;
6033
6034 case SO_INTCOPROC_ALLOW:
6035 if (SOCK_DOM(so) == PF_INET6) {
6036 optval = inp_get_intcoproc_allowed(
6037 sotoinpcb(so));
6038 goto integer;
6039 } else {
6040 error = EOPNOTSUPP;
6041 }
6042 break;
6043
6044 case SO_LABEL:
6045 #if CONFIG_MACF_SOCKET
6046 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6047 sizeof(extmac))) != 0 ||
6048 (error = mac_socket_label_get(proc_ucred(
6049 sopt->sopt_p), so, &extmac)) != 0) {
6050 break;
6051 }
6052
6053 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6054 #else
6055 error = EOPNOTSUPP;
6056 #endif /* MAC_SOCKET */
6057 break;
6058
6059 case SO_PEERLABEL:
6060 #if CONFIG_MACF_SOCKET
6061 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6062 sizeof(extmac))) != 0 ||
6063 (error = mac_socketpeer_label_get(proc_ucred(
6064 sopt->sopt_p), so, &extmac)) != 0) {
6065 break;
6066 }
6067
6068 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6069 #else
6070 error = EOPNOTSUPP;
6071 #endif /* MAC_SOCKET */
6072 break;
6073
6074 #ifdef __APPLE_API_PRIVATE
6075 case SO_UPCALLCLOSEWAIT:
6076 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6077 goto integer;
6078 #endif
6079 case SO_RANDOMPORT:
6080 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6081 goto integer;
6082
6083 case SO_NP_EXTENSIONS: {
6084 struct so_np_extensions sonpx = {};
6085
6086 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6087 SONPX_SETOPTSHUT : 0;
6088 sonpx.npx_mask = SONPX_MASK_VALID;
6089
6090 error = sooptcopyout(sopt, &sonpx,
6091 sizeof(struct so_np_extensions));
6092 break;
6093 }
6094
6095 case SO_TRAFFIC_CLASS:
6096 optval = so->so_traffic_class;
6097 goto integer;
6098
6099 case SO_RECV_TRAFFIC_CLASS:
6100 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6101 goto integer;
6102
6103 case SO_TRAFFIC_CLASS_STATS:
6104 error = sooptcopyout(sopt, &so->so_tc_stats,
6105 sizeof(so->so_tc_stats));
6106 break;
6107
6108 #if (DEVELOPMENT || DEBUG)
6109 case SO_TRAFFIC_CLASS_DBG:
6110 error = sogetopt_tcdbg(so, sopt);
6111 break;
6112 #endif /* (DEVELOPMENT || DEBUG) */
6113
6114 case SO_PRIVILEGED_TRAFFIC_CLASS:
6115 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6116 goto integer;
6117
6118 case SO_DEFUNCTOK:
6119 optval = !(so->so_flags & SOF_NODEFUNCT);
6120 goto integer;
6121
6122 case SO_ISDEFUNCT:
6123 optval = (so->so_flags & SOF_DEFUNCT);
6124 goto integer;
6125
6126 case SO_OPPORTUNISTIC:
6127 optval = so_get_opportunistic(so);
6128 goto integer;
6129
6130 case SO_FLUSH:
6131 /* This option is not gettable */
6132 error = EINVAL;
6133 break;
6134
6135 case SO_RECV_ANYIF:
6136 optval = so_get_recv_anyif(so);
6137 goto integer;
6138
6139 case SO_TRAFFIC_MGT_BACKGROUND:
6140 /* This option is handled by lower layer(s) */
6141 if (so->so_proto != NULL &&
6142 so->so_proto->pr_ctloutput != NULL) {
6143 (void) so->so_proto->pr_ctloutput(so, sopt);
6144 }
6145 break;
6146
6147 #if FLOW_DIVERT
6148 case SO_FLOW_DIVERT_TOKEN:
6149 error = flow_divert_token_get(so, sopt);
6150 break;
6151 #endif /* FLOW_DIVERT */
6152
6153 #if NECP
6154 case SO_NECP_ATTRIBUTES:
6155 error = necp_get_socket_attributes(so, sopt);
6156 break;
6157
6158 case SO_NECP_CLIENTUUID: {
6159 uuid_t *ncu;
6160
6161 if (SOCK_DOM(so) == PF_MULTIPATH) {
6162 ncu = &mpsotomppcb(so)->necp_client_uuid;
6163 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6164 ncu = &sotoinpcb(so)->necp_client_uuid;
6165 } else {
6166 error = EINVAL;
6167 goto out;
6168 }
6169
6170 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6171 break;
6172 }
6173
6174 case SO_NECP_LISTENUUID: {
6175 uuid_t *nlu;
6176
6177 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6178 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6179 nlu = &sotoinpcb(so)->necp_client_uuid;
6180 } else {
6181 error = ENOENT;
6182 goto out;
6183 }
6184 } else {
6185 error = EINVAL;
6186 goto out;
6187 }
6188
6189 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6190 break;
6191 }
6192 #endif /* NECP */
6193
6194 #if CONTENT_FILTER
6195 case SO_CFIL_SOCK_ID: {
6196 cfil_sock_id_t sock_id;
6197
6198 sock_id = cfil_sock_id_from_socket(so);
6199
6200 error = sooptcopyout(sopt, &sock_id,
6201 sizeof(cfil_sock_id_t));
6202 break;
6203 }
6204 #endif /* CONTENT_FILTER */
6205
6206 case SO_EXTENDED_BK_IDLE:
6207 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6208 goto integer;
6209 case SO_MARK_CELLFALLBACK:
6210 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6211 ? 1 : 0;
6212 goto integer;
6213 case SO_NET_SERVICE_TYPE: {
6214 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6215 optval = so->so_netsvctype;
6216 } else {
6217 optval = NET_SERVICE_TYPE_BE;
6218 }
6219 goto integer;
6220 }
6221 case SO_NETSVC_MARKING_LEVEL:
6222 optval = so_get_netsvc_marking_level(so);
6223 goto integer;
6224
6225 case SO_MPKL_SEND_INFO: {
6226 struct so_mpkl_send_info so_mpkl_send_info;
6227
6228 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6229 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6230 error = sooptcopyout(sopt, &so_mpkl_send_info,
6231 sizeof(struct so_mpkl_send_info));
6232 break;
6233 }
6234 default:
6235 error = ENOPROTOOPT;
6236 break;
6237 }
6238 }
6239 out:
6240 if (dolock) {
6241 socket_unlock(so, 1);
6242 }
6243 return error;
6244 }
6245
6246 /*
6247 * The size limit on our soopt_getm() differs from that on FreeBSD.
6248 * We limit the size of options to MCLBYTES. This will have to change
6249 * if we need to define options that need more space than MCLBYTES.
6250 */
6251 int
6252 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6253 {
6254 struct mbuf *m, *m_prev;
6255 int sopt_size = sopt->sopt_valsize;
6256 int how;
6257
6258 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6259 return EMSGSIZE;
6260 }
6261
6262 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6263 MGET(m, how, MT_DATA);
6264 if (m == NULL) {
6265 return ENOBUFS;
6266 }
6267 if (sopt_size > MLEN) {
6268 MCLGET(m, how);
6269 if ((m->m_flags & M_EXT) == 0) {
6270 m_free(m);
6271 return ENOBUFS;
6272 }
6273 m->m_len = min(MCLBYTES, sopt_size);
6274 } else {
6275 m->m_len = min(MLEN, sopt_size);
6276 }
6277 sopt_size -= m->m_len;
6278 *mp = m;
6279 m_prev = m;
6280
6281 while (sopt_size > 0) {
6282 MGET(m, how, MT_DATA);
6283 if (m == NULL) {
6284 m_freem(*mp);
6285 return ENOBUFS;
6286 }
6287 if (sopt_size > MLEN) {
6288 MCLGET(m, how);
6289 if ((m->m_flags & M_EXT) == 0) {
6290 m_freem(*mp);
6291 m_freem(m);
6292 return ENOBUFS;
6293 }
6294 m->m_len = min(MCLBYTES, sopt_size);
6295 } else {
6296 m->m_len = min(MLEN, sopt_size);
6297 }
6298 sopt_size -= m->m_len;
6299 m_prev->m_next = m;
6300 m_prev = m;
6301 }
6302 return 0;
6303 }
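/*
 * Usage sketch (illustrative only; the real call sites live in the protocol
 * option code, not in this file): a ctloutput handler that wants the option
 * value as an mbuf chain would typically pair soopt_getm() above with
 * soopt_mcopyin() below, roughly:
 *
 *	struct mbuf *m = NULL;
 *	int error = soopt_getm(sopt, &m);	// chain sized to sopt_valsize
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);	// copy the option bytes in
 *	// soopt_mcopyin() frees the chain itself if the copyin fails
 */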
6304
6305 /* copyin sopt data into mbuf chain */
6306 int
6307 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6308 {
6309 struct mbuf *m0 = m;
6310
6311 if (sopt->sopt_val == USER_ADDR_NULL) {
6312 return 0;
6313 }
6314 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6315 if (sopt->sopt_p != kernproc) {
6316 int error;
6317
6318 error = copyin(sopt->sopt_val, mtod(m, char *),
6319 m->m_len);
6320 if (error != 0) {
6321 m_freem(m0);
6322 return error;
6323 }
6324 } else {
6325 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6326 mtod(m, char *), m->m_len);
6327 }
6328 sopt->sopt_valsize -= m->m_len;
6329 sopt->sopt_val += m->m_len;
6330 m = m->m_next;
6331 }
6332 /* the chain should have been allocated with enough space at ip6_sooptmcopyin() */
6333 if (m != NULL) {
6334 panic("soopt_mcopyin");
6335 /* NOTREACHED */
6336 }
6337 return 0;
6338 }
6339
6340 /* copyout mbuf chain data into soopt */
6341 int
6342 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6343 {
6344 struct mbuf *m0 = m;
6345 size_t valsize = 0;
6346
6347 if (sopt->sopt_val == USER_ADDR_NULL) {
6348 return 0;
6349 }
6350 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6351 if (sopt->sopt_p != kernproc) {
6352 int error;
6353
6354 error = copyout(mtod(m, char *), sopt->sopt_val,
6355 m->m_len);
6356 if (error != 0) {
6357 m_freem(m0);
6358 return error;
6359 }
6360 } else {
6361 bcopy(mtod(m, char *),
6362 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6363 }
6364 sopt->sopt_valsize -= m->m_len;
6365 sopt->sopt_val += m->m_len;
6366 valsize += m->m_len;
6367 m = m->m_next;
6368 }
6369 if (m != NULL) {
6370 /* a sufficiently large sockopt buffer should have been supplied from user-land */
6371 m_freem(m0);
6372 return EINVAL;
6373 }
6374 sopt->sopt_valsize = valsize;
6375 return 0;
6376 }
6377
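/*
 * Out-of-band notification: signal SIGURG to the owning process or process
 * group recorded in so_pgid, wake any select()/poll() waiters on the receive
 * buffer, and post NOTE_OOB to attached knotes.
 */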
6378 void
6379 sohasoutofband(struct socket *so)
6380 {
6381 if (so->so_pgid < 0) {
6382 gsignal(-so->so_pgid, SIGURG);
6383 } else if (so->so_pgid > 0) {
6384 proc_signal(so->so_pgid, SIGURG);
6385 }
6386 selwakeup(&so->so_rcv.sb_sel);
6387 if (so->so_rcv.sb_flags & SB_KNOTE) {
6388 KNOTE(&so->so_rcv.sb_sel.si_note,
6389 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6390 }
6391 }
6392
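/*
 * Poll support: compute the ready events for this socket under the socket
 * lock.  When none of the requested events are ready, record the thread in
 * the corresponding selinfo (setting SB_SEL before selrecord(), the reverse
 * of the BSD ordering) so it is woken when the socket state changes.
 */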
6393 int
6394 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6395 {
6396 #pragma unused(cred)
6397 struct proc *p = current_proc();
6398 int revents = 0;
6399
6400 socket_lock(so, 1);
6401 so_update_last_owner_locked(so, PROC_NULL);
6402 so_update_policy(so);
6403
6404 if (events & (POLLIN | POLLRDNORM)) {
6405 if (soreadable(so)) {
6406 revents |= events & (POLLIN | POLLRDNORM);
6407 }
6408 }
6409
6410 if (events & (POLLOUT | POLLWRNORM)) {
6411 if (sowriteable(so)) {
6412 revents |= events & (POLLOUT | POLLWRNORM);
6413 }
6414 }
6415
6416 if (events & (POLLPRI | POLLRDBAND)) {
6417 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6418 revents |= events & (POLLPRI | POLLRDBAND);
6419 }
6420 }
6421
6422 if (revents == 0) {
6423 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6424 /*
6425 * Darwin sets the flag first,
6426 * BSD calls selrecord first
6427 */
6428 so->so_rcv.sb_flags |= SB_SEL;
6429 selrecord(p, &so->so_rcv.sb_sel, wql);
6430 }
6431
6432 if (events & (POLLOUT | POLLWRNORM)) {
6433 /*
6434 * Darwin sets the flag first,
6435 * BSD calls selrecord first
6436 */
6437 so->so_snd.sb_flags |= SB_SEL;
6438 selrecord(p, &so->so_snd.sb_sel, wql);
6439 }
6440 }
6441
6442 socket_unlock(so, 1);
6443 return revents;
6444 }
6445
6446 int
6447 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6448 {
6449 struct socket *so = (struct socket *)fp->f_fglob->fg_data;
6450 int result;
6451
6452 socket_lock(so, 1);
6453 so_update_last_owner_locked(so, PROC_NULL);
6454 so_update_policy(so);
6455
6456 #if CONFIG_MACF_SOCKET
6457 proc_t p = knote_get_kq(kn)->kq_p;
6458 if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
6459 socket_unlock(so, 1);
6460 knote_set_error(kn, EPERM);
6461 return 0;
6462 }
6463 #endif /* CONFIG_MACF_SOCKET */
6464
6465 switch (kn->kn_filter) {
6466 case EVFILT_READ:
6467 kn->kn_filtid = EVFILTID_SOREAD;
6468 break;
6469 case EVFILT_WRITE:
6470 kn->kn_filtid = EVFILTID_SOWRITE;
6471 break;
6472 case EVFILT_SOCK:
6473 kn->kn_filtid = EVFILTID_SCK;
6474 break;
6475 case EVFILT_EXCEPT:
6476 kn->kn_filtid = EVFILTID_SOEXCEPT;
6477 break;
6478 default:
6479 socket_unlock(so, 1);
6480 knote_set_error(kn, EINVAL);
6481 return 0;
6482 }
6483
6484 /*
6485 * call the appropriate sub-filter attach
6486 * with the socket still locked
6487 */
6488 result = knote_fops(kn)->f_attach(kn, kev);
6489
6490 socket_unlock(so, 1);
6491
6492 return result;
6493 }
6494
6495 static int
6496 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6497 {
6498 int retval = 0;
6499 int64_t data = 0;
6500
6501 if (so->so_options & SO_ACCEPTCONN) {
6502 /*
6503 * Radar 6615193: handle the listen case dynamically
6504 * for the kqueue read filter. This allows calling listen()
6505 * after registering the kqueue EVFILT_READ filter.
6506 */
6507
6508 retval = !TAILQ_EMPTY(&so->so_comp);
6509 data = so->so_qlen;
6510 goto out;
6511 }
6512
6513 /* socket isn't a listener */
6514 /*
6515 * NOTE_LOWAT specifies a new low water mark in data, i.e.
6516 * the bytes of protocol data. We therefore exclude any
6517 * control bytes.
6518 */
6519 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6520
6521 if (kn->kn_sfflags & NOTE_OOB) {
6522 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6523 kn->kn_fflags |= NOTE_OOB;
6524 data -= so->so_oobmark;
6525 retval = 1;
6526 goto out;
6527 }
6528 }
6529
6530 if ((so->so_state & SS_CANTRCVMORE)
6531 #if CONTENT_FILTER
6532 && cfil_sock_data_pending(&so->so_rcv) == 0
6533 #endif /* CONTENT_FILTER */
6534 ) {
6535 kn->kn_flags |= EV_EOF;
6536 kn->kn_fflags = so->so_error;
6537 retval = 1;
6538 goto out;
6539 }
6540
6541 if (so->so_error) { /* temporary udp error */
6542 retval = 1;
6543 goto out;
6544 }
6545
6546 int64_t lowwat = so->so_rcv.sb_lowat;
6547 /*
6548 * Ensure that when NOTE_LOWAT is used, the derived
6549 * low water mark is bounded by the socket's receive buffer's
6550 * high and low water mark values.
6551 */
6552 if (kn->kn_sfflags & NOTE_LOWAT) {
6553 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6554 lowwat = so->so_rcv.sb_hiwat;
6555 } else if (kn->kn_sdata > lowwat) {
6556 lowwat = kn->kn_sdata;
6557 }
6558 }
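/*
 * For example, with sb_hiwat = 8192 a NOTE_LOWAT request of 16384 is
 * clamped down to 8192, while a request of 128 that exceeds the current
 * sb_lowat raises lowwat to 128.
 */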
6559
6560 /*
6561 * While the `data` field is the amount of data to read,
6562 * 0-sized packets need to wake up the kqueue, see 58140856,
6563 * so we need to take control bytes into account too.
6564 */
6565 retval = (so->so_rcv.sb_cc >= lowwat);
6566
6567 out:
6568 if (retval && kev) {
6569 knote_fill_kevent(kn, kev, data);
6570 }
6571 return retval;
6572 }
6573
6574 static int
6575 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6576 {
6577 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6578
6579 /* socket locked */
6580
6581 /*
6582 * If the caller explicitly asked for OOB results (e.g. poll())
6583 * from EVFILT_READ, then save that off in the hookid field
6584 * and reserve the kn_flags EV_OOBAND bit for output only.
6585 */
6586 if (kn->kn_filter == EVFILT_READ &&
6587 kn->kn_flags & EV_OOBAND) {
6588 kn->kn_flags &= ~EV_OOBAND;
6589 kn->kn_hook32 = EV_OOBAND;
6590 } else {
6591 kn->kn_hook32 = 0;
6592 }
6593 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6594 so->so_rcv.sb_flags |= SB_KNOTE;
6595 }
6596
6597 /* indicate if event is already fired */
6598 return filt_soread_common(kn, NULL, so);
6599 }
6600
6601 static void
6602 filt_sordetach(struct knote *kn)
6603 {
6604 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6605
6606 socket_lock(so, 1);
6607 if (so->so_rcv.sb_flags & SB_KNOTE) {
6608 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6609 so->so_rcv.sb_flags &= ~SB_KNOTE;
6610 }
6611 }
6612 socket_unlock(so, 1);
6613 }
6614
6615 /*ARGSUSED*/
6616 static int
6617 filt_soread(struct knote *kn, long hint)
6618 {
6619 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6620 int retval;
6621
6622 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6623 socket_lock(so, 1);
6624 }
6625
6626 retval = filt_soread_common(kn, NULL, so);
6627
6628 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6629 socket_unlock(so, 1);
6630 }
6631
6632 return retval;
6633 }
6634
6635 static int
6636 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6637 {
6638 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6639 int retval;
6640
6641 socket_lock(so, 1);
6642
6643 /* save off the new input fflags and data */
6644 kn->kn_sfflags = kev->fflags;
6645 kn->kn_sdata = kev->data;
6646
6647 /* determine if changes result in fired events */
6648 retval = filt_soread_common(kn, NULL, so);
6649
6650 socket_unlock(so, 1);
6651
6652 return retval;
6653 }
6654
6655 static int
6656 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6657 {
6658 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6659 int retval;
6660
6661 socket_lock(so, 1);
6662 retval = filt_soread_common(kn, kev, so);
6663 socket_unlock(so, 1);
6664
6665 return retval;
6666 }
6667
6668 int
6669 so_wait_for_if_feedback(struct socket *so)
6670 {
6671 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6672 (so->so_state & SS_ISCONNECTED)) {
6673 struct inpcb *inp = sotoinpcb(so);
6674 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6675 return 1;
6676 }
6677 }
6678 return 0;
6679 }
6680
6681 static int
6682 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6683 {
6684 int ret = 0;
6685 int64_t data = sbspace(&so->so_snd);
6686
6687 if (so->so_state & SS_CANTSENDMORE) {
6688 kn->kn_flags |= EV_EOF;
6689 kn->kn_fflags = so->so_error;
6690 ret = 1;
6691 goto out;
6692 }
6693
6694 if (so->so_error) { /* temporary udp error */
6695 ret = 1;
6696 goto out;
6697 }
6698
6699 if (!socanwrite(so)) {
6700 ret = 0;
6701 goto out;
6702 }
6703
6704 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6705 ret = 1;
6706 goto out;
6707 }
6708
6709 int64_t lowwat = so->so_snd.sb_lowat;
6710
6711 if (kn->kn_sfflags & NOTE_LOWAT) {
6712 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6713 lowwat = so->so_snd.sb_hiwat;
6714 } else if (kn->kn_sdata > lowwat) {
6715 lowwat = kn->kn_sdata;
6716 }
6717 }
6718
6719 if (data >= lowwat) {
6720 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6721 #if (DEBUG || DEVELOPMENT)
6722 && so_notsent_lowat_check == 1
6723 #endif /* DEBUG || DEVELOPMENT */
6724 ) {
6725 if ((SOCK_DOM(so) == PF_INET ||
6726 SOCK_DOM(so) == PF_INET6) &&
6727 so->so_type == SOCK_STREAM) {
6728 ret = tcp_notsent_lowat_check(so);
6729 }
6730 #if MPTCP
6731 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6732 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6733 ret = mptcp_notsent_lowat_check(so);
6734 }
6735 #endif
6736 else {
6737 ret = 1;
6738 goto out;
6739 }
6740 } else {
6741 ret = 1;
6742 }
6743 }
6744 if (so_wait_for_if_feedback(so)) {
6745 ret = 0;
6746 }
6747
6748 out:
6749 if (ret && kev) {
6750 knote_fill_kevent(kn, kev, data);
6751 }
6752 return ret;
6753 }
6754
6755 static int
6756 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6757 {
6758 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6759
6760 /* socket locked */
6761 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6762 so->so_snd.sb_flags |= SB_KNOTE;
6763 }
6764
6765 /* determine if it's already fired */
6766 return filt_sowrite_common(kn, NULL, so);
6767 }
6768
6769 static void
6770 filt_sowdetach(struct knote *kn)
6771 {
6772 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6773 socket_lock(so, 1);
6774
6775 if (so->so_snd.sb_flags & SB_KNOTE) {
6776 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6777 so->so_snd.sb_flags &= ~SB_KNOTE;
6778 }
6779 }
6780 socket_unlock(so, 1);
6781 }
6782
6783 /*ARGSUSED*/
6784 static int
6785 filt_sowrite(struct knote *kn, long hint)
6786 {
6787 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6788 int ret;
6789
6790 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6791 socket_lock(so, 1);
6792 }
6793
6794 ret = filt_sowrite_common(kn, NULL, so);
6795
6796 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6797 socket_unlock(so, 1);
6798 }
6799
6800 return ret;
6801 }
6802
6803 static int
6804 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6805 {
6806 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6807 int ret;
6808
6809 socket_lock(so, 1);
6810
6811 /* save off the new input fflags and data */
6812 kn->kn_sfflags = kev->fflags;
6813 kn->kn_sdata = kev->data;
6814
6815 /* determine if these changes result in a triggered event */
6816 ret = filt_sowrite_common(kn, NULL, so);
6817
6818 socket_unlock(so, 1);
6819
6820 return ret;
6821 }
6822
6823 static int
6824 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6825 {
6826 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6827 int ret;
6828
6829 socket_lock(so, 1);
6830 ret = filt_sowrite_common(kn, kev, so);
6831 socket_unlock(so, 1);
6832
6833 return ret;
6834 }
6835
6836 static int
6837 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6838 struct socket *so, long ev_hint)
6839 {
6840 int ret = 0;
6841 int64_t data = 0;
6842 uint32_t level_trigger = 0;
6843
6844 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6845 kn->kn_fflags |= NOTE_CONNRESET;
6846 }
6847 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6848 kn->kn_fflags |= NOTE_TIMEOUT;
6849 }
6850 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6851 kn->kn_fflags |= NOTE_NOSRCADDR;
6852 }
6853 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6854 kn->kn_fflags |= NOTE_IFDENIED;
6855 }
6856 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6857 kn->kn_fflags |= NOTE_KEEPALIVE;
6858 }
6859 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6860 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6861 }
6862 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6863 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6864 }
6865 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6866 (so->so_state & SS_ISCONNECTED)) {
6867 kn->kn_fflags |= NOTE_CONNECTED;
6868 level_trigger |= NOTE_CONNECTED;
6869 }
6870 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6871 (so->so_state & SS_ISDISCONNECTED)) {
6872 kn->kn_fflags |= NOTE_DISCONNECTED;
6873 level_trigger |= NOTE_DISCONNECTED;
6874 }
6875 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6876 if (so->so_proto != NULL &&
6877 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6878 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6879 }
6880 }
6881
6882 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6883 tcp_notify_ack_active(so)) {
6884 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6885 }
6886
6887 if ((so->so_state & SS_CANTRCVMORE)
6888 #if CONTENT_FILTER
6889 && cfil_sock_data_pending(&so->so_rcv) == 0
6890 #endif /* CONTENT_FILTER */
6891 ) {
6892 kn->kn_fflags |= NOTE_READCLOSED;
6893 level_trigger |= NOTE_READCLOSED;
6894 }
6895
6896 if (so->so_state & SS_CANTSENDMORE) {
6897 kn->kn_fflags |= NOTE_WRITECLOSED;
6898 level_trigger |= NOTE_WRITECLOSED;
6899 }
6900
6901 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6902 (so->so_flags & SOF_SUSPENDED)) {
6903 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6904
6905 /* If resume event was delivered before, reset it */
6906 kn->kn_hook32 &= ~NOTE_RESUME;
6907
6908 kn->kn_fflags |= NOTE_SUSPEND;
6909 level_trigger |= NOTE_SUSPEND;
6910 }
6911
6912 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6913 (so->so_flags & SOF_SUSPENDED) == 0) {
6914 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6915
6916 /* If suspend event was delivered before, reset it */
6917 kn->kn_hook32 &= ~NOTE_SUSPEND;
6918
6919 kn->kn_fflags |= NOTE_RESUME;
6920 level_trigger |= NOTE_RESUME;
6921 }
6922
6923 if (so->so_error != 0) {
6924 ret = 1;
6925 data = so->so_error;
6926 kn->kn_flags |= EV_EOF;
6927 } else {
6928 u_int32_t data32;
6929 get_sockev_state(so, &data32);
6930 data = data32;
6931 }
6932
6933 /* Reset any events that are not requested on this knote */
6934 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6935 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6936
6937 /* Find the level-triggered events that are already delivered */
6938 level_trigger &= kn->kn_hook32;
6939 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6940
6941 /* Do not deliver level-triggered events more than once */
6942 if ((kn->kn_fflags & ~level_trigger) != 0) {
6943 ret = 1;
6944 }
6945
6946 if (ret && kev) {
6947 /*
6948 * Store the state of the events being delivered. This
6949 * state can be used to deliver level-triggered events
6950 * at least once and still avoid waking up the application
6951 * multiple times as long as the event is active.
6952 */
6953 if (kn->kn_fflags != 0) {
6954 kn->kn_hook32 |= (kn->kn_fflags &
6955 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6956 }
6957
6958 /*
6959 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6960 * only one of them and remember which one was
6961 * delivered last
6962 */
6963 if (kn->kn_fflags & NOTE_SUSPEND) {
6964 kn->kn_hook32 &= ~NOTE_RESUME;
6965 }
6966 if (kn->kn_fflags & NOTE_RESUME) {
6967 kn->kn_hook32 &= ~NOTE_SUSPEND;
6968 }
6969
6970 knote_fill_kevent(kn, kev, data);
6971 }
6972 return ret;
6973 }
6974
6975 static int
6976 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6977 {
6978 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6979
6980 /* socket locked */
6981 kn->kn_hook32 = 0;
6982 if (KNOTE_ATTACH(&so->so_klist, kn)) {
6983 so->so_flags |= SOF_KNOTE;
6984 }
6985
6986 /* determine if event already fired */
6987 return filt_sockev_common(kn, NULL, so, 0);
6988 }
6989
6990 static void
6991 filt_sockdetach(struct knote *kn)
6992 {
6993 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6994 socket_lock(so, 1);
6995
6996 if ((so->so_flags & SOF_KNOTE) != 0) {
6997 if (KNOTE_DETACH(&so->so_klist, kn)) {
6998 so->so_flags &= ~SOF_KNOTE;
6999 }
7000 }
7001 socket_unlock(so, 1);
7002 }
7003
7004 static int
7005 filt_sockev(struct knote *kn, long hint)
7006 {
7007 int ret = 0, locked = 0;
7008 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7009 long ev_hint = (hint & SO_FILT_HINT_EV);
7010
7011 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7012 socket_lock(so, 1);
7013 locked = 1;
7014 }
7015
7016 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7017
7018 if (locked) {
7019 socket_unlock(so, 1);
7020 }
7021
7022 return ret;
7023 }
7024
7025
7026
7027 /*
7028 * filt_socktouch - update event state
7029 */
7030 static int
7031 filt_socktouch(
7032 struct knote *kn,
7033 struct kevent_qos_s *kev)
7034 {
7035 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7036 uint32_t changed_flags;
7037 int ret;
7038
7039 socket_lock(so, 1);
7040
7041 /* save off the [result] data and fflags */
7042 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7043
7044 /* save off the new input fflags and data */
7045 kn->kn_sfflags = kev->fflags;
7046 kn->kn_sdata = kev->data;
7047
7048 /* restrict the current results to the (smaller?) set of new interest */
7049 /*
7050 * For compatibility with previous implementations, we leave kn_fflags
7051 * as they were before.
7052 */
7053 //kn->kn_fflags &= kev->fflags;
7054
7055 /*
7056 * Since we keep track of events that are already
7057 * delivered, if any of those events are no longer
7058 * requested, the state related to them can be reset
7059 */
7060 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7061
7062 /* determine if we have events to deliver */
7063 ret = filt_sockev_common(kn, NULL, so, 0);
7064
7065 socket_unlock(so, 1);
7066
7067 return ret;
7068 }
7069
7070 /*
7071 * filt_sockprocess - query event fired state and return data
7072 */
7073 static int
7074 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7075 {
7076 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7077 int ret = 0;
7078
7079 socket_lock(so, 1);
7080
7081 ret = filt_sockev_common(kn, kev, so, 0);
7082
7083 socket_unlock(so, 1);
7084
7085 return ret;
7086 }
7087
7088 void
7089 get_sockev_state(struct socket *so, u_int32_t *statep)
7090 {
7091 u_int32_t state = *(statep);
7092
7093 /*
7094 * If the state variable was already set by a previous event,
7095 * leave it unchanged.
7096 */
7097 if (state != 0) {
7098 return;
7099 }
7100
7101 if (so->so_state & SS_ISCONNECTED) {
7102 state |= SOCKEV_CONNECTED;
7103 } else {
7104 state &= ~(SOCKEV_CONNECTED);
7105 }
7106 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7107 *(statep) = state;
7108 }
7109
7110 #define SO_LOCK_HISTORY_STR_LEN \
7111 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
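/*
 * The length above budgets 2 * SO_LCKDBG_MAX pointers printed as "%p:%p ":
 * each pointer takes at most 2 ("0x") + 2 * sizeof (void *) hex digits plus
 * one separator character (':' or ' '), and one extra byte is reserved for
 * the terminating NUL.
 */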
7112
7113 __private_extern__ const char *
7114 solockhistory_nr(struct socket *so)
7115 {
7116 size_t n = 0;
7117 int i;
7118 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7119
7120 bzero(lock_history_str, sizeof(lock_history_str));
7121 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7122 n += scnprintf(lock_history_str + n,
7123 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7124 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7125 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7126 }
7127 return lock_history_str;
7128 }
7129
7130 lck_mtx_t *
7131 socket_getlock(struct socket *so, int flags)
7132 {
7133 if (so->so_proto->pr_getlock != NULL) {
7134 return (*so->so_proto->pr_getlock)(so, flags);
7135 } else {
7136 return so->so_proto->pr_domain->dom_mtx;
7137 }
7138 }
7139
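/*
 * Socket locking helpers: socket_lock() and socket_unlock() use the
 * protocol-provided pr_lock/pr_unlock when present, otherwise the domain
 * mutex.  In the domain-mutex path a non-zero refcount argument also
 * adjusts so_usecount, and the caller's return address is recorded in the
 * lock_lr/unlock_lr rings consumed by solockhistory_nr().  Typical pattern:
 *
 *	socket_lock(so, 1);	// lock and take a use-count reference
 *	...			// operate on the socket
 *	socket_unlock(so, 1);	// drop the reference and unlock
 */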
7140 void
7141 socket_lock(struct socket *so, int refcount)
7142 {
7143 void *lr_saved;
7144
7145 lr_saved = __builtin_return_address(0);
7146
7147 if (so->so_proto->pr_lock) {
7148 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7149 } else {
7150 #ifdef MORE_LOCKING_DEBUG
7151 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7152 LCK_MTX_ASSERT_NOTOWNED);
7153 #endif
7154 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7155 if (refcount) {
7156 so->so_usecount++;
7157 }
7158 so->lock_lr[so->next_lock_lr] = lr_saved;
7159 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7160 }
7161 }
7162
7163 void
7164 socket_lock_assert_owned(struct socket *so)
7165 {
7166 lck_mtx_t *mutex_held;
7167
7168 if (so->so_proto->pr_getlock != NULL) {
7169 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7170 } else {
7171 mutex_held = so->so_proto->pr_domain->dom_mtx;
7172 }
7173
7174 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7175 }
7176
7177 int
7178 socket_try_lock(struct socket *so)
7179 {
7180 lck_mtx_t *mtx;
7181
7182 if (so->so_proto->pr_getlock != NULL) {
7183 mtx = (*so->so_proto->pr_getlock)(so, 0);
7184 } else {
7185 mtx = so->so_proto->pr_domain->dom_mtx;
7186 }
7187
7188 return lck_mtx_try_lock(mtx);
7189 }
7190
7191 void
7192 socket_unlock(struct socket *so, int refcount)
7193 {
7194 void *lr_saved;
7195 lck_mtx_t *mutex_held;
7196
7197 lr_saved = __builtin_return_address(0);
7198
7199 if (so == NULL || so->so_proto == NULL) {
7200 panic("%s: null so_proto so=%p\n", __func__, so);
7201 /* NOTREACHED */
7202 }
7203
7204 if (so->so_proto->pr_unlock) {
7205 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7206 } else {
7207 mutex_held = so->so_proto->pr_domain->dom_mtx;
7208 #ifdef MORE_LOCKING_DEBUG
7209 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7210 #endif
7211 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7212 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7213
7214 if (refcount) {
7215 if (so->so_usecount <= 0) {
7216 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7217 "lrh=%s", __func__, so->so_usecount, so,
7218 SOCK_DOM(so), so->so_type,
7219 SOCK_PROTO(so), solockhistory_nr(so));
7220 /* NOTREACHED */
7221 }
7222
7223 so->so_usecount--;
7224 if (so->so_usecount == 0) {
7225 sofreelastref(so, 1);
7226 }
7227 }
7228 lck_mtx_unlock(mutex_held);
7229 }
7230 }
7231
7232 /* Called with socket locked, will unlock socket */
7233 void
7234 sofree(struct socket *so)
7235 {
7236 lck_mtx_t *mutex_held;
7237
7238 if (so->so_proto->pr_getlock != NULL) {
7239 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7240 } else {
7241 mutex_held = so->so_proto->pr_domain->dom_mtx;
7242 }
7243 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7244
7245 sofreelastref(so, 0);
7246 }
7247
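/*
 * soreference() takes a use-count reference without leaving the socket
 * locked; sodereference() drops one reference and, via the unlock path
 * above, frees the socket when the last reference goes away.
 */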
7248 void
7249 soreference(struct socket *so)
7250 {
7251 socket_lock(so, 1); /* locks & takes one reference on socket */
7252 socket_unlock(so, 0); /* unlock only */
7253 }
7254
7255 void
7256 sodereference(struct socket *so)
7257 {
7258 socket_lock(so, 0);
7259 socket_unlock(so, 1);
7260 }
7261
7262 /*
7263 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7264 * possibility of using jumbo clusters. The caller must hold
7265 * the socket lock.
7266 */
7267 void
7268 somultipages(struct socket *so, boolean_t set)
7269 {
7270 if (set) {
7271 so->so_flags |= SOF_MULTIPAGES;
7272 } else {
7273 so->so_flags &= ~SOF_MULTIPAGES;
7274 }
7275 }
7276
7277 void
7278 soif2kcl(struct socket *so, boolean_t set)
7279 {
7280 if (set) {
7281 so->so_flags1 |= SOF1_IF_2KCL;
7282 } else {
7283 so->so_flags1 &= ~SOF1_IF_2KCL;
7284 }
7285 }
7286
7287 int
7288 so_isdstlocal(struct socket *so)
7289 {
7290 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7291
7292 if (SOCK_DOM(so) == PF_INET) {
7293 return inaddr_local(inp->inp_faddr);
7294 } else if (SOCK_DOM(so) == PF_INET6) {
7295 return in6addr_local(&inp->in6p_faddr);
7296 }
7297
7298 return 0;
7299 }
7300
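/*
 * Defuncting is a two-step process: sosetdefunct() checks eligibility, marks
 * the socket SOF_DEFUNCT and sets SB_DROP on both socket buffers so no
 * further data is appended; sodefunct() then shuts down both directions,
 * disconnects, and releases whatever data is still queued in the buffers.
 */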
7301 int
7302 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7303 {
7304 struct sockbuf *rcv, *snd;
7305 int err = 0, defunct;
7306
7307 rcv = &so->so_rcv;
7308 snd = &so->so_snd;
7309
7310 defunct = (so->so_flags & SOF_DEFUNCT);
7311 if (defunct) {
7312 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7313 panic("%s: SB_DROP not set", __func__);
7314 /* NOTREACHED */
7315 }
7316 goto done;
7317 }
7318
7319 if (so->so_flags & SOF_NODEFUNCT) {
7320 if (noforce) {
7321 err = EOPNOTSUPP;
7322 if (p != PROC_NULL) {
7323 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7324 "name %s level %d) so 0x%llx [%d,%d] "
7325 "is not eligible for defunct "
7326 "(%d)\n", __func__, proc_selfpid(),
7327 proc_best_name(current_proc()), proc_pid(p),
7328 proc_best_name(p), level,
7329 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7330 SOCK_DOM(so), SOCK_TYPE(so), err);
7331 }
7332 return err;
7333 }
7334 so->so_flags &= ~SOF_NODEFUNCT;
7335 if (p != PROC_NULL) {
7336 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7337 "name %s level %d) so 0x%llx [%d,%d] "
7338 "defunct by force "
7339 "(%d)\n", __func__, proc_selfpid(),
7340 proc_best_name(current_proc()), proc_pid(p),
7341 proc_best_name(p), level,
7342 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7343 SOCK_DOM(so), SOCK_TYPE(so), err);
7344 }
7345 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7346 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7347 struct ifnet *ifp = inp->inp_last_outifp;
7348
7349 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7350 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7351 } else if (so->so_flags & SOF_DELEGATED) {
7352 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7353 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7354 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7355 } else if (noforce && p != PROC_NULL) {
7356 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7357
7358 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7359 so->so_extended_bk_start = net_uptime();
7360 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7361
7362 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7363
7364 err = EOPNOTSUPP;
7365 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7366 "name %s level %d) so 0x%llx [%d,%d] "
7367 "extend bk idle "
7368 "(%d)\n", __func__, proc_selfpid(),
7369 proc_best_name(current_proc()), proc_pid(p),
7370 proc_best_name(p), level,
7371 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7372 SOCK_DOM(so), SOCK_TYPE(so), err);
7373 return err;
7374 } else {
7375 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7376 }
7377 }
7378
7379 so->so_flags |= SOF_DEFUNCT;
7380
7381 /* Prevent further data from being appended to the socket buffers */
7382 snd->sb_flags |= SB_DROP;
7383 rcv->sb_flags |= SB_DROP;
7384
7385 /* Flush any existing data in the socket buffers */
7386 if (rcv->sb_cc != 0) {
7387 rcv->sb_flags &= ~SB_SEL;
7388 selthreadclear(&rcv->sb_sel);
7389 sbrelease(rcv);
7390 }
7391 if (snd->sb_cc != 0) {
7392 snd->sb_flags &= ~SB_SEL;
7393 selthreadclear(&snd->sb_sel);
7394 sbrelease(snd);
7395 }
7396
7397 done:
7398 if (p != PROC_NULL) {
7399 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7400 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7401 proc_selfpid(), proc_best_name(current_proc()),
7402 proc_pid(p), proc_best_name(p), level,
7403 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7404 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7405 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7406 " extbkidle" : "");
7407 }
7408 return err;
7409 }
7410
7411 int
7412 sodefunct(struct proc *p, struct socket *so, int level)
7413 {
7414 struct sockbuf *rcv, *snd;
7415
7416 if (!(so->so_flags & SOF_DEFUNCT)) {
7417 panic("%s improperly called", __func__);
7418 /* NOTREACHED */
7419 }
7420 if (so->so_state & SS_DEFUNCT) {
7421 goto done;
7422 }
7423
7424 rcv = &so->so_rcv;
7425 snd = &so->so_snd;
7426
7427 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7428 char s[MAX_IPv6_STR_LEN];
7429 char d[MAX_IPv6_STR_LEN];
7430 struct inpcb *inp = sotoinpcb(so);
7431
7432 if (p != PROC_NULL) {
7433 SODEFUNCTLOG(
7434 "%s[%d, %s]: (target pid %d name %s level %d) "
7435 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7436 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7437 " snd_fl 0x%x]\n", __func__,
7438 proc_selfpid(), proc_best_name(current_proc()),
7439 proc_pid(p), proc_best_name(p), level,
7440 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7441 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7442 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7443 (void *)&inp->inp_laddr.s_addr :
7444 (void *)&inp->in6p_laddr),
7445 s, sizeof(s)), ntohs(inp->in6p_lport),
7446 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7447 (void *)&inp->inp_faddr.s_addr :
7448 (void *)&inp->in6p_faddr,
7449 d, sizeof(d)), ntohs(inp->in6p_fport),
7450 (uint32_t)rcv->sb_sel.si_flags,
7451 (uint32_t)snd->sb_sel.si_flags,
7452 rcv->sb_flags, snd->sb_flags);
7453 }
7454 } else if (p != PROC_NULL) {
7455 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7456 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7457 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7458 proc_selfpid(), proc_best_name(current_proc()),
7459 proc_pid(p), proc_best_name(p), level,
7460 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7461 SOCK_DOM(so), SOCK_TYPE(so),
7462 (uint32_t)rcv->sb_sel.si_flags,
7463 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7464 snd->sb_flags);
7465 }
7466
7467 /*
7468 * Unwedge threads blocked on sbwait() and sb_lock().
7469 */
7470 sbwakeup(rcv);
7471 sbwakeup(snd);
7472
7473 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7474 if (rcv->sb_flags & SB_LOCK) {
7475 sbunlock(rcv, TRUE); /* keep socket locked */
7476 }
7477 if (snd->sb_flags & SB_LOCK) {
7478 sbunlock(snd, TRUE); /* keep socket locked */
7479 }
7480 /*
7481 * Flush the buffers and disconnect. We explicitly call shutdown
7482 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7483 * states are set for the socket. This would also flush out data
7484 * hanging off the receive list of this socket.
7485 */
7486 (void) soshutdownlock_final(so, SHUT_RD);
7487 (void) soshutdownlock_final(so, SHUT_WR);
7488 (void) sodisconnectlocked(so);
7489
7490 /*
7491 * Explicitly handle connectionless-protocol disconnection
7492 * and release any remaining data in the socket buffers.
7493 */
7494 if (!(so->so_state & SS_ISDISCONNECTED)) {
7495 (void) soisdisconnected(so);
7496 }
7497
7498 if (so->so_error == 0) {
7499 so->so_error = EBADF;
7500 }
7501
7502 if (rcv->sb_cc != 0) {
7503 rcv->sb_flags &= ~SB_SEL;
7504 selthreadclear(&rcv->sb_sel);
7505 sbrelease(rcv);
7506 }
7507 if (snd->sb_cc != 0) {
7508 snd->sb_flags &= ~SB_SEL;
7509 selthreadclear(&snd->sb_sel);
7510 sbrelease(snd);
7511 }
7512 so->so_state |= SS_DEFUNCT;
7513 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7514
7515 done:
7516 return 0;
7517 }
7518
7519 int
7520 soresume(struct proc *p, struct socket *so, int locked)
7521 {
7522 if (locked == 0) {
7523 socket_lock(so, 1);
7524 }
7525
7526 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7527 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7528 "[%d,%d] resumed from bk idle\n",
7529 __func__, proc_selfpid(), proc_best_name(current_proc()),
7530 proc_pid(p), proc_best_name(p),
7531 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7532 SOCK_DOM(so), SOCK_TYPE(so));
7533
7534 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7535 so->so_extended_bk_start = 0;
7536 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7537
7538 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7539 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7540 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7541 }
7542 if (locked == 0) {
7543 socket_unlock(so, 1);
7544 }
7545
7546 return 0;
7547 }
7548
7549 /*
7550 * Does not attempt to account for sockets that are delegated from
7551 * the current process
7552 */
7553 int
7554 so_set_extended_bk_idle(struct socket *so, int optval)
7555 {
7556 int error = 0;
7557
7558 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7559 SOCK_PROTO(so) != IPPROTO_TCP) {
7560 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7561 error = EOPNOTSUPP;
7562 } else if (optval == 0) {
7563 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7564
7565 soresume(current_proc(), so, 1);
7566 } else {
7567 struct proc *p = current_proc();
7568 int i;
7569 struct filedesc *fdp;
7570 int count = 0;
7571
7572 /*
7573 * Unlock the socket to avoid a lock ordering issue with
7574 * the proc fd table lock
7575 */
7576 socket_unlock(so, 0);
7577
7578 proc_fdlock(p);
7579
7580 fdp = p->p_fd;
7581 for (i = 0; i < fdp->fd_nfiles; i++) {
7582 struct fileproc *fp = fdp->fd_ofiles[i];
7583 struct socket *so2;
7584
7585 if (fp == NULL ||
7586 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7587 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7588 continue;
7589 }
7590
7591 so2 = (struct socket *)fp->f_fglob->fg_data;
7592 if (so != so2 &&
7593 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7594 count++;
7595 }
7596 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7597 break;
7598 }
7599 }
7600 proc_fdunlock(p);
7601
7602 socket_lock(so, 0);
7603
7604 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7605 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7606 error = EBUSY;
7607 } else if (so->so_flags & SOF_DELEGATED) {
7608 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7609 error = EBUSY;
7610 } else {
7611 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7612 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7613 }
7614 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7615 "%s marked for extended bk idle\n",
7616 __func__, proc_selfpid(), proc_best_name(current_proc()),
7617 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7618 SOCK_DOM(so), SOCK_TYPE(so),
7619 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7620 "is" : "not");
7621 }
7622
7623 return error;
7624 }
7625
7626 static void
7627 so_stop_extended_bk_idle(struct socket *so)
7628 {
7629 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7630 so->so_extended_bk_start = 0;
7631
7632 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7633 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7634 /*
7635 * Force defunct
7636 */
7637 sosetdefunct(current_proc(), so,
7638 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7639 if (so->so_flags & SOF_DEFUNCT) {
7640 sodefunct(current_proc(), so,
7641 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7642 }
7643 }
7644
7645 void
7646 so_drain_extended_bk_idle(struct socket *so)
7647 {
7648 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7649 /*
7650 * Only penalize sockets that have outstanding data
7651 */
7652 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7653 so_stop_extended_bk_idle(so);
7654
7655 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7656 }
7657 }
7658 }
7659
7660 /*
7661 * The return value tells whether the socket is still in extended background idle mode
7662 */
7663 int
7664 so_check_extended_bk_idle_time(struct socket *so)
7665 {
7666 int ret = 1;
7667
7668 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7669 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7670 __func__, proc_selfpid(), proc_best_name(current_proc()),
7671 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7672 SOCK_DOM(so), SOCK_TYPE(so));
7673 if (net_uptime() - so->so_extended_bk_start >
7674 soextbkidlestat.so_xbkidle_time) {
7675 so_stop_extended_bk_idle(so);
7676
7677 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7678
7679 ret = 0;
7680 } else {
7681 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7682
7683 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7684 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7685 }
7686 }
7687
7688 return ret;
7689 }
7690
7691 void
7692 resume_proc_sockets(proc_t p)
7693 {
7694 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7695 struct filedesc *fdp;
7696 int i;
7697
7698 proc_fdlock(p);
7699 fdp = p->p_fd;
7700 for (i = 0; i < fdp->fd_nfiles; i++) {
7701 struct fileproc *fp;
7702 struct socket *so;
7703
7704 fp = fdp->fd_ofiles[i];
7705 if (fp == NULL ||
7706 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7707 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7708 continue;
7709 }
7710
7711 so = (struct socket *)fp->f_fglob->fg_data;
7712 (void) soresume(p, so, 0);
7713 }
7714 proc_fdunlock(p);
7715
7716 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7717 }
7718 }
7719
7720 __private_extern__ int
7721 so_set_recv_anyif(struct socket *so, int optval)
7722 {
7723 int ret = 0;
7724
7725 #if INET6
7726 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7727 #else
7728 if (SOCK_DOM(so) == PF_INET) {
7729 #endif /* !INET6 */
7730 if (optval) {
7731 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7732 } else {
7733 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7734 }
7735 }
7736
7737
7738 return ret;
7739 }
7740
7741 __private_extern__ int
7742 so_get_recv_anyif(struct socket *so)
7743 {
7744 int ret = 0;
7745
7746 #if INET6
7747 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7748 #else
7749 if (SOCK_DOM(so) == PF_INET) {
7750 #endif /* !INET6 */
7751 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7752 }
7753
7754 return ret;
7755 }
7756
7757 int
7758 so_set_restrictions(struct socket *so, uint32_t vals)
7759 {
7760 int nocell_old, nocell_new;
7761 int noexpensive_old, noexpensive_new;
7762 int noconstrained_old, noconstrained_new;
7763
7764 /*
7765 * Deny-type restrictions are trapdoors; once set they cannot be
7766 * unset for the lifetime of the socket. This allows them to be
7767 * issued by a framework on behalf of the application without
7768 * having to worry that they can be undone.
7769 *
7770 * Note here that socket-level restrictions override any protocol-
7771 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7772 * restriction issued on the socket has a higher precedence
7773 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7774 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7775 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7776 */
7777 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7778 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7779 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7780 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7781 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7782 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7783 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7784 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7785 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7786
7787 /* we can only set, not clear restrictions */
7788 if ((nocell_new - nocell_old) == 0 &&
7789 (noexpensive_new - noexpensive_old) == 0 &&
7790 (noconstrained_new - noconstrained_old) == 0) {
7791 return 0;
7792 }
7793 #if INET6
7794 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7795 #else
7796 if (SOCK_DOM(so) == PF_INET) {
7797 #endif /* !INET6 */
7798 if (nocell_new - nocell_old != 0) {
7799 /*
7800 * if deny cellular is now set, do what's needed
7801 * for INPCB
7802 */
7803 inp_set_nocellular(sotoinpcb(so));
7804 }
7805 if (noexpensive_new - noexpensive_old != 0) {
7806 inp_set_noexpensive(sotoinpcb(so));
7807 }
7808 if (noconstrained_new - noconstrained_old != 0) {
7809 inp_set_noconstrained(sotoinpcb(so));
7810 }
7811 }
7812
7813 if (SOCK_DOM(so) == PF_MULTIPATH) {
7814 mptcp_set_restrictions(so);
7815 }
7816
7817 return 0;
7818 }
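/*
 * Sketch of how a deny restriction might be applied from user space,
 * assuming the private SO_RESTRICTIONS socket option is the path that
 * reaches so_set_restrictions() (illustrative only):
 *
 *	uint32_t deny = SO_RESTRICT_DENY_CELLULAR;
 *	(void) setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS, &deny, sizeof(deny));
 *
 * Once set, a deny restriction cannot be cleared for the socket's lifetime.
 */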
7819
7820 uint32_t
7821 so_get_restrictions(struct socket *so)
7822 {
7823 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7824 SO_RESTRICT_DENY_OUT |
7825 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7826 }
7827
7828 int
7829 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7830 {
7831 struct proc *ep = PROC_NULL;
7832 int error = 0;
7833
7834 /* pid 0 is reserved for kernel */
7835 if (epid == 0) {
7836 error = EINVAL;
7837 goto done;
7838 }
7839
7840 /*
7841 * If this is an in-kernel socket, prevent its delegate
7842 * association from changing unless the socket option is
7843 * coming from within the kernel itself.
7844 */
7845 if (so->last_pid == 0 && p != kernproc) {
7846 error = EACCES;
7847 goto done;
7848 }
7849
7850 /*
7851 * If this is issued by a process that's recorded as the
7852 * real owner of the socket, or if the pid is the same as
7853 * the process's own pid, then proceed. Otherwise ensure
7854 * that the issuing process has the necessary privileges.
7855 */
7856 if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7857 if ((error = priv_check_cred(kauth_cred_get(),
7858 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7859 error = EACCES;
7860 goto done;
7861 }
7862 }
7863
7864 /* Find the process that corresponds to the effective pid */
7865 if ((ep = proc_find(epid)) == PROC_NULL) {
7866 error = ESRCH;
7867 goto done;
7868 }
7869
7870 /*
7871 * If a process tries to delegate the socket to itself, then
7872 * there's really nothing to do; treat it as a way for the
7873 * delegate association to be cleared. Note that we check
7874 * the passed-in proc rather than calling proc_selfpid(),
7875 * as we need to check the process issuing the socket option
7876 * which could be kernproc. Given that we don't allow 0 for
7877 * effective pid, it means that a delegated in-kernel socket
7878 * stays delegated during its lifetime (which is probably OK.)
7879 */
7880 if (epid == proc_pid(p)) {
7881 so->so_flags &= ~SOF_DELEGATED;
7882 so->e_upid = 0;
7883 so->e_pid = 0;
7884 uuid_clear(so->e_uuid);
7885 } else {
7886 so->so_flags |= SOF_DELEGATED;
7887 so->e_upid = proc_uniqueid(ep);
7888 so->e_pid = proc_pid(ep);
7889 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7890
7891 #if defined(XNU_TARGET_OS_OSX)
7892 if (ep->p_responsible_pid != so->e_pid) {
7893 proc_t rp = proc_find(ep->p_responsible_pid);
7894 if (rp != PROC_NULL) {
7895 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
7896 so->so_rpid = ep->p_responsible_pid;
7897 proc_rele(rp);
7898 } else {
7899 uuid_clear(so->so_ruuid);
7900 so->so_rpid = -1;
7901 }
7902 }
7903 #endif
7904 }
7905 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7906 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
7907 }
7908 done:
7909 if (error == 0 && net_io_policy_log) {
7910 uuid_string_t buf;
7911
7912 uuid_unparse(so->e_uuid, buf);
7913 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7914 "euuid %s%s\n", __func__, proc_name_address(p),
7915 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7916 SOCK_DOM(so), SOCK_TYPE(so),
7917 so->e_pid, proc_name_address(ep), buf,
7918 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7919 } else if (error != 0 && net_io_policy_log) {
7920 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7921 "ERROR (%d)\n", __func__, proc_name_address(p),
7922 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7923 SOCK_DOM(so), SOCK_TYPE(so),
7924 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7925 proc_name_address(ep), error);
7926 }
7927
7928 /* Update this socket's policy upon success */
7929 if (error == 0) {
7930 so->so_policy_gencnt *= -1;
7931 so_update_policy(so);
7932 #if NECP
7933 so_update_necp_policy(so, NULL, NULL);
7934 #endif /* NECP */
7935 }
7936
7937 if (ep != PROC_NULL) {
7938 proc_rele(ep);
7939 }
7940
7941 return error;
7942 }
7943
7944 int
7945 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
7946 {
7947 uuid_string_t buf;
7948 uuid_t uuid;
7949 int error = 0;
7950
7951 /* UUID must not be all-zeroes (reserved for kernel) */
7952 if (uuid_is_null(euuid)) {
7953 error = EINVAL;
7954 goto done;
7955 }
7956
7957 /*
7958 * If this is an in-kernel socket, prevent its delegate
7959 * association from changing unless the socket option is
7960 * coming from within the kernel itself.
7961 */
7962 if (so->last_pid == 0 && p != kernproc) {
7963 error = EACCES;
7964 goto done;
7965 }
7966
7967 /* Get the UUID of the issuing process */
7968 proc_getexecutableuuid(p, uuid, sizeof(uuid));
7969
7970 /*
7971 * If this is issued by a process that's recorded as the
7972 * real owner of the socket, or if the uuid is the same as
7973 * the process's own uuid, then proceed. Otherwise ensure
7974 * that the issuing process has the necessary privileges.
7975 */
7976 if (check_cred &&
7977 (uuid_compare(euuid, so->last_uuid) != 0 ||
7978 uuid_compare(euuid, uuid) != 0)) {
7979 if ((error = priv_check_cred(kauth_cred_get(),
7980 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7981 error = EACCES;
7982 goto done;
7983 }
7984 }
7985
7986 /*
7987 * If a process tries to delegate the socket to itself, then
7988 * there's really nothing to do; treat it as a way for the
7989 * delegate association to be cleared. Note that we check
7990 * the uuid of the passed-in proc rather than that of the
7991 * current process, as we need to check the process issuing
7992 * the socket option which could be kernproc itself. Given
7993 * that we don't allow 0 for effective uuid, it means that
7994 * a delegated in-kernel socket stays delegated during its
7995 * lifetime (which is okay.)
7996 */
7997 if (uuid_compare(euuid, uuid) == 0) {
7998 so->so_flags &= ~SOF_DELEGATED;
7999 so->e_upid = 0;
8000 so->e_pid = 0;
8001 uuid_clear(so->e_uuid);
8002 } else {
8003 so->so_flags |= SOF_DELEGATED;
8004 /*
8005 * Unlike so_set_effective_pid(), we only have the UUID
8006 * here and the process ID is not known. Inherit the
8007 * real {pid,upid} of the socket.
8008 */
8009 so->e_upid = so->last_upid;
8010 so->e_pid = so->last_pid;
8011 uuid_copy(so->e_uuid, euuid);
8012 }
8013 /*
8014 * The following will clear the effective process name as it's the same
8015 * as the real process
8016 */
8017 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
8018 (*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
8019 }
8020 done:
8021 if (error == 0 && net_io_policy_log) {
8022 uuid_unparse(so->e_uuid, buf);
8023 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
8024 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
8025 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8026 SOCK_TYPE(so), so->e_pid, buf,
8027 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8028 } else if (error != 0 && net_io_policy_log) {
8029 uuid_unparse(euuid, buf);
8030 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
8031 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
8032 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
8033 SOCK_TYPE(so), buf, error);
8034 }
8035
8036 /* Update this socket's policy upon success */
8037 if (error == 0) {
8038 so->so_policy_gencnt *= -1;
8039 so_update_policy(so);
8040 #if NECP
8041 so_update_necp_policy(so, NULL, NULL);
8042 #endif /* NECP */
8043 }
8044
8045 return error;
8046 }
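/*
 * Sketch of the user-space side of delegation, assuming the private
 * SO_DELEGATED (pid) and SO_DELEGATED_UUID options are what funnel into
 * so_set_effective_pid() and so_set_effective_uuid() above (illustrative
 * only; delegating on behalf of another process requires the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege):
 *
 *	pid_t epid = target_pid;
 *	(void) setsockopt(s, SOL_SOCKET, SO_DELEGATED, &epid, sizeof(epid));
 */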
8047
8048 void
8049 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8050 uint32_t ev_datalen)
8051 {
8052 struct kev_msg ev_msg;
8053
8054 /*
8055 * A netpolicy event always starts with a netpolicy_event_data
8056 * structure, but the caller can provide for a longer event
8057 * structure to post, depending on the event code.
8058 */
8059 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8060
8061 bzero(&ev_msg, sizeof(ev_msg));
8062 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8063 ev_msg.kev_class = KEV_NETWORK_CLASS;
8064 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8065 ev_msg.event_code = ev_code;
8066
8067 ev_msg.dv[0].data_ptr = ev_data;
8068 ev_msg.dv[0].data_length = ev_datalen;
8069
8070 kev_post_msg(&ev_msg);
8071 }
8072
8073 void
8074 socket_post_kev_msg(uint32_t ev_code,
8075 struct kev_socket_event_data *ev_data,
8076 uint32_t ev_datalen)
8077 {
8078 struct kev_msg ev_msg;
8079
8080 bzero(&ev_msg, sizeof(ev_msg));
8081 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8082 ev_msg.kev_class = KEV_NETWORK_CLASS;
8083 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8084 ev_msg.event_code = ev_code;
8085
8086 ev_msg.dv[0].data_ptr = ev_data;
8087 ev_msg.dv[0].data_length = ev_datalen;
8088
8089 kev_post_msg(&ev_msg);
8090 }
8091
8092 void
8093 socket_post_kev_msg_closed(struct socket *so)
8094 {
8095 struct kev_socket_closed ev;
8096 struct sockaddr *socksa = NULL, *peersa = NULL;
8097 int err;
8098 bzero(&ev, sizeof(ev));
8099 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8100 if (err == 0) {
8101 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8102 &peersa);
8103 if (err == 0) {
8104 memcpy(&ev.ev_data.kev_sockname, socksa,
8105 min(socksa->sa_len,
8106 sizeof(ev.ev_data.kev_sockname)));
8107 memcpy(&ev.ev_data.kev_peername, peersa,
8108 min(peersa->sa_len,
8109 sizeof(ev.ev_data.kev_peername)));
8110 socket_post_kev_msg(KEV_SOCKET_CLOSED,
8111 &ev.ev_data, sizeof(ev));
8112 }
8113 }
8114 if (socksa != NULL) {
8115 FREE(socksa, M_SONAME);
8116 }
8117 if (peersa != NULL) {
8118 FREE(peersa, M_SONAME);
8119 }
8120 }