1 /*
2 * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125
126 #if CONFIG_MACF
127 #include <security/mac_framework.h>
128 #endif /* MAC */
129
130 #if MULTIPATH
131 #include <netinet/mp_pcb.h>
132 #include <netinet/mptcp_var.h>
133 #endif /* MULTIPATH */
134
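/* Round 'a' up to the next multiple of 'b'; assumes 'b' is a power of two. */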
135 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
136
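/*
 * On DEBUG/DEVELOPMENT kernels log socket pointers verbatim; on release
 * kernels permute them with VM_KERNEL_ADDRPERM() so that kernel addresses
 * are not disclosed.
 */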
137 #if DEBUG || DEVELOPMENT
138 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
139 #else
140 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
141 #endif
142
143 /* TODO: this should be in a header file somewhere */
144 extern char *proc_name_address(void *p);
145
146 static u_int32_t so_cache_hw; /* High water mark for socache */
147 static u_int32_t so_cache_timeouts; /* number of timeouts */
148 static u_int32_t so_cache_max_freed; /* max freed per timeout */
149 static u_int32_t cached_sock_count = 0;
150 STAILQ_HEAD(, socket) so_cache_head;
151 int max_cached_sock_count = MAX_CACHED_SOCKETS;
152 static u_int32_t so_cache_time;
153 static int socketinit_done;
154 static struct zone *so_cache_zone;
155
156 static lck_grp_t *so_cache_mtx_grp;
157 static lck_attr_t *so_cache_mtx_attr;
158 static lck_grp_attr_t *so_cache_mtx_grp_attr;
159 static lck_mtx_t *so_cache_mtx;
160
161 #include <machine/limits.h>
162
163 static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
164 static void filt_sordetach(struct knote *kn);
165 static int filt_soread(struct knote *kn, long hint);
166 static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
167 static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
168
169 static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
170 static void filt_sowdetach(struct knote *kn);
171 static int filt_sowrite(struct knote *kn, long hint);
172 static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
173 static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
174
175 static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
176 static void filt_sockdetach(struct knote *kn);
177 static int filt_sockev(struct knote *kn, long hint);
178 static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
179 static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
180
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 .f_isfd = 1,
186 .f_attach = filt_sorattach,
187 .f_detach = filt_sordetach,
188 .f_event = filt_soread,
189 .f_touch = filt_sortouch,
190 .f_process = filt_sorprocess,
191 };
192
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 .f_isfd = 1,
195 .f_attach = filt_sowattach,
196 .f_detach = filt_sowdetach,
197 .f_event = filt_sowrite,
198 .f_touch = filt_sowtouch,
199 .f_process = filt_sowprocess,
200 };
201
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 .f_isfd = 1,
204 .f_attach = filt_sockattach,
205 .f_detach = filt_sockdetach,
206 .f_event = filt_sockev,
207 .f_touch = filt_socktouch,
208 .f_process = filt_sockprocess,
209 };
210
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 .f_isfd = 1,
213 .f_attach = filt_sorattach,
214 .f_detach = filt_sordetach,
215 .f_event = filt_soread,
216 .f_touch = filt_sortouch,
217 .f_process = filt_sorprocess,
218 };
219
220 SYSCTL_DECL(_kern_ipc);
221
222 #define EVEN_MORE_LOCKING_DEBUG 0
223
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230 &sodefunct_calls, "");
231
232 static int socket_zone = M_SOCKET;
233 so_gen_t so_gencnt; /* generation count for sockets */
234
235 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
236 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
237
238 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
239 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
240 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
241 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
242 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
243 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
244 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
245 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
246 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
247
248 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
249
250 int somaxconn = SOMAXCONN;
251 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
252 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
253
254 /* Should we get a maximum also ??? */
255 static int sosendmaxchain = 65536;
256 static int sosendminchain = 16384;
257 static int sorecvmincopy = 16384;
258 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
259 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
260 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
261 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
262
263 /*
264 * Set to enable jumbo clusters (if available) for large writes when
265 * the socket is marked with SOF_MULTIPAGES; see below.
266 */
267 int sosendjcl = 1;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
269 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
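/* Illustrative run-time tuning (root required): sysctl -w kern.ipc.sosendjcl=0 */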
270
271 /*
272 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
273 * writes on the socket for all protocols on any network interfaces,
274 * depending upon sosendjcl above. Be extra careful when setting this
275 * to 1, because sending down packets that cross physical pages down to
276 * broken drivers (those that falsely assume that the physical pages
277 * are contiguous) might lead to system panics or silent data corruption.
278 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
279 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
280 * capable. Set this to 1 only for testing/debugging purposes.
281 */
282 int sosendjcl_ignore_capab = 0;
283 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
284 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
285
286 /*
287 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
288 * writes on the socket for all protocols on any network interfaces.
289 * Be extra careful when setting this to 1, because sending down packets with
290 * clusters larger than 2 KB might lead to system panics or data corruption.
291 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
292 * on the outgoing interface.
293 * Set this to 1 for testing/debugging purposes only.
294 */
295 int sosendbigcl_ignore_capab = 0;
296 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
297 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
298
299 int sodefunctlog = 0;
300 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
301 &sodefunctlog, 0, "");
302
303 int sothrottlelog = 0;
304 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
305 &sothrottlelog, 0, "");
306
307 int sorestrictrecv = 1;
308 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
309 &sorestrictrecv, 0, "Enable inbound interface restrictions");
310
311 int sorestrictsend = 1;
312 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
313 &sorestrictsend, 0, "Enable outbound interface restrictions");
314
315 int soreserveheadroom = 1;
316 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
317 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
318
319 #if (DEBUG || DEVELOPMENT)
320 int so_notsent_lowat_check = 1;
321 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
322 &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
323 #endif /* DEBUG || DEVELOPMENT */
324
325 int so_accept_list_waits = 0;
326 #if (DEBUG || DEVELOPMENT)
327 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
328 &so_accept_list_waits, 0, "number of waits for listener incomp list");
329 #endif /* DEBUG || DEVELOPMENT */
330
331 extern struct inpcbinfo tcbinfo;
332
333 /* TODO: these should be in header file */
334 extern int get_inpcb_str_size(void);
335 extern int get_tcp_str_size(void);
336
337 vm_size_t so_cache_zone_element_size;
338
339 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
340 user_ssize_t *);
341 static void cached_sock_alloc(struct socket **, int);
342 static void cached_sock_free(struct socket *);
343
344 /*
345 * Maximum number of extended background idle sockets per process
346 * Set to zero to disable further setting of the option
347 */
348
349 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
350 #define SO_IDLE_BK_IDLE_TIME 600
351 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
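/* Defaults: 1 socket per process, 600 seconds, 128 KB receive high-water mark. */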
352
353 struct soextbkidlestat soextbkidlestat;
354
355 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
356 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
357 "Maximum of extended background idle sockets per process");
358
359 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
360 &soextbkidlestat.so_xbkidle_time, 0,
361 "Time in seconds to keep extended background idle sockets");
362
363 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
364 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
365 "High water mark for extended background idle sockets");
366
367 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
368 &soextbkidlestat, soextbkidlestat, "");
369
370 int so_set_extended_bk_idle(struct socket *, int);
371
372
373 /*
374 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
375 * setting the DSCP code on the packet based on the service class; see
376 * <rdar://problem/11277343> for details.
377 */
378 __private_extern__ u_int32_t sotcdb = 0;
379 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
380 &sotcdb, 0, "");
381
382 void
383 socketinit(void)
384 {
385 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
386 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
387
388 #ifdef __LP64__
389 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
391 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
392 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
393 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
394 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
395 #else
396 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
398 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
399 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
400 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
401 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
402 #endif
403
404 if (socketinit_done) {
405 printf("socketinit: already called...\n");
406 return;
407 }
408 socketinit_done = 1;
409
410 PE_parse_boot_argn("socket_debug", &socket_debug,
411 sizeof(socket_debug));
412
413 /*
414 * allocate lock group attribute and group for socket cache mutex
415 */
416 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
417 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
418 so_cache_mtx_grp_attr);
419
420 /*
421 * allocate the lock attribute for socket cache mutex
422 */
423 so_cache_mtx_attr = lck_attr_alloc_init();
424
425 /* cached sockets mutex */
426 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
427 if (so_cache_mtx == NULL) {
428 panic("%s: unable to allocate so_cache_mtx\n", __func__);
429 /* NOTREACHED */
430 }
431 STAILQ_INIT(&so_cache_head);
432
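/*
 * Each cache element holds the socket plus the saved inpcb and TCP pcb that
 * cached_sock_alloc() carves out of the same allocation; the two extra
 * 4-byte slops leave room for the longword alignment done there.
 */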
433 so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
434 + get_inpcb_str_size() + 4 + get_tcp_str_size());
435
436 so_cache_zone = zinit(so_cache_zone_element_size,
437 (120000 * so_cache_zone_element_size), 8192, "socache zone");
438 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
439 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
440
441 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
442 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
443 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
444 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
445
446 in_pcbinit();
447 sflt_init();
448 socket_tclass_init();
449 #if MULTIPATH
450 mp_pcbinit();
451 #endif /* MULTIPATH */
452 }
453
454 static void
455 cached_sock_alloc(struct socket **so, int waitok)
456 {
457 caddr_t temp;
458 uintptr_t offset;
459
460 lck_mtx_lock(so_cache_mtx);
461
462 if (!STAILQ_EMPTY(&so_cache_head)) {
463 VERIFY(cached_sock_count > 0);
464
465 *so = STAILQ_FIRST(&so_cache_head);
466 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
467 STAILQ_NEXT((*so), so_cache_ent) = NULL;
468
469 cached_sock_count--;
470 lck_mtx_unlock(so_cache_mtx);
471
472 temp = (*so)->so_saved_pcb;
473 bzero((caddr_t)*so, sizeof(struct socket));
474
475 (*so)->so_saved_pcb = temp;
476 } else {
477 lck_mtx_unlock(so_cache_mtx);
478
479 if (waitok) {
480 *so = (struct socket *)zalloc(so_cache_zone);
481 } else {
482 *so = (struct socket *)zalloc_noblock(so_cache_zone);
483 }
484
485 if (*so == NULL) {
486 return;
487 }
488
489 bzero((caddr_t)*so, sizeof(struct socket));
490
491 /*
492 * Define offsets for extra structures into our
493 * single block of memory. Align extra structures
494 * on longword boundaries.
495 */
496
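/*
 * Resulting layout of the single allocation:
 *   [struct socket][pad][inpcb <- so_saved_pcb][pad][tcpcb <- inp_saved_ppcb]
 */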
497 offset = (uintptr_t)*so;
498 offset += sizeof(struct socket);
499
500 offset = ALIGN(offset);
501
502 (*so)->so_saved_pcb = (caddr_t)offset;
503 offset += get_inpcb_str_size();
504
505 offset = ALIGN(offset);
506
507 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
508 (caddr_t)offset;
509 }
510
511 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
512 }
513
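/*
 * Return a cached-zone socket to the free cache, or release it back to the
 * zone when the cache already holds max_cached_sock_count entries.
 */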
514 static void
515 cached_sock_free(struct socket *so)
516 {
517 lck_mtx_lock(so_cache_mtx);
518
519 so_cache_time = net_uptime();
520 if (++cached_sock_count > max_cached_sock_count) {
521 --cached_sock_count;
522 lck_mtx_unlock(so_cache_mtx);
523 zfree(so_cache_zone, so);
524 } else {
525 if (so_cache_hw < cached_sock_count) {
526 so_cache_hw = cached_sock_count;
527 }
528
529 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
530
531 so->cache_timestamp = so_cache_time;
532 lck_mtx_unlock(so_cache_mtx);
533 }
534 }
535
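/*
 * Record the calling process as the socket's most recent owner: refresh
 * last_pid, last_upid and the executable UUID, and let the protocol's
 * pr_update_last_owner hook note the change. Sockets created with
 * sock_socket() keep last_pid == 0 and are left untouched.
 */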
536 void
537 so_update_last_owner_locked(struct socket *so, proc_t self)
538 {
539 if (so->last_pid != 0) {
540 /*
541 * last_pid and last_upid should remain zero for sockets
542 * created using sock_socket. The check above achieves that
543 */
544 if (self == PROC_NULL) {
545 self = current_proc();
546 }
547
548 if (so->last_upid != proc_uniqueid(self) ||
549 so->last_pid != proc_pid(self)) {
550 so->last_upid = proc_uniqueid(self);
551 so->last_pid = proc_pid(self);
552 proc_getexecutableuuid(self, so->last_uuid,
553 sizeof(so->last_uuid));
554 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
555 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
556 }
557 }
558 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
559 }
560 }
561
562 void
563 so_update_policy(struct socket *so)
564 {
565 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
566 (void) inp_update_policy(sotoinpcb(so));
567 }
568 }
569
570 #if NECP
571 static void
572 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
573 struct sockaddr *override_remote_addr)
574 {
575 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
576 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
577 override_remote_addr, 0);
578 }
579 }
580 #endif /* NECP */
581
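/*
 * Reclaim cached sockets that have sat in the cache for at least
 * SO_CACHE_TIME_LIMIT, freeing at most SO_CACHE_MAX_FREE_BATCH per run.
 * Returns TRUE when entries remain so that the caller reschedules the timer.
 */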
582 boolean_t
583 so_cache_timer(void)
584 {
585 struct socket *p;
586 int n_freed = 0;
587 boolean_t rc = FALSE;
588
589 lck_mtx_lock(so_cache_mtx);
590 so_cache_timeouts++;
591 so_cache_time = net_uptime();
592
593 while (!STAILQ_EMPTY(&so_cache_head)) {
594 VERIFY(cached_sock_count > 0);
595 p = STAILQ_FIRST(&so_cache_head);
596 if ((so_cache_time - p->cache_timestamp) <
597 SO_CACHE_TIME_LIMIT) {
598 break;
599 }
600
601 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
602 --cached_sock_count;
603
604 zfree(so_cache_zone, p);
605
606 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
607 so_cache_max_freed++;
608 break;
609 }
610 }
611
612 /* Schedule again if there is more to cleanup */
613 if (!STAILQ_EMPTY(&so_cache_head)) {
614 rc = TRUE;
615 }
616
617 lck_mtx_unlock(so_cache_mtx);
618 return rc;
619 }
620
621 /*
622 * Get a socket structure from our zone, and initialize it.
623 * We don't implement `waitok' yet (see comments in uipc_domain.c).
624 * Note that it would probably be better to allocate socket
625 * and PCB at the same time, but I'm not convinced that all
626 * the protocols can be easily modified to do this.
627 */
628 struct socket *
629 soalloc(int waitok, int dom, int type)
630 {
631 struct socket *so;
632
633 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
634 cached_sock_alloc(&so, waitok);
635 } else {
636 MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone,
637 M_WAITOK);
638 if (so != NULL) {
639 bzero(so, sizeof(*so));
640 }
641 }
642 if (so != NULL) {
643 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
644 so->so_zone = socket_zone;
645
646 /*
647 * Increment the socket allocation statistics
648 */
649 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
650
651 #if CONFIG_MACF_SOCKET
652 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
653 if (mac_socket_label_init(so, !waitok) != 0) {
654 sodealloc(so);
655 return NULL;
656 }
657 #endif /* MAC_SOCKET */
658 }
659
660 return so;
661 }
662
663 int
664 socreate_internal(int dom, struct socket **aso, int type, int proto,
665 struct proc *p, uint32_t flags, struct proc *ep)
666 {
667 struct protosw *prp;
668 struct socket *so;
669 int error = 0;
670
671 #if TCPDEBUG
672 extern int tcpconsdebug;
673 #endif
674
675 VERIFY(aso != NULL);
676 *aso = NULL;
677
678 if (proto != 0) {
679 prp = pffindproto(dom, proto, type);
680 } else {
681 prp = pffindtype(dom, type);
682 }
683
684 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
685 if (pffinddomain(dom) == NULL) {
686 return EAFNOSUPPORT;
687 }
688 if (proto != 0) {
689 if (pffindprotonotype(dom, proto) != NULL) {
690 return EPROTOTYPE;
691 }
692 }
693 return EPROTONOSUPPORT;
694 }
695 if (prp->pr_type != type) {
696 return EPROTOTYPE;
697 }
698 so = soalloc(1, dom, type);
699 if (so == NULL) {
700 return ENOBUFS;
701 }
702
703 switch (dom) {
704 case PF_LOCAL:
705 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
706 break;
707 case PF_INET:
708 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
709 if (type == SOCK_STREAM) {
710 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
711 } else {
712 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
713 }
714 break;
715 case PF_ROUTE:
716 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
717 break;
718 case PF_NDRV:
719 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
720 break;
721 case PF_KEY:
722 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
723 break;
724 case PF_INET6:
725 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
726 if (type == SOCK_STREAM) {
727 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
728 } else {
729 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
730 }
731 break;
732 case PF_SYSTEM:
733 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
734 break;
735 case PF_MULTIPATH:
736 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
737 break;
738 default:
739 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
740 break;
741 }
742
743 if (flags & SOCF_MPTCP) {
744 so->so_state |= SS_NBIO;
745 }
746
747 TAILQ_INIT(&so->so_incomp);
748 TAILQ_INIT(&so->so_comp);
749 so->so_type = type;
750 so->last_upid = proc_uniqueid(p);
751 so->last_pid = proc_pid(p);
752 proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
753 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
754
755 if (ep != PROC_NULL && ep != p) {
756 so->e_upid = proc_uniqueid(ep);
757 so->e_pid = proc_pid(ep);
758 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
759 so->so_flags |= SOF_DELEGATED;
760 }
761
762 so->so_cred = kauth_cred_proc_ref(p);
763 if (!suser(kauth_cred_get(), NULL)) {
764 so->so_state |= SS_PRIV;
765 }
766
767 so->so_proto = prp;
768 so->so_rcv.sb_flags |= SB_RECV;
769 so->so_rcv.sb_so = so->so_snd.sb_so = so;
770 so->next_lock_lr = 0;
771 so->next_unlock_lr = 0;
772
773 #if CONFIG_MACF_SOCKET
774 mac_socket_label_associate(kauth_cred_get(), so);
775 #endif /* MAC_SOCKET */
776
777 /*
778 * Attachment will create the per-pcb lock if necessary and
779 * increase the refcount for creation; make sure this is done
780 * before the socket is inserted in any lists.
781 */
782 so->so_usecount++;
783
784 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
785 if (error != 0) {
786 /*
787 * Warning:
788 * If so_pcb is not zero, the socket will be leaked,
789 * so the protocol attachment handler must be coded carefully.
790 */
791 so->so_state |= SS_NOFDREF;
792 VERIFY(so->so_usecount > 0);
793 so->so_usecount--;
794 sofreelastref(so, 1); /* will deallocate the socket */
795 return error;
796 }
797
798 /*
799 * Note: needs so_pcb to be set after pru_attach
800 */
801 if (prp->pr_update_last_owner != NULL) {
802 (*prp->pr_update_last_owner)(so, p, ep);
803 }
804
805 atomic_add_32(&prp->pr_domain->dom_refs, 1);
806 TAILQ_INIT(&so->so_evlist);
807
808 /* Attach socket filters for this protocol */
809 sflt_initsock(so);
810 #if TCPDEBUG
811 if (tcpconsdebug == 2) {
812 so->so_options |= SO_DEBUG;
813 }
814 #endif
815 so_set_default_traffic_class(so);
816
817 /*
818 * If this thread or task is marked to create backgrounded sockets,
819 * mark the socket as background.
820 */
821 if (!(flags & SOCF_MPTCP) &&
822 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
823 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
824 so->so_background_thread = current_thread();
825 }
826
827 switch (dom) {
828 /*
829 * Don't mark Unix domain or system sockets
830 * eligible for defunct by default.
831 */
832 case PF_LOCAL:
833 case PF_SYSTEM:
834 so->so_flags |= SOF_NODEFUNCT;
835 break;
836 default:
837 break;
838 }
839
840 /*
841 * Entitlements can't be checked at socket creation time except when the
842 * application requested a feature guarded by a privilege (cf. socket
843 * delegation).
844 * The priv(9) and the Sandboxing APIs are designed with the idea that
845 * a privilege check should only be triggered by a userland request.
846 * A privilege check at socket creation time is time-consuming and
847 * could trigger many authorisation error messages from the security
848 * APIs.
849 */
850
851 *aso = so;
852
853 return 0;
854 }
855
856 /*
857 * Returns: 0 Success
858 * EAFNOSUPPORT
859 * EPROTOTYPE
860 * EPROTONOSUPPORT
861 * ENOBUFS
862 * <pru_attach>:ENOBUFS[AF_UNIX]
863 * <pru_attach>:ENOBUFS[TCP]
864 * <pru_attach>:ENOMEM[TCP]
865 * <pru_attach>:??? [other protocol families, IPSEC]
866 */
867 int
868 socreate(int dom, struct socket **aso, int type, int proto)
869 {
870 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
871 PROC_NULL);
872 }
873
874 int
875 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
876 {
877 int error = 0;
878 struct proc *ep = PROC_NULL;
879
880 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
881 error = ESRCH;
882 goto done;
883 }
884
885 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
886
887 /*
888 * It might not be wise to hold the proc reference when calling
889 * socreate_internal since it calls soalloc with M_WAITOK
890 */
891 done:
892 if (ep != PROC_NULL) {
893 proc_rele(ep);
894 }
895
896 return error;
897 }
898
899 /*
900 * Returns: 0 Success
901 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
902 * <pru_bind>:EAFNOSUPPORT Address family not supported
903 * <pru_bind>:EADDRNOTAVAIL Address not available.
904 * <pru_bind>:EINVAL Invalid argument
905 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
906 * <pru_bind>:EACCES Permission denied
907 * <pru_bind>:EADDRINUSE Address in use
908 * <pru_bind>:EAGAIN Resource unavailable, try again
909 * <pru_bind>:EPERM Operation not permitted
910 * <pru_bind>:???
911 * <sf_bind>:???
912 *
913 * Notes: It's not possible to fully enumerate the return codes above,
914 * since socket filter authors and protocol family authors may
915 * not choose to limit their error returns to those listed, even
916 * though this may result in some software operating incorrectly.
917 *
918 * The error codes which are enumerated above are those known to
919 * be returned by the tcp_usr_bind function supplied.
920 */
921 int
922 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
923 {
924 struct proc *p = current_proc();
925 int error = 0;
926
927 if (dolock) {
928 socket_lock(so, 1);
929 }
930
931 so_update_last_owner_locked(so, p);
932 so_update_policy(so);
933
934 #if NECP
935 so_update_necp_policy(so, nam, NULL);
936 #endif /* NECP */
937
938 /*
939 * If this is a bind request on a socket that has been marked
940 * as inactive, reject it now before we go any further.
941 */
942 if (so->so_flags & SOF_DEFUNCT) {
943 error = EINVAL;
944 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
945 __func__, proc_pid(p), proc_best_name(p),
946 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
947 SOCK_DOM(so), SOCK_TYPE(so), error);
948 goto out;
949 }
950
951 /* Socket filter */
952 error = sflt_bind(so, nam);
953
954 if (error == 0) {
955 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
956 }
957 out:
958 if (dolock) {
959 socket_unlock(so, 1);
960 }
961
962 if (error == EJUSTRETURN) {
963 error = 0;
964 }
965
966 return error;
967 }
968
969 void
970 sodealloc(struct socket *so)
971 {
972 kauth_cred_unref(&so->so_cred);
973
974 /* Remove any filters */
975 sflt_termsock(so);
976
977 #if CONTENT_FILTER
978 cfil_sock_detach(so);
979 #endif /* CONTENT_FILTER */
980
981 /* Delete the state allocated for msg queues on a socket */
982 if (so->so_flags & SOF_ENABLE_MSGS) {
983 FREE(so->so_msg_state, M_TEMP);
984 so->so_msg_state = NULL;
985 }
986 VERIFY(so->so_msg_state == NULL);
987
988 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
989
990 #if CONFIG_MACF_SOCKET
991 mac_socket_label_destroy(so);
992 #endif /* MAC_SOCKET */
993
994 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
995 cached_sock_free(so);
996 } else {
997 FREE_ZONE(so, sizeof(*so), so->so_zone);
998 }
999 }
1000
1001 /*
1002 * Returns: 0 Success
1003 * EINVAL
1004 * EOPNOTSUPP
1005 * <pru_listen>:EINVAL[AF_UNIX]
1006 * <pru_listen>:EINVAL[TCP]
1007 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
1008 * <pru_listen>:EINVAL[TCP] Invalid argument
1009 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
1010 * <pru_listen>:EACCES[TCP] Permission denied
1011 * <pru_listen>:EADDRINUSE[TCP] Address in use
1012 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
1013 * <pru_listen>:EPERM[TCP] Operation not permitted
1014 * <sf_listen>:???
1015 *
1016 * Notes: Other <pru_listen> returns depend on the protocol family; all
1017 * <sf_listen> returns depend on what the filter author causes
1018 * their filter to return.
1019 */
1020 int
1021 solisten(struct socket *so, int backlog)
1022 {
1023 struct proc *p = current_proc();
1024 int error = 0;
1025
1026 socket_lock(so, 1);
1027
1028 so_update_last_owner_locked(so, p);
1029 so_update_policy(so);
1030
1031 #if NECP
1032 so_update_necp_policy(so, NULL, NULL);
1033 #endif /* NECP */
1034
1035 if (so->so_proto == NULL) {
1036 error = EINVAL;
1037 goto out;
1038 }
1039 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1040 error = EOPNOTSUPP;
1041 goto out;
1042 }
1043
1044 /*
1045 * If the listen request is made on a socket that is not fully
1046 * disconnected, or on a socket that has been marked as inactive,
1047 * reject the request now.
1048 */
1049 if ((so->so_state &
1050 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1051 (so->so_flags & SOF_DEFUNCT)) {
1052 error = EINVAL;
1053 if (so->so_flags & SOF_DEFUNCT) {
1054 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1055 "(%d)\n", __func__, proc_pid(p),
1056 proc_best_name(p),
1057 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1058 SOCK_DOM(so), SOCK_TYPE(so), error);
1059 }
1060 goto out;
1061 }
1062
1063 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1064 error = EPERM;
1065 goto out;
1066 }
1067
1068 error = sflt_listen(so);
1069 if (error == 0) {
1070 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1071 }
1072
1073 if (error) {
1074 if (error == EJUSTRETURN) {
1075 error = 0;
1076 }
1077 goto out;
1078 }
1079
1080 if (TAILQ_EMPTY(&so->so_comp)) {
1081 so->so_options |= SO_ACCEPTCONN;
1082 }
1083 /*
1084 * POSIX: The implementation may have an upper limit on the length of
1085 * the listen queue, either global or per accepting socket. If backlog
1086 * exceeds this limit, the length of the listen queue is set to the
1087 * limit.
1088 *
1089 * If listen() is called with a backlog argument value that is less
1090 * than 0, the function behaves as if it had been called with a backlog
1091 * argument value of 0.
1092 *
1093 * A backlog argument of 0 may allow the socket to accept connections,
1094 * in which case the length of the listen queue may be set to an
1095 * implementation-defined minimum value.
1096 */
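/*
 * Note: this implementation clamps a zero or negative backlog up to
 * somaxconn (kern.ipc.somaxconn) rather than down to a minimum value.
 */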
1097 if (backlog <= 0 || backlog > somaxconn) {
1098 backlog = somaxconn;
1099 }
1100
1101 so->so_qlimit = backlog;
1102 out:
1103 socket_unlock(so, 1);
1104 return error;
1105 }
1106
1107 /*
1108 * The "accept list lock" protects the fields related to the listener queues
1109 * because we can unlock a socket to respect the lock ordering between
1110 * the listener socket and its client sockets. The lock ordering requires
1111 * acquiring a client socket before its listener socket.
1112 *
1113 * The accept list lock serializes access to the following fields:
1114 * - of the listener socket:
1115 * - so_comp
1116 * - so_incomp
1117 * - so_qlen
1118 * - so_inqlen
1119 * - of client sockets that are in so_comp or so_incomp:
1120 * - so_head
1121 * - so_list
1122 *
1123 * As one can see, the accept list lock protects the consistency of the
1124 * linkage of the client sockets.
1125 *
1126 * Note that those fields may be read without holding the accept list lock
1127 * for a preflight provided the accept list lock is taken when committing
1128 * to take an action based on the result of the preflight. The preflight
1129 * saves the cost of doing the unlock/lock dance.
1130 */
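/*
 * Typical usage, as in sofreelastref():
 *
 *   socket_lock(head, 1);
 *   so_acquire_accept_list(head, so);   (may drop and retake socket locks)
 *   ... examine or modify so_comp / so_incomp ...
 *   so_release_accept_list(head);
 *   socket_unlock(head, 1);
 */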
1131 void
1132 so_acquire_accept_list(struct socket *head, struct socket *so)
1133 {
1134 lck_mtx_t *mutex_held;
1135
1136 if (head->so_proto->pr_getlock == NULL) {
1137 return;
1138 }
1139 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1140 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1141
1142 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1143 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1144 return;
1145 }
1146 if (so != NULL) {
1147 socket_unlock(so, 0);
1148 }
1149 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1150 so_accept_list_waits += 1;
1151 msleep((caddr_t)&head->so_incomp, mutex_held,
1152 PSOCK | PCATCH, __func__, NULL);
1153 }
1154 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1155 if (so != NULL) {
1156 socket_unlock(head, 0);
1157 socket_lock(so, 0);
1158 socket_lock(head, 0);
1159 }
1160 }
1161
1162 void
1163 so_release_accept_list(struct socket *head)
1164 {
1165 if (head->so_proto->pr_getlock != NULL) {
1166 lck_mtx_t *mutex_held;
1167
1168 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1169 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1170
1171 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1172 wakeup((caddr_t)&head->so_incomp);
1173 }
1174 }
1175
1176 void
1177 sofreelastref(struct socket *so, int dealloc)
1178 {
1179 struct socket *head = so->so_head;
1180
1181 /* Assume socket is locked */
1182
1183 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1184 selthreadclear(&so->so_snd.sb_sel);
1185 selthreadclear(&so->so_rcv.sb_sel);
1186 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1187 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1188 so->so_event = sonullevent;
1189 return;
1190 }
1191 if (head != NULL) {
1192 /*
1193 * Need to lock the listener when the protocol has
1194 * per socket locks
1195 */
1196 if (head->so_proto->pr_getlock != NULL) {
1197 socket_lock(head, 1);
1198 so_acquire_accept_list(head, so);
1199 }
1200 if (so->so_state & SS_INCOMP) {
1201 so->so_state &= ~SS_INCOMP;
1202 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1203 head->so_incqlen--;
1204 head->so_qlen--;
1205 so->so_head = NULL;
1206
1207 if (head->so_proto->pr_getlock != NULL) {
1208 so_release_accept_list(head);
1209 socket_unlock(head, 1);
1210 }
1211 } else if (so->so_state & SS_COMP) {
1212 if (head->so_proto->pr_getlock != NULL) {
1213 so_release_accept_list(head);
1214 socket_unlock(head, 1);
1215 }
1216 /*
1217 * We must not decommission a socket that's
1218 * on the accept(2) queue. If we do, then
1219 * accept(2) may hang after select(2) indicated
1220 * that the listening socket was ready.
1221 */
1222 selthreadclear(&so->so_snd.sb_sel);
1223 selthreadclear(&so->so_rcv.sb_sel);
1224 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1225 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1226 so->so_event = sonullevent;
1227 return;
1228 } else {
1229 if (head->so_proto->pr_getlock != NULL) {
1230 so_release_accept_list(head);
1231 socket_unlock(head, 1);
1232 }
1233 printf("sofree: not queued\n");
1234 }
1235 }
1236 sowflush(so);
1237 sorflush(so);
1238
1239 #if FLOW_DIVERT
1240 if (so->so_flags & SOF_FLOW_DIVERT) {
1241 flow_divert_detach(so);
1242 }
1243 #endif /* FLOW_DIVERT */
1244
1245 /* 3932268: disable upcall */
1246 so->so_rcv.sb_flags &= ~SB_UPCALL;
1247 so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1248 so->so_event = sonullevent;
1249
1250 if (dealloc) {
1251 sodealloc(so);
1252 }
1253 }
1254
1255 void
1256 soclose_wait_locked(struct socket *so)
1257 {
1258 lck_mtx_t *mutex_held;
1259
1260 if (so->so_proto->pr_getlock != NULL) {
1261 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1262 } else {
1263 mutex_held = so->so_proto->pr_domain->dom_mtx;
1264 }
1265 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1266
1267 /*
1268 * Double check here and return if there's no outstanding upcall;
1269 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1270 */
1271 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1272 return;
1273 }
1274 so->so_rcv.sb_flags &= ~SB_UPCALL;
1275 so->so_snd.sb_flags &= ~SB_UPCALL;
1276 so->so_flags |= SOF_CLOSEWAIT;
1277
1278 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1279 "soclose_wait_locked", NULL);
1280 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1281 so->so_flags &= ~SOF_CLOSEWAIT;
1282 }
1283
1284 /*
1285 * Close a socket on last file table reference removal.
1286 * Initiate disconnect if connected.
1287 * Free socket when disconnect complete.
1288 */
1289 int
1290 soclose_locked(struct socket *so)
1291 {
1292 int error = 0;
1293 struct timespec ts;
1294
1295 if (so->so_usecount == 0) {
1296 panic("soclose: so=%p refcount=0\n", so);
1297 /* NOTREACHED */
1298 }
1299
1300 sflt_notify(so, sock_evt_closing, NULL);
1301
1302 if (so->so_upcallusecount) {
1303 soclose_wait_locked(so);
1304 }
1305
1306 #if CONTENT_FILTER
1307 /*
1308 * We have to wait until the content filters are done
1309 */
1310 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1311 cfil_sock_close_wait(so);
1312 cfil_sock_is_closed(so);
1313 cfil_sock_detach(so);
1314 }
1315 #endif /* CONTENT_FILTER */
1316
1317 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1318 soresume(current_proc(), so, 1);
1319 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1320 }
1321
1322 if ((so->so_options & SO_ACCEPTCONN)) {
1323 struct socket *sp, *sonext;
1324 int persocklock = 0;
1325 int incomp_overflow_only;
1326
1327 /*
1328 * We do not want new connections to be added
1329 * to the connection queues.
1330 */
1331 so->so_options &= ~SO_ACCEPTCONN;
1332
1333 /*
1334 * We can drop the lock on the listener once
1335 * we've acquired the incoming list
1336 */
1337 if (so->so_proto->pr_getlock != NULL) {
1338 persocklock = 1;
1339 so_acquire_accept_list(so, NULL);
1340 socket_unlock(so, 0);
1341 }
1342 again:
1343 incomp_overflow_only = 1;
1344
1345 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1346 /*
1347 * Radar 5350314
1348 * skip sockets thrown away by tcp_dropdropablreq;
1349 * they will get cleaned up by the garbage collection.
1350 * otherwise, remove the incomp socket from the queue
1351 * and let soabort trigger the appropriate cleanup.
1352 */
1353 if (sp->so_flags & SOF_OVERFLOW) {
1354 continue;
1355 }
1356
1357 if (persocklock != 0) {
1358 socket_lock(sp, 1);
1359 }
1360
1361 /*
1362 * Radar 27945981
1363 * The extra reference for the list ensures the
1364 * validity of the socket pointer when we perform the
1365 * unlock of the head above.
1366 */
1367 if (sp->so_state & SS_INCOMP) {
1368 sp->so_state &= ~SS_INCOMP;
1369 sp->so_head = NULL;
1370 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1371 so->so_incqlen--;
1372 so->so_qlen--;
1373
1374 (void) soabort(sp);
1375 } else {
1376 panic("%s sp %p in so_incomp but !SS_INCOMP",
1377 __func__, sp);
1378 }
1379
1380 if (persocklock != 0) {
1381 socket_unlock(sp, 1);
1382 }
1383 }
1384
1385 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1386 /* Dequeue from so_comp since sofree() won't do it */
1387 if (persocklock != 0) {
1388 socket_lock(sp, 1);
1389 }
1390
1391 if (sp->so_state & SS_COMP) {
1392 sp->so_state &= ~SS_COMP;
1393 sp->so_head = NULL;
1394 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1395 so->so_qlen--;
1396
1397 (void) soabort(sp);
1398 } else {
1399 panic("%s sp %p in so_comp but !SS_COMP",
1400 __func__, sp);
1401 }
1402
1403 if (persocklock) {
1404 socket_unlock(sp, 1);
1405 }
1406 }
1407
1408 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1409 #if (DEBUG | DEVELOPMENT)
1410 panic("%s head %p so_comp not empty\n", __func__, so);
1411 #endif /* (DEVELOPMENT || DEBUG) */
1412
1413 goto again;
1414 }
1415
1416 if (!TAILQ_EMPTY(&so->so_comp)) {
1417 #if (DEBUG | DEVELOPMENT)
1418 panic("%s head %p so_comp not empty\n", __func__, so);
1419 #endif /* (DEVELOPMENT || DEBUG) */
1420
1421 goto again;
1422 }
1423
1424 if (persocklock) {
1425 socket_lock(so, 0);
1426 so_release_accept_list(so);
1427 }
1428 }
1429 if (so->so_pcb == NULL) {
1430 /* 3915887: mark the socket as ready for dealloc */
1431 so->so_flags |= SOF_PCBCLEARING;
1432 goto discard;
1433 }
1434 if (so->so_state & SS_ISCONNECTED) {
1435 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1436 error = sodisconnectlocked(so);
1437 if (error) {
1438 goto drop;
1439 }
1440 }
1441 if (so->so_options & SO_LINGER) {
1442 lck_mtx_t *mutex_held;
1443
1444 if ((so->so_state & SS_ISDISCONNECTING) &&
1445 (so->so_state & SS_NBIO)) {
1446 goto drop;
1447 }
1448 if (so->so_proto->pr_getlock != NULL) {
1449 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1450 } else {
1451 mutex_held = so->so_proto->pr_domain->dom_mtx;
1452 }
1453 while (so->so_state & SS_ISCONNECTED) {
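/*
 * so_linger is treated here as hundredths of a second; convert it
 * to a timespec for the bounded msleep() below.
 */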
1454 ts.tv_sec = (so->so_linger / 100);
1455 ts.tv_nsec = (so->so_linger % 100) *
1456 NSEC_PER_USEC * 1000 * 10;
1457 error = msleep((caddr_t)&so->so_timeo,
1458 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1459 if (error) {
1460 /*
1461 * It's OK when the timer fires;
1462 * don't report an error.
1463 */
1464 if (error == EWOULDBLOCK) {
1465 error = 0;
1466 }
1467 break;
1468 }
1469 }
1470 }
1471 }
1472 drop:
1473 if (so->so_usecount == 0) {
1474 panic("soclose: usecount is zero so=%p\n", so);
1475 /* NOTREACHED */
1476 }
1477 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1478 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1479 if (error == 0) {
1480 error = error2;
1481 }
1482 }
1483 if (so->so_usecount <= 0) {
1484 panic("soclose: usecount is zero so=%p\n", so);
1485 /* NOTREACHED */
1486 }
1487 discard:
1488 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1489 (so->so_state & SS_NOFDREF)) {
1490 panic("soclose: NOFDREF");
1491 /* NOTREACHED */
1492 }
1493 so->so_state |= SS_NOFDREF;
1494
1495 if ((so->so_flags & SOF_KNOTE) != 0) {
1496 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1497 }
1498
1499 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1500 evsofree(so);
1501
1502 VERIFY(so->so_usecount > 0);
1503 so->so_usecount--;
1504 sofree(so);
1505 return error;
1506 }
1507
1508 int
1509 soclose(struct socket *so)
1510 {
1511 int error = 0;
1512 socket_lock(so, 1);
1513
1514 if (so->so_retaincnt == 0) {
1515 error = soclose_locked(so);
1516 } else {
1517 /*
1518 * if the FD is going away but the socket is
1519 * retained in the kernel, remove its reference
1520 */
1521 so->so_usecount--;
1522 if (so->so_usecount < 2) {
1523 panic("soclose: retaincnt non null and so=%p "
1524 "usecount=%d\n", so, so->so_usecount);
1525 }
1526 }
1527 socket_unlock(so, 1);
1528 return error;
1529 }
1530
1531 /*
1532 * Must be called at splnet...
1533 */
1534 /* Should already be locked */
1535 int
1536 soabort(struct socket *so)
1537 {
1538 int error;
1539
1540 #ifdef MORE_LOCKING_DEBUG
1541 lck_mtx_t *mutex_held;
1542
1543 if (so->so_proto->pr_getlock != NULL) {
1544 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1545 } else {
1546 mutex_held = so->so_proto->pr_domain->dom_mtx;
1547 }
1548 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1549 #endif
1550
1551 if ((so->so_flags & SOF_ABORTED) == 0) {
1552 so->so_flags |= SOF_ABORTED;
1553 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1554 if (error) {
1555 sofree(so);
1556 return error;
1557 }
1558 }
1559 return 0;
1560 }
1561
1562 int
1563 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1564 {
1565 int error;
1566
1567 if (dolock) {
1568 socket_lock(so, 1);
1569 }
1570
1571 so_update_last_owner_locked(so, PROC_NULL);
1572 so_update_policy(so);
1573 #if NECP
1574 so_update_necp_policy(so, NULL, NULL);
1575 #endif /* NECP */
1576
1577 if ((so->so_state & SS_NOFDREF) == 0) {
1578 panic("soaccept: !NOFDREF");
1579 }
1580 so->so_state &= ~SS_NOFDREF;
1581 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1582
1583 if (dolock) {
1584 socket_unlock(so, 1);
1585 }
1586 return error;
1587 }
1588
1589 int
1590 soaccept(struct socket *so, struct sockaddr **nam)
1591 {
1592 return soacceptlock(so, nam, 1);
1593 }
1594
1595 int
1596 soacceptfilter(struct socket *so, struct socket *head)
1597 {
1598 struct sockaddr *local = NULL, *remote = NULL;
1599 int error = 0;
1600
1601 /*
1602 * Hold the lock even if this socket has not been made visible
1603 * to the filter(s). For sockets with global locks, this protects
1604 * against the head or peer going away
1605 */
1606 socket_lock(so, 1);
1607 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1608 sogetaddr_locked(so, &local, 0) != 0) {
1609 so->so_state &= ~SS_NOFDREF;
1610 socket_unlock(so, 1);
1611 soclose(so);
1612 /* Out of resources; try it again next time */
1613 error = ECONNABORTED;
1614 goto done;
1615 }
1616
1617 error = sflt_accept(head, so, local, remote);
1618
1619 /*
1620 * If we get EJUSTRETURN from one of the filters, mark this socket
1621 * as inactive and return it anyway. This newly accepted socket
1622 * will be disconnected later before we hand it off to the caller.
1623 */
1624 if (error == EJUSTRETURN) {
1625 error = 0;
1626 (void) sosetdefunct(current_proc(), so,
1627 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1628 }
1629
1630 if (error != 0) {
1631 /*
1632 * This may seem like a duplication to the above error
1633 * handling part when we return ECONNABORTED, except
1634 * the following is done while holding the lock since
1635 * the socket has been exposed to the filter(s) earlier.
1636 */
1637 so->so_state &= ~SS_NOFDREF;
1638 socket_unlock(so, 1);
1639 soclose(so);
1640 /* Propagate socket filter's error code to the caller */
1641 } else {
1642 socket_unlock(so, 1);
1643 }
1644 done:
1645 /* Callee checks for NULL pointer */
1646 sock_freeaddr(remote);
1647 sock_freeaddr(local);
1648 return error;
1649 }
1650
1651 /*
1652 * Returns: 0 Success
1653 * EOPNOTSUPP Operation not supported on socket
1654 * EISCONN Socket is connected
1655 * <pru_connect>:EADDRNOTAVAIL Address not available.
1656 * <pru_connect>:EINVAL Invalid argument
1657 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1658 * <pru_connect>:EACCES Permission denied
1659 * <pru_connect>:EADDRINUSE Address in use
1660 * <pru_connect>:EAGAIN Resource unavailable, try again
1661 * <pru_connect>:EPERM Operation not permitted
1662 * <sf_connect_out>:??? [anything a filter writer might set]
1663 */
1664 int
1665 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1666 {
1667 int error;
1668 struct proc *p = current_proc();
1669
1670 if (dolock) {
1671 socket_lock(so, 1);
1672 }
1673
1674 so_update_last_owner_locked(so, p);
1675 so_update_policy(so);
1676
1677 #if NECP
1678 so_update_necp_policy(so, NULL, nam);
1679 #endif /* NECP */
1680
1681 /*
1682 * If this is a listening socket or if this is a previously-accepted
1683 * socket that has been marked as inactive, reject the connect request.
1684 */
1685 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1686 error = EOPNOTSUPP;
1687 if (so->so_flags & SOF_DEFUNCT) {
1688 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1689 "(%d)\n", __func__, proc_pid(p),
1690 proc_best_name(p),
1691 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1692 SOCK_DOM(so), SOCK_TYPE(so), error);
1693 }
1694 if (dolock) {
1695 socket_unlock(so, 1);
1696 }
1697 return error;
1698 }
1699
1700 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1701 if (dolock) {
1702 socket_unlock(so, 1);
1703 }
1704 return EPERM;
1705 }
1706
1707 /*
1708 * If protocol is connection-based, can only connect once.
1709 * Otherwise, if connected, try to disconnect first.
1710 * This allows user to disconnect by connecting to, e.g.,
1711 * a null address.
1712 */
1713 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1714 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1715 (error = sodisconnectlocked(so)))) {
1716 error = EISCONN;
1717 } else {
1718 /*
1719 * Run connect filter before calling protocol:
1720 * - non-blocking connect returns before completion;
1721 */
1722 error = sflt_connectout(so, nam);
1723 if (error != 0) {
1724 if (error == EJUSTRETURN) {
1725 error = 0;
1726 }
1727 } else {
1728 error = (*so->so_proto->pr_usrreqs->pru_connect)
1729 (so, nam, p);
1730 }
1731 }
1732 if (dolock) {
1733 socket_unlock(so, 1);
1734 }
1735 return error;
1736 }
1737
1738 int
1739 soconnect(struct socket *so, struct sockaddr *nam)
1740 {
1741 return soconnectlock(so, nam, 1);
1742 }
1743
1744 /*
1745 * Returns: 0 Success
1746 * <pru_connect2>:EINVAL[AF_UNIX]
1747 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1748 * <pru_connect2>:??? [other protocol families]
1749 *
1750 * Notes: <pru_connect2> is not supported by [TCP].
1751 */
1752 int
1753 soconnect2(struct socket *so1, struct socket *so2)
1754 {
1755 int error;
1756
1757 socket_lock(so1, 1);
1758 if (so2->so_proto->pr_lock) {
1759 socket_lock(so2, 1);
1760 }
1761
1762 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1763
1764 socket_unlock(so1, 1);
1765 if (so2->so_proto->pr_lock) {
1766 socket_unlock(so2, 1);
1767 }
1768 return error;
1769 }
1770
1771 int
1772 soconnectxlocked(struct socket *so, struct sockaddr *src,
1773 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1774 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1775 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1776 {
1777 int error;
1778
1779 so_update_last_owner_locked(so, p);
1780 so_update_policy(so);
1781
1782 /*
1783 * If this is a listening socket or if this is a previously-accepted
1784 * socket that has been marked as inactive, reject the connect request.
1785 */
1786 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1787 error = EOPNOTSUPP;
1788 if (so->so_flags & SOF_DEFUNCT) {
1789 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1790 "(%d)\n", __func__, proc_pid(p),
1791 proc_best_name(p),
1792 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1793 SOCK_DOM(so), SOCK_TYPE(so), error);
1794 }
1795 return error;
1796 }
1797
1798 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1799 return EPERM;
1800 }
1801
1802 /*
1803 * If protocol is connection-based, can only connect once
1804 * unless PR_MULTICONN is set. Otherwise, if connected,
1805 * try to disconnect first. This allows user to disconnect
1806 * by connecting to, e.g., a null address.
1807 */
1808 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1809 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1810 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1811 (error = sodisconnectlocked(so)) != 0)) {
1812 error = EISCONN;
1813 } else {
1814 /*
1815 * Run connect filter before calling protocol:
1816 * - non-blocking connect returns before completion;
1817 */
1818 error = sflt_connectout(so, dst);
1819 if (error != 0) {
1820 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1821 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1822 if (error == EJUSTRETURN) {
1823 error = 0;
1824 }
1825 } else {
1826 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1827 (so, src, dst, p, ifscope, aid, pcid,
1828 flags, arg, arglen, auio, bytes_written);
1829 }
1830 }
1831
1832 return error;
1833 }
1834
1835 int
1836 sodisconnectlocked(struct socket *so)
1837 {
1838 int error;
1839
1840 if ((so->so_state & SS_ISCONNECTED) == 0) {
1841 error = ENOTCONN;
1842 goto bad;
1843 }
1844 if (so->so_state & SS_ISDISCONNECTING) {
1845 error = EALREADY;
1846 goto bad;
1847 }
1848
1849 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1850 if (error == 0) {
1851 sflt_notify(so, sock_evt_disconnected, NULL);
1852 }
1853
1854 bad:
1855 return error;
1856 }
1857
1858 /* Locking version */
1859 int
1860 sodisconnect(struct socket *so)
1861 {
1862 int error;
1863
1864 socket_lock(so, 1);
1865 error = sodisconnectlocked(so);
1866 socket_unlock(so, 1);
1867 return error;
1868 }
1869
1870 int
1871 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1872 {
1873 int error;
1874
1875 /*
1876 * Call the protocol disconnectx handler; let it handle all
1877 * matters related to the connection state of this session.
1878 */
1879 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1880 if (error == 0) {
1881 /*
1882 * The event applies only for the session, not for
1883 * the disconnection of individual subflows.
1884 */
1885 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1886 sflt_notify(so, sock_evt_disconnected, NULL);
1887 }
1888 }
1889 return error;
1890 }
1891
1892 int
1893 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1894 {
1895 int error;
1896
1897 socket_lock(so, 1);
1898 error = sodisconnectxlocked(so, aid, cid);
1899 socket_unlock(so, 1);
1900 return error;
1901 }
1902
1903 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
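
 /*
  * Editorial note (illustrative, not part of the original source):
  * SBLOCKWAIT() maps the caller's MSG_DONTWAIT flag onto the sblock()
  * wait argument, e.g.
  *
  *	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
  *
  * With MSG_DONTWAIT set this becomes sblock(..., 0), which does not
  * wait for the lock; otherwise it becomes sblock(..., SBL_WAIT), which
  * sleeps until the buffer lock is available. sosendcheck() and
  * soreceive() below acquire their send/receive buffer locks this way.
  */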
1904
1905 /*
1906 * sosendcheck will lock the socket buffer if it isn't locked and
1907 * verify that there is space for the data being inserted.
1908 *
1909 * Returns: 0 Success
1910 * EPIPE
1911 * sblock:EWOULDBLOCK
1912 * sblock:EINTR
1913 * sbwait:EBADF
1914 * sbwait:EINTR
1915 * [so_error]:???
1916 */
1917 int
1918 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1919 int32_t clen, int32_t atomic, int flags, int *sblocked,
1920 struct mbuf *control)
1921 {
1922 int error = 0;
1923 int32_t space;
1924 int assumelock = 0;
1925
1926 restart:
1927 if (*sblocked == 0) {
1928 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1929 so->so_send_filt_thread != 0 &&
1930 so->so_send_filt_thread == current_thread()) {
1931 /*
1932 * We're being called recursively from a filter,
1933 * allow this to continue. Radar 4150520.
1934 * Don't set sblocked because we don't want
1935 * to perform an unlock later.
1936 */
1937 assumelock = 1;
1938 } else {
1939 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1940 if (error) {
1941 if (so->so_flags & SOF_DEFUNCT) {
1942 goto defunct;
1943 }
1944 return error;
1945 }
1946 *sblocked = 1;
1947 }
1948 }
1949
1950 /*
1951 * If a send attempt is made on a socket that has been marked
1952 * as inactive (disconnected), reject the request.
1953 */
1954 if (so->so_flags & SOF_DEFUNCT) {
1955 defunct:
1956 error = EPIPE;
1957 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1958 __func__, proc_selfpid(), proc_best_name(current_proc()),
1959 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1960 SOCK_DOM(so), SOCK_TYPE(so), error);
1961 return error;
1962 }
1963
1964 if (so->so_state & SS_CANTSENDMORE) {
1965 #if CONTENT_FILTER
1966 /*
1967 * Can re-inject data of half closed connections
1968 */
1969 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1970 so->so_snd.sb_cfil_thread == current_thread() &&
1971 cfil_sock_data_pending(&so->so_snd) != 0) {
1972 CFIL_LOG(LOG_INFO,
1973 "so %llx ignore SS_CANTSENDMORE",
1974 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1975 } else
1976 #endif /* CONTENT_FILTER */
1977 return EPIPE;
1978 }
1979 if (so->so_error) {
1980 error = so->so_error;
1981 so->so_error = 0;
1982 return error;
1983 }
1984
1985 if ((so->so_state & SS_ISCONNECTED) == 0) {
1986 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1987 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1988 (resid != 0 || clen == 0) &&
1989 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1990 return ENOTCONN;
1991 }
1992 } else if (addr == 0) {
1993 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1994 ENOTCONN : EDESTADDRREQ;
1995 }
1996 }
1997
1998 if (so->so_flags & SOF_ENABLE_MSGS) {
1999 space = msgq_sbspace(so, control);
2000 } else {
2001 space = sbspace(&so->so_snd);
2002 }
2003
2004 if (flags & MSG_OOB) {
2005 space += 1024;
2006 }
2007 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2008 clen > so->so_snd.sb_hiwat) {
2009 return EMSGSIZE;
2010 }
2011
2012 if ((space < resid + clen &&
2013 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2014 space < clen)) ||
2015 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2016 /*
2017 * don't block the connectx call when there's more data
2018 * than can be copied.
2019 */
2020 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2021 if (space == 0) {
2022 return EWOULDBLOCK;
2023 }
2024 if (space < (int32_t)so->so_snd.sb_lowat) {
2025 return 0;
2026 }
2027 }
2028 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2029 assumelock) {
2030 return EWOULDBLOCK;
2031 }
2032 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2033 *sblocked = 0;
2034 error = sbwait(&so->so_snd);
2035 if (error) {
2036 if (so->so_flags & SOF_DEFUNCT) {
2037 goto defunct;
2038 }
2039 return error;
2040 }
2041 goto restart;
2042 }
2043 return 0;
2044 }
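
 /*
  * Editorial sketch (not part of the original source): callers are
  * expected to run sosendcheck() at the top of their send loop and to
  * re-check on every pass, roughly:
  *
  *	do {
  *		error = sosendcheck(so, addr, resid, clen, atomic, flags,
  *		    &sblocked, control);
  *		if (error)
  *			goto out_locked;
  *		... build the mbuf chain and call pru_send ...
  *	} while (resid);
  *
  * This is the pattern sosend() below follows; note that sosendcheck()
  * may drop and retake the send buffer lock (via sbwait) before
  * returning 0, so buffer state must be re-read afterwards.
  */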
2045
2046 /*
2047 * Send on a socket.
2048 * If send must go all at once and message is larger than
2049 * send buffering, then hard error.
2050 * Lock against other senders.
2051 * If must go all at once and not enough room now, then
2052 * inform user that this would block and do nothing.
2053 * Otherwise, if nonblocking, send as much as possible.
2054 * The data to be sent is described by "uio" if nonzero,
2055 * otherwise by the mbuf chain "top" (which must be null
2056 * if uio is not). Data provided in mbuf chain must be small
2057 * enough to send all at once.
2058 *
2059 * Returns nonzero on error, timeout or signal; callers
2060 * must check for short counts if EINTR/ERESTART are returned.
2061 * Data and control buffers are freed on return.
2062 *
2063 * Returns: 0 Success
2064 * EOPNOTSUPP
2065 * EINVAL
2066 * ENOBUFS
2067 * uiomove:EFAULT
2068 * sosendcheck:EPIPE
2069 * sosendcheck:EWOULDBLOCK
2070 * sosendcheck:EINTR
2071 * sosendcheck:EBADF
2072 * sosendcheck:EINTR
2073 * sosendcheck:??? [value from so_error]
2074 * <pru_send>:ECONNRESET[TCP]
2075 * <pru_send>:EINVAL[TCP]
2076 * <pru_send>:ENOBUFS[TCP]
2077 * <pru_send>:EADDRINUSE[TCP]
2078 * <pru_send>:EADDRNOTAVAIL[TCP]
2079 * <pru_send>:EAFNOSUPPORT[TCP]
2080 * <pru_send>:EACCES[TCP]
2081 * <pru_send>:EAGAIN[TCP]
2082 * <pru_send>:EPERM[TCP]
2083 * <pru_send>:EMSGSIZE[TCP]
2084 * <pru_send>:EHOSTUNREACH[TCP]
2085 * <pru_send>:ENETUNREACH[TCP]
2086 * <pru_send>:ENETDOWN[TCP]
2087 * <pru_send>:ENOMEM[TCP]
2088 * <pru_send>:ENOBUFS[TCP]
2089 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2090 * <pru_send>:EINVAL[AF_UNIX]
2091 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2092 * <pru_send>:EPIPE[AF_UNIX]
2093 * <pru_send>:ENOTCONN[AF_UNIX]
2094 * <pru_send>:EISCONN[AF_UNIX]
2095 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2096 * <sf_data_out>:??? [whatever a filter author chooses]
2097 *
2098 * Notes: Other <pru_send> returns depend on the protocol family; all
2099 * <sf_data_out> returns depend on what the filter author causes
2100 * their filter to return.
2101 */
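
 /*
  * Editorial note (illustrative, not part of the original source): per
  * the contract above, in-kernel callers supply the data either through
  * a uio or as a prebuilt mbuf chain, e.g.
  *
  *	error = sosend(so, NULL, uio, NULL, NULL, 0);     // copy from uio
  *	error = sosend(so, NULL, NULL, top, control, 0);  // prebuilt chain
  *
  * When "top" is used it must describe a complete record small enough to
  * be sent at once, and both "top" and "control" are freed on return.
  */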
2102 int
2103 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2104 struct mbuf *top, struct mbuf *control, int flags)
2105 {
2106 struct mbuf **mp;
2107 struct mbuf *m, *freelist = NULL;
2108 user_ssize_t space, len, resid, orig_resid;
2109 int clen = 0, error, dontroute, mlen, sendflags;
2110 int atomic = sosendallatonce(so) || top;
2111 int sblocked = 0;
2112 struct proc *p = current_proc();
2113 struct mbuf *control_copy = NULL;
2114 uint16_t headroom = 0;
2115 boolean_t en_tracing = FALSE;
2116
2117 if (uio != NULL) {
2118 resid = uio_resid(uio);
2119 } else {
2120 resid = top->m_pkthdr.len;
2121 }
2122
2123 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2124 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2125
2126 socket_lock(so, 1);
2127
2128 /*
2129 * trace if tracing is enabled and this is a network (vs. unix)
2130 * socket over a non-loopback interface
2131 */
2132 if (ENTR_SHOULDTRACE &&
2133 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2134 struct inpcb *inp = sotoinpcb(so);
2135 if (inp->inp_last_outifp != NULL &&
2136 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2137 en_tracing = TRUE;
2138 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2139 VM_KERNEL_ADDRPERM(so),
2140 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2141 (int64_t)resid);
2142 orig_resid = resid;
2143 }
2144 }
2145
2146 /*
2147 * Re-injection should not affect process accounting
2148 */
2149 if ((flags & MSG_SKIPCFIL) == 0) {
2150 so_update_last_owner_locked(so, p);
2151 so_update_policy(so);
2152
2153 #if NECP
2154 so_update_necp_policy(so, NULL, addr);
2155 #endif /* NECP */
2156 }
2157
2158 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2159 error = EOPNOTSUPP;
2160 goto out_locked;
2161 }
2162
2163 /*
2164 * In theory resid should be unsigned.
2165 * However, space must be signed, as it might be less than 0
2166 * if we over-committed, and we must use a signed comparison
2167 * of space and resid. On the other hand, a negative resid
2168 * causes us to loop sending 0-length segments to the protocol.
2169 *
2170 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2171 * But it will be used by sockets doing message delivery.
2172 *
2173 * Note: We limit resid to be a positive int value as we use
2174 * imin() to set bytes_to_copy -- radr://14558484
2175 */
2176 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2177 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2178 error = EINVAL;
2179 goto out_locked;
2180 }
2181
2182 dontroute = (flags & MSG_DONTROUTE) &&
2183 (so->so_options & SO_DONTROUTE) == 0 &&
2184 (so->so_proto->pr_flags & PR_ATOMIC);
2185 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2186
2187 if (control != NULL) {
2188 clen = control->m_len;
2189 }
2190
2191 if (soreserveheadroom != 0) {
2192 headroom = so->so_pktheadroom;
2193 }
2194
2195 do {
2196 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2197 &sblocked, control);
2198 if (error) {
2199 goto out_locked;
2200 }
2201
2202 mp = &top;
2203 if (so->so_flags & SOF_ENABLE_MSGS) {
2204 space = msgq_sbspace(so, control);
2205 } else {
2206 space = sbspace(&so->so_snd) - clen;
2207 }
2208 space += ((flags & MSG_OOB) ? 1024 : 0);
2209
2210 do {
2211 if (uio == NULL) {
2212 /*
2213 * Data is prepackaged in "top".
2214 */
2215 resid = 0;
2216 if (flags & MSG_EOR) {
2217 top->m_flags |= M_EOR;
2218 }
2219 } else {
2220 int chainlength;
2221 int bytes_to_copy;
2222 boolean_t jumbocl;
2223 boolean_t bigcl;
2224 int bytes_to_alloc;
2225
2226 bytes_to_copy = imin(resid, space);
2227
2228 bytes_to_alloc = bytes_to_copy;
2229 if (top == NULL) {
2230 bytes_to_alloc += headroom;
2231 }
2232
2233 if (sosendminchain > 0) {
2234 chainlength = 0;
2235 } else {
2236 chainlength = sosendmaxchain;
2237 }
2238
2239 /*
2240 * Use big 4 KB clusters when the outgoing interface
2241 * does not prefer 2 KB clusters
2242 */
2243 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2244 sosendbigcl_ignore_capab;
2245
2246 /*
2247 * Attempt to use larger than system page-size
2248 * clusters for large writes only if there is
2249 * a jumbo cluster pool and if the socket is
2250 * marked accordingly.
2251 */
2252 jumbocl = sosendjcl && njcl > 0 &&
2253 ((so->so_flags & SOF_MULTIPAGES) ||
2254 sosendjcl_ignore_capab) &&
2255 bigcl;
2256
2257 socket_unlock(so, 0);
2258
2259 do {
2260 int num_needed;
2261 int hdrs_needed = (top == NULL) ? 1 : 0;
2262
2263 /*
2264 * Try to maintain a local cache of mbuf
2265 * clusters needed to complete this
2266 * write. The list is further limited
2267 * to the number currently needed to
2268 * fill the socket. This mechanism
2269 * allows a large number of mbufs/
2270 * clusters to be grabbed under a single
2271 * mbuf lock. If we can't get any
2272 * clusters, then fall back to trying
2273 * for mbufs. If we fail early (or
2274 * miscalculate the number needed), make
2275 * sure to release any clusters we
2276 * haven't yet consumed.
2277 */
2278 if (freelist == NULL &&
2279 bytes_to_alloc > MBIGCLBYTES &&
2280 jumbocl) {
2281 num_needed =
2282 bytes_to_alloc / M16KCLBYTES;
2283
2284 if ((bytes_to_alloc -
2285 (num_needed * M16KCLBYTES))
2286 >= MINCLSIZE) {
2287 num_needed++;
2288 }
2289
2290 freelist =
2291 m_getpackets_internal(
2292 (unsigned int *)&num_needed,
2293 hdrs_needed, M_WAIT, 0,
2294 M16KCLBYTES);
2295 /*
2296 * Fall back to 4K cluster size
2297 * if allocation failed
2298 */
2299 }
2300
2301 if (freelist == NULL &&
2302 bytes_to_alloc > MCLBYTES &&
2303 bigcl) {
2304 num_needed =
2305 bytes_to_alloc / MBIGCLBYTES;
2306
2307 if ((bytes_to_alloc -
2308 (num_needed * MBIGCLBYTES)) >=
2309 MINCLSIZE) {
2310 num_needed++;
2311 }
2312
2313 freelist =
2314 m_getpackets_internal(
2315 (unsigned int *)&num_needed,
2316 hdrs_needed, M_WAIT, 0,
2317 MBIGCLBYTES);
2318 /*
2319 * Fall back to cluster size
2320 * if allocation failed
2321 */
2322 }
2323
2324 /*
2325 * Allocate a cluster as we want to
2326 * avoid splitting the data into more
2327 * than one segment; using MINCLSIZE
2328 * would lead us to allocate two mbufs.
2329 */
2330 if (soreserveheadroom != 0 &&
2331 freelist == NULL &&
2332 ((top == NULL &&
2333 bytes_to_alloc > _MHLEN) ||
2334 bytes_to_alloc > _MLEN)) {
2335 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2336 MCLBYTES;
2337 freelist =
2338 m_getpackets_internal(
2339 (unsigned int *)&num_needed,
2340 hdrs_needed, M_WAIT, 0,
2341 MCLBYTES);
2342 /*
2343 * Fall back to a single mbuf
2344 * if allocation failed
2345 */
2346 } else if (freelist == NULL &&
2347 bytes_to_alloc > MINCLSIZE) {
2348 num_needed =
2349 bytes_to_alloc / MCLBYTES;
2350
2351 if ((bytes_to_alloc -
2352 (num_needed * MCLBYTES)) >=
2353 MINCLSIZE) {
2354 num_needed++;
2355 }
2356
2357 freelist =
2358 m_getpackets_internal(
2359 (unsigned int *)&num_needed,
2360 hdrs_needed, M_WAIT, 0,
2361 MCLBYTES);
2362 /*
2363 * Fall back to a single mbuf
2364 * if allocation failed
2365 */
2366 }
2367 /*
2368 * For datagram protocols, leave
2369 * headroom for protocol headers
2370 * in the first cluster of the chain
2371 */
2372 if (freelist != NULL && atomic &&
2373 top == NULL && headroom > 0) {
2374 freelist->m_data += headroom;
2375 }
2376
2377 /*
2378 * Fall back to regular mbufs without
2379 * reserving the socket headroom
2380 */
2381 if (freelist == NULL) {
2382 if (top == NULL) {
2383 MGETHDR(freelist,
2384 M_WAIT, MT_DATA);
2385 } else {
2386 MGET(freelist,
2387 M_WAIT, MT_DATA);
2388 }
2389
2390 if (freelist == NULL) {
2391 error = ENOBUFS;
2392 socket_lock(so, 0);
2393 goto out_locked;
2394 }
2395 /*
2396 * For datagram protocols,
2397 * leave room for protocol
2398 * headers in first mbuf.
2399 */
2400 if (atomic && top == NULL &&
2401 bytes_to_copy < MHLEN) {
2402 MH_ALIGN(freelist,
2403 bytes_to_copy);
2404 }
2405 }
2406 m = freelist;
2407 freelist = m->m_next;
2408 m->m_next = NULL;
2409
2410 if ((m->m_flags & M_EXT)) {
2411 mlen = m->m_ext.ext_size -
2412 M_LEADINGSPACE(m);
2413 } else if ((m->m_flags & M_PKTHDR)) {
2414 mlen =
2415 MHLEN - M_LEADINGSPACE(m);
2416 } else {
2417 mlen = MLEN - M_LEADINGSPACE(m);
2418 }
2419 len = imin(mlen, bytes_to_copy);
2420
2421 chainlength += len;
2422
2423 space -= len;
2424
2425 error = uiomove(mtod(m, caddr_t),
2426 len, uio);
2427
2428 resid = uio_resid(uio);
2429
2430 m->m_len = len;
2431 *mp = m;
2432 top->m_pkthdr.len += len;
2433 if (error) {
2434 break;
2435 }
2436 mp = &m->m_next;
2437 if (resid <= 0) {
2438 if (flags & MSG_EOR) {
2439 top->m_flags |= M_EOR;
2440 }
2441 break;
2442 }
2443 bytes_to_copy = min(resid, space);
2444 } while (space > 0 &&
2445 (chainlength < sosendmaxchain || atomic ||
2446 resid < MINCLSIZE));
2447
2448 socket_lock(so, 0);
2449
2450 if (error) {
2451 goto out_locked;
2452 }
2453 }
2454
2455 if (dontroute) {
2456 so->so_options |= SO_DONTROUTE;
2457 }
2458
2459 /*
2460 * Compute flags here, for pru_send and NKEs
2461 *
2462 * If the user set MSG_EOF, the protocol
2463 * understands this flag, and there is nothing
2464 * left to send, then use PRU_SEND_EOF instead of PRU_SEND.
2465 */
2466 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2467 ((flags & MSG_EOF) &&
2468 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2469 (resid <= 0)) ? PRUS_EOF :
2470 /* If there is more to send set PRUS_MORETOCOME */
2471 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2472
2473 if ((flags & MSG_SKIPCFIL) == 0) {
2474 /*
2475 * Socket filter processing
2476 */
2477 error = sflt_data_out(so, addr, &top,
2478 &control, (sendflags & MSG_OOB) ?
2479 sock_data_filt_flag_oob : 0);
2480 if (error) {
2481 if (error == EJUSTRETURN) {
2482 error = 0;
2483 clen = 0;
2484 control = NULL;
2485 top = NULL;
2486 }
2487 goto out_locked;
2488 }
2489 #if CONTENT_FILTER
2490 /*
2491 * Content filter processing
2492 */
2493 error = cfil_sock_data_out(so, addr, top,
2494 control, sendflags);
2495 if (error) {
2496 if (error == EJUSTRETURN) {
2497 error = 0;
2498 clen = 0;
2499 control = NULL;
2500 top = NULL;
2501 }
2502 goto out_locked;
2503 }
2504 #endif /* CONTENT_FILTER */
2505 }
2506 if (so->so_flags & SOF_ENABLE_MSGS) {
2507 /*
2508 * Make a copy of control mbuf,
2509 * so that msg priority can be
2510 * passed to subsequent mbufs.
2511 */
2512 control_copy = m_dup(control, M_NOWAIT);
2513 }
2514 error = (*so->so_proto->pr_usrreqs->pru_send)
2515 (so, sendflags, top, addr, control, p);
2516
2517 if (dontroute) {
2518 so->so_options &= ~SO_DONTROUTE;
2519 }
2520
2521 clen = 0;
2522 control = control_copy;
2523 control_copy = NULL;
2524 top = NULL;
2525 mp = &top;
2526 if (error) {
2527 goto out_locked;
2528 }
2529 } while (resid && space > 0);
2530 } while (resid);
2531
2532 out_locked:
2533 if (sblocked) {
2534 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2535 } else {
2536 socket_unlock(so, 1);
2537 }
2538 if (top != NULL) {
2539 m_freem(top);
2540 }
2541 if (control != NULL) {
2542 m_freem(control);
2543 }
2544 if (freelist != NULL) {
2545 m_freem_list(freelist);
2546 }
2547 if (control_copy != NULL) {
2548 m_freem(control_copy);
2549 }
2550
2551 soclearfastopen(so);
2552
2553 if (en_tracing) {
2554 /* resid passed here is the bytes left in uio */
2555 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2556 VM_KERNEL_ADDRPERM(so),
2557 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2558 (int64_t)(orig_resid - resid));
2559 }
2560 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2561 so->so_snd.sb_cc, space, error);
2562
2563 return error;
2564 }
2565
2566 int
2567 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2568 {
2569 struct mbuf *m0 = NULL, *control_end = NULL;
2570
2571 socket_lock_assert_owned(so);
2572
2573 /*
2574 * top must point to the mbuf chain to be sent.
2575 * If control is not NULL, top must be a packet header.
2576 */
2577 VERIFY(top != NULL &&
2578 (control == NULL || top->m_flags & M_PKTHDR));
2579
2580 /*
2581 * If control is not passed in, see if we can get it
2582 * from top.
2583 */
2584 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2585 // Locate start of control if present and start of data
2586 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2587 if (m0->m_flags & M_PKTHDR) {
2588 top = m0;
2589 break;
2590 } else if (m0->m_type == MT_CONTROL) {
2591 if (control == NULL) {
2592 // Found start of control
2593 control = m0;
2594 }
2595 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2596 // Found end of control
2597 control_end = m0;
2598 }
2599 }
2600 }
2601 if (control_end != NULL) {
2602 control_end->m_next = NULL;
2603 }
2604 }
2605
2606 int error = (*so->so_proto->pr_usrreqs->pru_send)
2607 (so, sendflags, top, addr, control, current_proc());
2608
2609 return error;
2610 }
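
 /*
  * Editorial sketch (not part of the original source): with the socket
  * lock already held, a caller re-injecting a previously captured record
  * might simply do
  *
  *	error = sosend_reinject(so, NULL, top, NULL, 0);
  *
  * where "top" is the mbuf chain to send. When no separate control chain
  * is passed and "top" lacks M_PKTHDR, the function above walks the chain
  * to locate any leading MT_CONTROL mbufs before handing the record to
  * pru_send.
  */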
2611
2612 /*
2613 * Supports only connected sockets (no address) without ancillary data
2614 * (control mbuf), for atomic protocols
2615 */
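
 /*
  * Editorial note (illustrative, not part of the original source): as
  * checked at the top of the function, only datagram sockets whose
  * protocol implements pru_send_list are accepted, and only the
  * MSG_DONTWAIT/MSG_NBIO flags may be set, e.g.
  *
  *	error = sosend_list(so, uioarray, uiocnt, MSG_DONTWAIT);
  *
  * Anything else fails with EINVAL or EPROTONOSUPPORT before any data
  * is touched.
  */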
2616 int
2617 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2618 {
2619 struct mbuf *m, *freelist = NULL;
2620 user_ssize_t len, resid;
2621 int error, dontroute, mlen;
2622 int atomic = sosendallatonce(so);
2623 int sblocked = 0;
2624 struct proc *p = current_proc();
2625 u_int uiofirst = 0;
2626 u_int uiolast = 0;
2627 struct mbuf *top = NULL;
2628 uint16_t headroom = 0;
2629 boolean_t bigcl;
2630
2631 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2632 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2633
2634 if (so->so_type != SOCK_DGRAM) {
2635 error = EINVAL;
2636 goto out;
2637 }
2638 if (atomic == 0) {
2639 error = EINVAL;
2640 goto out;
2641 }
2642 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2643 error = EPROTONOSUPPORT;
2644 goto out;
2645 }
2646 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2647 error = EINVAL;
2648 goto out;
2649 }
2650 resid = uio_array_resid(uioarray, uiocnt);
2651
2652 /*
2653 * In theory resid should be unsigned.
2654 * However, space must be signed, as it might be less than 0
2655 * if we over-committed, and we must use a signed comparison
2656 * of space and resid. On the other hand, a negative resid
2657 * causes us to loop sending 0-length segments to the protocol.
2658 *
2659 * Note: We limit resid to be a positive int value as we use
2660 * imin() to set bytes_to_copy -- radr://14558484
2661 */
2662 if (resid < 0 || resid > INT_MAX) {
2663 error = EINVAL;
2664 goto out;
2665 }
2666
2667 socket_lock(so, 1);
2668 so_update_last_owner_locked(so, p);
2669 so_update_policy(so);
2670
2671 #if NECP
2672 so_update_necp_policy(so, NULL, NULL);
2673 #endif /* NECP */
2674
2675 dontroute = (flags & MSG_DONTROUTE) &&
2676 (so->so_options & SO_DONTROUTE) == 0 &&
2677 (so->so_proto->pr_flags & PR_ATOMIC);
2678 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2679
2680 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2681 &sblocked, NULL);
2682 if (error) {
2683 goto release;
2684 }
2685
2686 /*
2687 * Use big 4 KB clusters when the outgoing interface does not prefer
2688 * 2 KB clusters
2689 */
2690 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2691
2692 if (soreserveheadroom != 0) {
2693 headroom = so->so_pktheadroom;
2694 }
2695
2696 do {
2697 int i;
2698 int num_needed = 0;
2699 int chainlength;
2700 size_t maxpktlen = 0;
2701 int bytes_to_alloc;
2702
2703 if (sosendminchain > 0) {
2704 chainlength = 0;
2705 } else {
2706 chainlength = sosendmaxchain;
2707 }
2708
2709 socket_unlock(so, 0);
2710
2711 /*
2712 * Find a set of uio that fit in a reasonable number
2713 * of mbuf packets
2714 */
2715 for (i = uiofirst; i < uiocnt; i++) {
2716 struct uio *auio = uioarray[i];
2717
2718 len = uio_resid(auio);
2719
2720 /* Do nothing for empty messages */
2721 if (len == 0) {
2722 continue;
2723 }
2724
2725 num_needed += 1;
2726 uiolast += 1;
2727
2728 if (len > maxpktlen) {
2729 maxpktlen = len;
2730 }
2731
2732 chainlength += len;
2733 if (chainlength > sosendmaxchain) {
2734 break;
2735 }
2736 }
2737 /*
2738 * Nothing left to send
2739 */
2740 if (num_needed == 0) {
2741 socket_lock(so, 0);
2742 break;
2743 }
2744 /*
2745 * Allocate a buffer large enough to include headroom space for
2746 * the network and link headers
2747 *
2748 */
2749 bytes_to_alloc = maxpktlen + headroom;
2750
2751 /*
2752 * Allocate a single contiguous buffer of the smallest available
2753 * size when possible
2754 */
2755 if (bytes_to_alloc > MCLBYTES &&
2756 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2757 freelist = m_getpackets_internal(
2758 (unsigned int *)&num_needed,
2759 num_needed, M_WAIT, 1,
2760 MBIGCLBYTES);
2761 } else if (bytes_to_alloc > _MHLEN &&
2762 bytes_to_alloc <= MCLBYTES) {
2763 freelist = m_getpackets_internal(
2764 (unsigned int *)&num_needed,
2765 num_needed, M_WAIT, 1,
2766 MCLBYTES);
2767 } else {
2768 freelist = m_allocpacket_internal(
2769 (unsigned int *)&num_needed,
2770 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2771 }
2772
2773 if (freelist == NULL) {
2774 socket_lock(so, 0);
2775 error = ENOMEM;
2776 goto release;
2777 }
2778 /*
2779 * Copy each uio of the set into its own mbuf packet
2780 */
2781 for (i = uiofirst, m = freelist;
2782 i < uiolast && m != NULL;
2783 i++) {
2784 int bytes_to_copy;
2785 struct mbuf *n;
2786 struct uio *auio = uioarray[i];
2787
2788 bytes_to_copy = uio_resid(auio);
2789
2790 /* Do nothing for empty messages */
2791 if (bytes_to_copy == 0) {
2792 continue;
2793 }
2794 /*
2795 * Leave headroom for protocol headers
2796 * in the first mbuf of the chain
2797 */
2798 m->m_data += headroom;
2799
2800 for (n = m; n != NULL; n = n->m_next) {
2801 if ((m->m_flags & M_EXT)) {
2802 mlen = m->m_ext.ext_size -
2803 M_LEADINGSPACE(m);
2804 } else if ((m->m_flags & M_PKTHDR)) {
2805 mlen =
2806 MHLEN - M_LEADINGSPACE(m);
2807 } else {
2808 mlen = MLEN - M_LEADINGSPACE(m);
2809 }
2810 len = imin(mlen, bytes_to_copy);
2811
2812 /*
2813 * Note: uiomove() decrements the iovec
2814 * length
2815 */
2816 error = uiomove(mtod(n, caddr_t),
2817 len, auio);
2818 if (error != 0) {
2819 break;
2820 }
2821 n->m_len = len;
2822 m->m_pkthdr.len += len;
2823
2824 VERIFY(m->m_pkthdr.len <= maxpktlen);
2825
2826 bytes_to_copy -= len;
2827 resid -= len;
2828 }
2829 if (m->m_pkthdr.len == 0) {
2830 printf(
2831 "%s:%d so %llx pkt %llx type %u len null\n",
2832 __func__, __LINE__,
2833 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2834 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2835 m->m_type);
2836 }
2837 if (error != 0) {
2838 break;
2839 }
2840 m = m->m_nextpkt;
2841 }
2842
2843 socket_lock(so, 0);
2844
2845 if (error) {
2846 goto release;
2847 }
2848 top = freelist;
2849 freelist = NULL;
2850
2851 if (dontroute) {
2852 so->so_options |= SO_DONTROUTE;
2853 }
2854
2855 if ((flags & MSG_SKIPCFIL) == 0) {
2856 struct mbuf **prevnextp = NULL;
2857
2858 for (i = uiofirst, m = top;
2859 i < uiolast && m != NULL;
2860 i++) {
2861 struct mbuf *nextpkt = m->m_nextpkt;
2862
2863 /*
2864 * Socket filter processing
2865 */
2866 error = sflt_data_out(so, NULL, &m,
2867 NULL, 0);
2868 if (error != 0 && error != EJUSTRETURN) {
2869 goto release;
2870 }
2871
2872 #if CONTENT_FILTER
2873 if (error == 0) {
2874 /*
2875 * Content filter processing
2876 */
2877 error = cfil_sock_data_out(so, NULL, m,
2878 NULL, 0);
2879 if (error != 0 && error != EJUSTRETURN) {
2880 goto release;
2881 }
2882 }
2883 #endif /* CONTENT_FILTER */
2884 /*
2885 * Remove packet from the list when
2886 * swallowed by a filter
2887 */
2888 if (error == EJUSTRETURN) {
2889 error = 0;
2890 if (prevnextp != NULL) {
2891 *prevnextp = nextpkt;
2892 } else {
2893 top = nextpkt;
2894 }
2895 }
2896
2897 m = nextpkt;
2898 if (m != NULL) {
2899 prevnextp = &m->m_nextpkt;
2900 }
2901 }
2902 }
2903 if (top != NULL) {
2904 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2905 (so, 0, top, NULL, NULL, p);
2906 }
2907
2908 if (dontroute) {
2909 so->so_options &= ~SO_DONTROUTE;
2910 }
2911
2912 top = NULL;
2913 uiofirst = uiolast;
2914 } while (resid > 0 && error == 0);
2915 release:
2916 if (sblocked) {
2917 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2918 } else {
2919 socket_unlock(so, 1);
2920 }
2921 out:
2922 if (top != NULL) {
2923 m_freem(top);
2924 }
2925 if (freelist != NULL) {
2926 m_freem_list(freelist);
2927 }
2928
2929 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2930 so->so_snd.sb_cc, 0, error);
2931
2932 return error;
2933 }
2934
2935 /*
2936 * May return ERESTART when packet is dropped by MAC policy check
2937 */
2938 static int
2939 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2940 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2941 {
2942 int error = 0;
2943 struct mbuf *m = *mp;
2944 struct mbuf *nextrecord = *nextrecordp;
2945
2946 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2947 #if CONFIG_MACF_SOCKET_SUBSET
2948 /*
2949 * Call the MAC framework for policy checking if we're in
2950 * the user process context and the socket isn't connected.
2951 */
2952 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2953 struct mbuf *m0 = m;
2954 /*
2955 * Dequeue this record (temporarily) from the receive
2956 * list since we're about to drop the socket's lock
2957 * where a new record may arrive and be appended to
2958 * the list. Upon MAC policy failure, the record
2959 * will be freed. Otherwise, we'll add it back to
2960 * the head of the list. We cannot rely on SB_LOCK
2961 * because append operation uses the socket's lock.
2962 */
2963 do {
2964 m->m_nextpkt = NULL;
2965 sbfree(&so->so_rcv, m);
2966 m = m->m_next;
2967 } while (m != NULL);
2968 m = m0;
2969 so->so_rcv.sb_mb = nextrecord;
2970 SB_EMPTY_FIXUP(&so->so_rcv);
2971 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2972 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2973 socket_unlock(so, 0);
2974
2975 if (mac_socket_check_received(proc_ucred(p), so,
2976 mtod(m, struct sockaddr *)) != 0) {
2977 /*
2978 * MAC policy failure; free this record and
2979 * process the next record (or block until
2980 * one is available). We have adjusted sb_cc
2981 * and sb_mbcnt above so there is no need to
2982 * call sbfree() again.
2983 */
2984 m_freem(m);
2985 /*
2986 * Clear SB_LOCK but don't unlock the socket.
2987 * Process the next record or wait for one.
2988 */
2989 socket_lock(so, 0);
2990 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2991 error = ERESTART;
2992 goto done;
2993 }
2994 socket_lock(so, 0);
2995 /*
2996 * If the socket has been defunct'd, drop it.
2997 */
2998 if (so->so_flags & SOF_DEFUNCT) {
2999 m_freem(m);
3000 error = ENOTCONN;
3001 goto done;
3002 }
3003 /*
3004 * Re-adjust the socket receive list and re-enqueue
3005 * the record in front of any packets which may have
3006 * been appended while we dropped the lock.
3007 */
3008 for (m = m0; m->m_next != NULL; m = m->m_next) {
3009 sballoc(&so->so_rcv, m);
3010 }
3011 sballoc(&so->so_rcv, m);
3012 if (so->so_rcv.sb_mb == NULL) {
3013 so->so_rcv.sb_lastrecord = m0;
3014 so->so_rcv.sb_mbtail = m;
3015 }
3016 m = m0;
3017 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3018 so->so_rcv.sb_mb = m;
3019 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3020 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3021 }
3022 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3023 if (psa != NULL) {
3024 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3025 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3026 error = EWOULDBLOCK;
3027 goto done;
3028 }
3029 }
3030 if (flags & MSG_PEEK) {
3031 m = m->m_next;
3032 } else {
3033 sbfree(&so->so_rcv, m);
3034 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3035 panic("%s: about to create invalid socketbuf",
3036 __func__);
3037 /* NOTREACHED */
3038 }
3039 MFREE(m, so->so_rcv.sb_mb);
3040 m = so->so_rcv.sb_mb;
3041 if (m != NULL) {
3042 m->m_nextpkt = nextrecord;
3043 } else {
3044 so->so_rcv.sb_mb = nextrecord;
3045 SB_EMPTY_FIXUP(&so->so_rcv);
3046 }
3047 }
3048 done:
3049 *mp = m;
3050 *nextrecordp = nextrecord;
3051
3052 return error;
3053 }
3054
3055 /*
3056 * Process one or more MT_CONTROL mbufs present before any data mbufs
3057 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3058 * just copy the data; if !MSG_PEEK, we call into the protocol to
3059 * perform externalization.
3060 */
3061 static int
3062 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3063 struct mbuf **mp, struct mbuf **nextrecordp)
3064 {
3065 int error = 0;
3066 struct mbuf *cm = NULL, *cmn;
3067 struct mbuf **cme = &cm;
3068 struct sockbuf *sb_rcv = &so->so_rcv;
3069 struct mbuf **msgpcm = NULL;
3070 struct mbuf *m = *mp;
3071 struct mbuf *nextrecord = *nextrecordp;
3072 struct protosw *pr = so->so_proto;
3073
3074 /*
3075 * Externalizing the control messages would require us to
3076 * drop the socket's lock below. Once we re-acquire the
3077 * lock, the mbuf chain might change. In order to preserve
3078 * consistency, we unlink all control messages from the
3079 * first mbuf chain in one shot and link them separately
3080 * onto a different chain.
3081 */
3082 do {
3083 if (flags & MSG_PEEK) {
3084 if (controlp != NULL) {
3085 if (*controlp == NULL) {
3086 msgpcm = controlp;
3087 }
3088 *controlp = m_copy(m, 0, m->m_len);
3089
3090 /*
3091 * If we failed to allocate an mbuf,
3092 * release any previously allocated
3093 * mbufs for control data. Return
3094 * an error. Keep the mbufs in the
3095 * socket as this is using
3096 * MSG_PEEK flag.
3097 */
3098 if (*controlp == NULL) {
3099 m_freem(*msgpcm);
3100 error = ENOBUFS;
3101 goto done;
3102 }
3103 controlp = &(*controlp)->m_next;
3104 }
3105 m = m->m_next;
3106 } else {
3107 m->m_nextpkt = NULL;
3108 sbfree(sb_rcv, m);
3109 sb_rcv->sb_mb = m->m_next;
3110 m->m_next = NULL;
3111 *cme = m;
3112 cme = &(*cme)->m_next;
3113 m = sb_rcv->sb_mb;
3114 }
3115 } while (m != NULL && m->m_type == MT_CONTROL);
3116
3117 if (!(flags & MSG_PEEK)) {
3118 if (sb_rcv->sb_mb != NULL) {
3119 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3120 } else {
3121 sb_rcv->sb_mb = nextrecord;
3122 SB_EMPTY_FIXUP(sb_rcv);
3123 }
3124 if (nextrecord == NULL) {
3125 sb_rcv->sb_lastrecord = m;
3126 }
3127 }
3128
3129 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3130 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3131
3132 while (cm != NULL) {
3133 int cmsg_type;
3134
3135 cmn = cm->m_next;
3136 cm->m_next = NULL;
3137 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3138
3139 /*
3140 * Call the protocol to externalize SCM_RIGHTS message
3141 * and return the modified message to the caller upon
3142 * success. Otherwise, all other control messages are
3143 * returned unmodified to the caller. Note that we
3144 * only get into this loop if MSG_PEEK is not set.
3145 */
3146 if (pr->pr_domain->dom_externalize != NULL &&
3147 cmsg_type == SCM_RIGHTS) {
3148 /*
3149 * Release socket lock: see 3903171. This
3150 * would also allow more records to be appended
3151 * to the socket buffer. We still have SB_LOCK
3152 * set on it, so we can be sure that the head
3153 * of the mbuf chain won't change.
3154 */
3155 socket_unlock(so, 0);
3156 error = (*pr->pr_domain->dom_externalize)(cm);
3157 socket_lock(so, 0);
3158 } else {
3159 error = 0;
3160 }
3161
3162 if (controlp != NULL && error == 0) {
3163 *controlp = cm;
3164 controlp = &(*controlp)->m_next;
3165 } else {
3166 (void) m_free(cm);
3167 }
3168 cm = cmn;
3169 }
3170 /*
3171 * Update the value of nextrecord in case we received new
3172 * records when the socket was unlocked above for
3173 * externalizing SCM_RIGHTS.
3174 */
3175 if (m != NULL) {
3176 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3177 } else {
3178 nextrecord = sb_rcv->sb_mb;
3179 }
3180
3181 done:
3182 *mp = m;
3183 *nextrecordp = nextrecord;
3184
3185 return error;
3186 }
3187
3188 /*
3189 * Implement receive operations on a socket.
3190 * We depend on the way that records are added to the sockbuf
3191 * by sbappend*. In particular, each record (mbufs linked through m_next)
3192 * must begin with an address if the protocol so specifies,
3193 * followed by an optional mbuf or mbufs containing ancillary data,
3194 * and then zero or more mbufs of data.
3195 * In order to avoid blocking network interrupts for the entire time here,
3196 * we splx() while doing the actual copy to user space.
3197 * Although the sockbuf is locked, new data may still be appended,
3198 * and thus we must maintain consistency of the sockbuf during that time.
3199 *
3200 * The caller may receive the data as a single mbuf chain by supplying
3201 * an mbuf **mp0 for use in returning the chain. The uio is then used
3202 * only for the count in uio_resid.
3203 *
3204 * Returns: 0 Success
3205 * ENOBUFS
3206 * ENOTCONN
3207 * EWOULDBLOCK
3208 * uiomove:EFAULT
3209 * sblock:EWOULDBLOCK
3210 * sblock:EINTR
3211 * sbwait:EBADF
3212 * sbwait:EINTR
3213 * sodelayed_copy:EFAULT
3214 * <pru_rcvoob>:EINVAL[TCP]
3215 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3216 * <pru_rcvoob>:???
3217 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3218 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3219 * <pr_domain->dom_externalize>:???
3220 *
3221 * Notes: Additional return values from calls through <pru_rcvoob> and
3222 * <pr_domain->dom_externalize> depend on protocols other than
3223 * TCP or AF_UNIX, which are documented above.
3224 */
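
 /*
  * Editorial note (illustrative, not part of the original source): the
  * two usual calling shapes implied by the contract above are
  *
  *	error = soreceive(so, &psa, uio, NULL, &control, &flags);  // copy out via uio
  *	error = soreceive(so, NULL, uio, &m0, NULL, NULL);         // return mbuf chain
  *
  * In the second form the uio supplies only the byte count (uio_resid),
  * and the received data is handed back as a single mbuf chain in *m0.
  */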
3225 int
3226 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3227 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3228 {
3229 struct mbuf *m, **mp, *ml = NULL;
3230 struct mbuf *nextrecord, *free_list;
3231 int flags, error, offset;
3232 user_ssize_t len;
3233 struct protosw *pr = so->so_proto;
3234 int moff, type = 0;
3235 user_ssize_t orig_resid = uio_resid(uio);
3236 user_ssize_t delayed_copy_len;
3237 int can_delay;
3238 int need_event;
3239 struct proc *p = current_proc();
3240 boolean_t en_tracing = FALSE;
3241
3242 /*
3243 * Sanity check on the length passed by caller as we are making 'int'
3244 * comparisons
3245 */
3246 if (orig_resid < 0 || orig_resid > INT_MAX) {
3247 return EINVAL;
3248 }
3249
3250 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3251 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3252 so->so_rcv.sb_hiwat);
3253
3254 socket_lock(so, 1);
3255 so_update_last_owner_locked(so, p);
3256 so_update_policy(so);
3257
3258 #ifdef MORE_LOCKING_DEBUG
3259 if (so->so_usecount == 1) {
3260 panic("%s: so=%x no other reference on socket\n", __func__, so);
3261 /* NOTREACHED */
3262 }
3263 #endif
3264 mp = mp0;
3265 if (psa != NULL) {
3266 *psa = NULL;
3267 }
3268 if (controlp != NULL) {
3269 *controlp = NULL;
3270 }
3271 if (flagsp != NULL) {
3272 flags = *flagsp & ~MSG_EOR;
3273 } else {
3274 flags = 0;
3275 }
3276
3277 /*
3278 * If a recv attempt is made on a previously-accepted socket
3279 * that has been marked as inactive (disconnected), reject
3280 * the request.
3281 */
3282 if (so->so_flags & SOF_DEFUNCT) {
3283 struct sockbuf *sb = &so->so_rcv;
3284
3285 error = ENOTCONN;
3286 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3287 __func__, proc_pid(p), proc_best_name(p),
3288 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3289 SOCK_DOM(so), SOCK_TYPE(so), error);
3290 /*
3291 * This socket should have been disconnected and flushed
3292 * prior to being returned from sodefunct(); there should
3293 * be no data on its receive list, so panic otherwise.
3294 */
3295 if (so->so_state & SS_DEFUNCT) {
3296 sb_empty_assert(sb, __func__);
3297 }
3298 socket_unlock(so, 1);
3299 return error;
3300 }
3301
3302 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3303 pr->pr_usrreqs->pru_preconnect) {
3304 /*
3305 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag without
3306 * calling write() right after this. *If* the app then calls a read,
3307 * we do not want to block this read indefinitely. Thus,
3308 * we trigger a connect so that the session gets initiated.
3309 */
3310 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3311
3312 if (error) {
3313 socket_unlock(so, 1);
3314 return error;
3315 }
3316 }
3317
3318 if (ENTR_SHOULDTRACE &&
3319 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3320 /*
3321 * enable energy tracing for inet sockets that go over
3322 * non-loopback interfaces only.
3323 */
3324 struct inpcb *inp = sotoinpcb(so);
3325 if (inp->inp_last_outifp != NULL &&
3326 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3327 en_tracing = TRUE;
3328 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3329 VM_KERNEL_ADDRPERM(so),
3330 ((so->so_state & SS_NBIO) ?
3331 kEnTrFlagNonBlocking : 0),
3332 (int64_t)orig_resid);
3333 }
3334 }
3335
3336 /*
3337 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3338 * regardless of the flags argument. Here is the case where
3339 * out-of-band data is not inline.
3340 */
3341 if ((flags & MSG_OOB) ||
3342 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3343 (so->so_options & SO_OOBINLINE) == 0 &&
3344 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3345 m = m_get(M_WAIT, MT_DATA);
3346 if (m == NULL) {
3347 socket_unlock(so, 1);
3348 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3349 ENOBUFS, 0, 0, 0, 0);
3350 return ENOBUFS;
3351 }
3352 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3353 if (error) {
3354 goto bad;
3355 }
3356 socket_unlock(so, 0);
3357 do {
3358 error = uiomove(mtod(m, caddr_t),
3359 imin(uio_resid(uio), m->m_len), uio);
3360 m = m_free(m);
3361 } while (uio_resid(uio) && error == 0 && m != NULL);
3362 socket_lock(so, 0);
3363 bad:
3364 if (m != NULL) {
3365 m_freem(m);
3366 }
3367
3368 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3369 if (error == EWOULDBLOCK || error == EINVAL) {
3370 /*
3371 * Let's try to get normal data:
3372 * EWOULDBLOCK: out-of-band data not
3373 * received yet. EINVAL: out-of-band data
3374 * already read.
3375 */
3376 error = 0;
3377 goto nooob;
3378 } else if (error == 0 && flagsp != NULL) {
3379 *flagsp |= MSG_OOB;
3380 }
3381 }
3382 socket_unlock(so, 1);
3383 if (en_tracing) {
3384 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3385 VM_KERNEL_ADDRPERM(so), 0,
3386 (int64_t)(orig_resid - uio_resid(uio)));
3387 }
3388 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3389 0, 0, 0, 0);
3390
3391 return error;
3392 }
3393 nooob:
3394 if (mp != NULL) {
3395 *mp = NULL;
3396 }
3397
3398 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3399 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3400 }
3401
3402 free_list = NULL;
3403 delayed_copy_len = 0;
3404 restart:
3405 #ifdef MORE_LOCKING_DEBUG
3406 if (so->so_usecount <= 1) {
3407 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3408 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3409 }
3410 #endif
3411 /*
3412 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3413 * and if so just return to the caller. This could happen when
3414 * soreceive() is called by a socket upcall function during the
3415 * time the socket is freed. The socket buffer would have been
3416 * locked across the upcall, therefore we cannot put this thread
3417 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3418 * we may livelock), because the lock on the socket buffer will
3419 * only be released when the upcall routine returns to its caller.
3420 * Because the socket has been officially closed, there can be
3421 * no further read on it.
3422 *
3423 * A multipath subflow socket would have its SS_NOFDREF set by
3424 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3425 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3426 */
3427 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3428 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3429 socket_unlock(so, 1);
3430 return 0;
3431 }
3432
3433 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3434 if (error) {
3435 socket_unlock(so, 1);
3436 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3437 0, 0, 0, 0);
3438 if (en_tracing) {
3439 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3440 VM_KERNEL_ADDRPERM(so), 0,
3441 (int64_t)(orig_resid - uio_resid(uio)));
3442 }
3443 return error;
3444 }
3445
3446 m = so->so_rcv.sb_mb;
3447 /*
3448 * If we have less data than requested, block awaiting more
3449 * (subject to any timeout) if:
3450 * 1. the current count is less than the low water mark, or
3451 * 2. MSG_WAITALL is set, and it is possible to do the entire
3452 * receive operation at once if we block (resid <= hiwat).
3453 * 3. MSG_DONTWAIT is not set
3454 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3455 * we have to do the receive in sections, and thus risk returning
3456 * a short count if a timeout or signal occurs after we start.
3457 */
3458 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3459 so->so_rcv.sb_cc < uio_resid(uio)) &&
3460 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3461 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3462 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3463 /*
3464 * Panic if we notice inconsistencies in the socket's
3465 * receive list; both sb_mb and sb_cc should correctly
3466 * reflect the contents of the list, otherwise we may
3467 * end up with false positives during select() or poll()
3468 * which could put the application in a bad state.
3469 */
3470 SB_MB_CHECK(&so->so_rcv);
3471
3472 if (so->so_error) {
3473 if (m != NULL) {
3474 goto dontblock;
3475 }
3476 error = so->so_error;
3477 if ((flags & MSG_PEEK) == 0) {
3478 so->so_error = 0;
3479 }
3480 goto release;
3481 }
3482 if (so->so_state & SS_CANTRCVMORE) {
3483 #if CONTENT_FILTER
3484 /*
3485 * Deal with half closed connections
3486 */
3487 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3488 cfil_sock_data_pending(&so->so_rcv) != 0) {
3489 CFIL_LOG(LOG_INFO,
3490 "so %llx ignore SS_CANTRCVMORE",
3491 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3492 } else
3493 #endif /* CONTENT_FILTER */
3494 if (m != NULL) {
3495 goto dontblock;
3496 } else {
3497 goto release;
3498 }
3499 }
3500 for (; m != NULL; m = m->m_next) {
3501 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3502 m = so->so_rcv.sb_mb;
3503 goto dontblock;
3504 }
3505 }
3506 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3507 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3508 error = ENOTCONN;
3509 goto release;
3510 }
3511 if (uio_resid(uio) == 0) {
3512 goto release;
3513 }
3514
3515 if ((so->so_state & SS_NBIO) ||
3516 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3517 error = EWOULDBLOCK;
3518 goto release;
3519 }
3520 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3521 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3522 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3523 #if EVEN_MORE_LOCKING_DEBUG
3524 if (socket_debug) {
3525 printf("Waiting for socket data\n");
3526 }
3527 #endif
3528
3529 error = sbwait(&so->so_rcv);
3530 #if EVEN_MORE_LOCKING_DEBUG
3531 if (socket_debug) {
3532 printf("SORECEIVE - sbwait returned %d\n", error);
3533 }
3534 #endif
3535 if (so->so_usecount < 1) {
3536 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3537 __func__, so, so->so_usecount);
3538 /* NOTREACHED */
3539 }
3540 if (error) {
3541 socket_unlock(so, 1);
3542 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3543 0, 0, 0, 0);
3544 if (en_tracing) {
3545 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3546 VM_KERNEL_ADDRPERM(so), 0,
3547 (int64_t)(orig_resid - uio_resid(uio)));
3548 }
3549 return error;
3550 }
3551 goto restart;
3552 }
3553 dontblock:
3554 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3555 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3556 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3557 nextrecord = m->m_nextpkt;
3558
3559 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3560 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3561 mp0 == NULL);
3562 if (error == ERESTART) {
3563 goto restart;
3564 } else if (error != 0) {
3565 goto release;
3566 }
3567 orig_resid = 0;
3568 }
3569
3570 /*
3571 * Process one or more MT_CONTROL mbufs present before any data mbufs
3572 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3573 * just copy the data; if !MSG_PEEK, we call into the protocol to
3574 * perform externalization.
3575 */
3576 if (m != NULL && m->m_type == MT_CONTROL) {
3577 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3578 if (error != 0) {
3579 goto release;
3580 }
3581 orig_resid = 0;
3582 }
3583
3584 /*
3585 * If the socket is a TCP socket with message delivery
3586 * enabled, then create a control msg to deliver the
3587 * relative TCP sequence number for this data. Waiting
3588 * until this point will protect against failures to
3589 * allocate an mbuf for control msgs.
3590 */
3591 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3592 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3593 struct mbuf *seq_cm;
3594
3595 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3596 sizeof(uint32_t), SCM_SEQNUM, SOL_SOCKET);
3597 if (seq_cm == NULL) {
3598 /* unable to allocate a control mbuf */
3599 error = ENOBUFS;
3600 goto release;
3601 }
3602 *controlp = seq_cm;
3603 controlp = &seq_cm->m_next;
3604 }
3605
3606 if (m != NULL) {
3607 if (!(flags & MSG_PEEK)) {
3608 /*
3609 * We get here because m points to an mbuf following
3610 * any MT_SONAME or MT_CONTROL mbufs which have been
3611 * processed above. In any case, m should be pointing
3612 * to the head of the mbuf chain, and the nextrecord
3613 * should be either NULL or equal to m->m_nextpkt.
3614 * See comments above about SB_LOCK.
3615 */
3616 if (m != so->so_rcv.sb_mb ||
3617 m->m_nextpkt != nextrecord) {
3618 panic("%s: post-control !sync so=%p m=%p "
3619 "nextrecord=%p\n", __func__, so, m,
3620 nextrecord);
3621 /* NOTREACHED */
3622 }
3623 if (nextrecord == NULL) {
3624 so->so_rcv.sb_lastrecord = m;
3625 }
3626 }
3627 type = m->m_type;
3628 if (type == MT_OOBDATA) {
3629 flags |= MSG_OOB;
3630 }
3631 } else {
3632 if (!(flags & MSG_PEEK)) {
3633 SB_EMPTY_FIXUP(&so->so_rcv);
3634 }
3635 }
3636 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3637 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3638
3639 moff = 0;
3640 offset = 0;
3641
3642 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3643 can_delay = 1;
3644 } else {
3645 can_delay = 0;
3646 }
3647
3648 need_event = 0;
3649
3650 while (m != NULL &&
3651 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3652 if (m->m_type == MT_OOBDATA) {
3653 if (type != MT_OOBDATA) {
3654 break;
3655 }
3656 } else if (type == MT_OOBDATA) {
3657 break;
3658 }
3659 /*
3660 * Make sure to always set MSG_OOB event when getting
3661 * out of band data inline.
3662 */
3663 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3664 (so->so_options & SO_OOBINLINE) != 0 &&
3665 (so->so_state & SS_RCVATMARK) != 0) {
3666 flags |= MSG_OOB;
3667 }
3668 so->so_state &= ~SS_RCVATMARK;
3669 len = uio_resid(uio) - delayed_copy_len;
3670 if (so->so_oobmark && len > so->so_oobmark - offset) {
3671 len = so->so_oobmark - offset;
3672 }
3673 if (len > m->m_len - moff) {
3674 len = m->m_len - moff;
3675 }
3676 /*
3677 * If mp is set, just pass back the mbufs.
3678 * Otherwise copy them out via the uio, then free.
3679 * Sockbuf must be consistent here (points to current mbuf,
3680 * it points to next record) when we drop priority;
3681 * we must note any additions to the sockbuf when we
3682 * block interrupts again.
3683 */
3684 if (mp == NULL) {
3685 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3686 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3687 if (can_delay && len == m->m_len) {
3688 /*
3689 * only delay the copy if we're consuming the
3690 * mbuf and we're NOT in MSG_PEEK mode
3691 * and we have enough data to make it worthwhile
3692 * to drop and retake the lock... can_delay
3693 * reflects the state of the 2 latter
3694 * constraints; moff should always be zero
3695 * in these cases
3696 */
3697 delayed_copy_len += len;
3698 } else {
3699 if (delayed_copy_len) {
3700 error = sodelayed_copy(so, uio,
3701 &free_list, &delayed_copy_len);
3702
3703 if (error) {
3704 goto release;
3705 }
3706 /*
3707 * We can only get here if MSG_PEEK is not
3708 * set; therefore, m should point at the
3709 * head of the rcv queue; if it doesn't,
3710 * it means something drastically
3711 * changed while we were out from behind
3712 * the lock in sodelayed_copy. perhaps
3713 * a RST on the stream. in any event,
3714 * the stream has been interrupted. it's
3715 * probably best just to return whatever
3716 * data we've moved and let the caller
3717 * sort it out...
3718 */
3719 if (m != so->so_rcv.sb_mb) {
3720 break;
3721 }
3722 }
3723 socket_unlock(so, 0);
3724 error = uiomove(mtod(m, caddr_t) + moff,
3725 (int)len, uio);
3726 socket_lock(so, 0);
3727
3728 if (error) {
3729 goto release;
3730 }
3731 }
3732 } else {
3733 uio_setresid(uio, (uio_resid(uio) - len));
3734 }
3735 if (len == m->m_len - moff) {
3736 if (m->m_flags & M_EOR) {
3737 flags |= MSG_EOR;
3738 }
3739 if (flags & MSG_PEEK) {
3740 m = m->m_next;
3741 moff = 0;
3742 } else {
3743 nextrecord = m->m_nextpkt;
3744 sbfree(&so->so_rcv, m);
3745 m->m_nextpkt = NULL;
3746
3747 /*
3748 * If this packet is an unordered packet
3749 * (indicated by M_UNORDERED_DATA flag), remove
3750 * the additional bytes added to the
3751 * receive socket buffer size.
3752 */
3753 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3754 m->m_len &&
3755 (m->m_flags & M_UNORDERED_DATA) &&
3756 sbreserve(&so->so_rcv,
3757 so->so_rcv.sb_hiwat - m->m_len)) {
3758 if (so->so_msg_state->msg_uno_bytes >
3759 m->m_len) {
3760 so->so_msg_state->
3761 msg_uno_bytes -= m->m_len;
3762 } else {
3763 so->so_msg_state->
3764 msg_uno_bytes = 0;
3765 }
3766 m->m_flags &= ~M_UNORDERED_DATA;
3767 }
3768
3769 if (mp != NULL) {
3770 *mp = m;
3771 mp = &m->m_next;
3772 so->so_rcv.sb_mb = m = m->m_next;
3773 *mp = NULL;
3774 } else {
3775 if (free_list == NULL) {
3776 free_list = m;
3777 } else {
3778 ml->m_next = m;
3779 }
3780 ml = m;
3781 so->so_rcv.sb_mb = m = m->m_next;
3782 ml->m_next = NULL;
3783 }
3784 if (m != NULL) {
3785 m->m_nextpkt = nextrecord;
3786 if (nextrecord == NULL) {
3787 so->so_rcv.sb_lastrecord = m;
3788 }
3789 } else {
3790 so->so_rcv.sb_mb = nextrecord;
3791 SB_EMPTY_FIXUP(&so->so_rcv);
3792 }
3793 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3794 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3795 }
3796 } else {
3797 if (flags & MSG_PEEK) {
3798 moff += len;
3799 } else {
3800 if (mp != NULL) {
3801 int copy_flag;
3802
3803 if (flags & MSG_DONTWAIT) {
3804 copy_flag = M_DONTWAIT;
3805 } else {
3806 copy_flag = M_WAIT;
3807 }
3808 *mp = m_copym(m, 0, len, copy_flag);
3809 /*
3810 * Failed to allocate an mbuf?
3811 * Adjust uio_resid back, it was
3812 * adjusted down by len bytes which
3813 * we didn't copy over.
3814 */
3815 if (*mp == NULL) {
3816 uio_setresid(uio,
3817 (uio_resid(uio) + len));
3818 break;
3819 }
3820 }
3821 m->m_data += len;
3822 m->m_len -= len;
3823 so->so_rcv.sb_cc -= len;
3824 }
3825 }
3826 if (so->so_oobmark) {
3827 if ((flags & MSG_PEEK) == 0) {
3828 so->so_oobmark -= len;
3829 if (so->so_oobmark == 0) {
3830 so->so_state |= SS_RCVATMARK;
3831 /*
3832 * delay posting the actual event until
3833 * after any delayed copy processing
3834 * has finished
3835 */
3836 need_event = 1;
3837 break;
3838 }
3839 } else {
3840 offset += len;
3841 if (offset == so->so_oobmark) {
3842 break;
3843 }
3844 }
3845 }
3846 if (flags & MSG_EOR) {
3847 break;
3848 }
3849 /*
3850 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3851 * (for non-atomic socket), we must not quit until
3852 * "uio->uio_resid == 0" or an error termination.
3853 * If a signal/timeout occurs, return with a short
3854 * count but without error. Keep sockbuf locked
3855 * against other readers.
3856 */
3857 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3858 (uio_resid(uio) - delayed_copy_len) > 0 &&
3859 !sosendallatonce(so) && !nextrecord) {
3860 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3861 #if CONTENT_FILTER
3862 && cfil_sock_data_pending(&so->so_rcv) == 0
3863 #endif /* CONTENT_FILTER */
3864 )) {
3865 goto release;
3866 }
3867
3868 /*
3869 * Depending on the protocol (e.g. TCP), the following
3870 * might cause the socket lock to be dropped and later
3871 * be reacquired, and more data could have arrived and
3872 * have been appended to the receive socket buffer by
3873 * the time it returns. Therefore, we sleep in
3874 * sbwait() below if and only if the socket buffer is
3875 * empty, in order to avoid a false sleep.
3876 */
3877 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3878 (((struct inpcb *)so->so_pcb)->inp_state !=
3879 INPCB_STATE_DEAD)) {
3880 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3881 }
3882
3883 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3884 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3885
3886 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3887 error = 0;
3888 goto release;
3889 }
3890 /*
3891 * have to wait until after we get back from the sbwait
3892 * to do the copy because we will drop the lock if we
3893 * have enough data that has been delayed... by dropping
3894 * the lock we open up a window allowing the netisr
3895 * thread to process the incoming packets and to change
3896 * the state of this socket... we're issuing the sbwait
3897 * because the socket is empty and we're expecting the
3898 * netisr thread to wake us up when more packets arrive;
3899 * if we allow that processing to happen and then sbwait
3900 * we could stall forever with packets sitting in the
3901 * socket if no further packets arrive from the remote
3902 * side.
3903 *
3904 * we want to copy before we've collected all the data
3905 * to satisfy this request to allow the copy to overlap
3906 * the incoming packet processing on an MP system
3907 */
3908 if (delayed_copy_len > sorecvmincopy &&
3909 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3910 error = sodelayed_copy(so, uio,
3911 &free_list, &delayed_copy_len);
3912
3913 if (error) {
3914 goto release;
3915 }
3916 }
3917 m = so->so_rcv.sb_mb;
3918 if (m != NULL) {
3919 nextrecord = m->m_nextpkt;
3920 }
3921 SB_MB_CHECK(&so->so_rcv);
3922 }
3923 }
3924 #ifdef MORE_LOCKING_DEBUG
3925 if (so->so_usecount <= 1) {
3926 panic("%s: after big while so=%p ref=%d on socket\n",
3927 __func__, so, so->so_usecount);
3928 /* NOTREACHED */
3929 }
3930 #endif
3931
3932 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3933 if (so->so_options & SO_DONTTRUNC) {
3934 flags |= MSG_RCVMORE;
3935 } else {
3936 flags |= MSG_TRUNC;
3937 if ((flags & MSG_PEEK) == 0) {
3938 (void) sbdroprecord(&so->so_rcv);
3939 }
3940 }
3941 }
3942
3943 /*
3944 * pru_rcvd below (for TCP) may cause more data to be received
3945 * if the socket lock is dropped prior to sending the ACK; some
3946 * legacy OpenTransport applications don't handle this well
3947 * (if it receives less data than requested while MSG_HAVEMORE
3948 * is set), and so we set the flag now based on what we know
3949 * prior to calling pru_rcvd.
3950 */
3951 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3952 flags |= MSG_HAVEMORE;
3953 }
3954
3955 if ((flags & MSG_PEEK) == 0) {
3956 if (m == NULL) {
3957 so->so_rcv.sb_mb = nextrecord;
3958 /*
3959 * First part is an inline SB_EMPTY_FIXUP(). Second
3960 * part makes sure sb_lastrecord is up-to-date if
3961 * there is still data in the socket buffer.
3962 */
3963 if (so->so_rcv.sb_mb == NULL) {
3964 so->so_rcv.sb_mbtail = NULL;
3965 so->so_rcv.sb_lastrecord = NULL;
3966 } else if (nextrecord->m_nextpkt == NULL) {
3967 so->so_rcv.sb_lastrecord = nextrecord;
3968 }
3969 SB_MB_CHECK(&so->so_rcv);
3970 }
3971 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3972 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3973 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
3974 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3975 }
3976 }
3977
3978 if (delayed_copy_len) {
3979 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3980 if (error) {
3981 goto release;
3982 }
3983 }
3984 if (free_list != NULL) {
3985 m_freem_list(free_list);
3986 free_list = NULL;
3987 }
3988 if (need_event) {
3989 postevent(so, 0, EV_OOB);
3990 }
3991
3992 if (orig_resid == uio_resid(uio) && orig_resid &&
3993 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3994 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3995 goto restart;
3996 }
3997
3998 if (flagsp != NULL) {
3999 *flagsp |= flags;
4000 }
4001 release:
4002 #ifdef MORE_LOCKING_DEBUG
4003 if (so->so_usecount <= 1) {
4004 panic("%s: release so=%p ref=%d on socket\n", __func__,
4005 so, so->so_usecount);
4006 /* NOTREACHED */
4007 }
4008 #endif
4009 if (delayed_copy_len) {
4010 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4011 }
4012
4013 if (free_list != NULL) {
4014 m_freem_list(free_list);
4015 }
4016
4017 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4018
4019 if (en_tracing) {
4020 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4021 VM_KERNEL_ADDRPERM(so),
4022 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4023 (int64_t)(orig_resid - uio_resid(uio)));
4024 }
4025 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4026 so->so_rcv.sb_cc, 0, error);
4027
4028 return error;
4029 }
4030
4031 /*
4032 * Returns: 0 Success
4033 * uiomove:EFAULT
4034 */
4035 static int
4036 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4037 user_ssize_t *resid)
4038 {
4039 int error = 0;
4040 struct mbuf *m;
4041
4042 m = *free_list;
4043
4044 socket_unlock(so, 0);
4045
4046 while (m != NULL && error == 0) {
4047 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4048 m = m->m_next;
4049 }
4050 m_freem_list(*free_list);
4051
4052 *free_list = NULL;
4053 *resid = 0;
4054
4055 socket_lock(so, 0);
4056
4057 return error;
4058 }
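/*
 * [Editor's sketch -- not part of the original source.] sodelayed_copy()
 * is safe only because every mbuf on free_list was already unlinked from
 * so_rcv (sbfree() plus sb_mb advancement) while the socket lock was held,
 * so dropping the lock around uiomove() cannot race with the protocol
 * appending new data. The caller-side pattern, mirroring soreceive()
 * above, looks roughly like this:
 */
#if 0	/* illustration only */
	if (delayed_copy_len > sorecvmincopy &&
	    delayed_copy_len > (so->so_rcv.sb_hiwat / 2)) {
		error = sodelayed_copy(so, uio, &free_list,
		    &delayed_copy_len);
		if (error != 0) {
			goto release;
		}
	}
#endif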
4059
4060 static int
4061 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4062 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4063 {
4064 #pragma unused(so)
4065 int error = 0;
4066 struct mbuf *ml, *m;
4067 int i = 0;
4068 struct uio *auio;
4069
4070 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4071 ml = ml->m_nextpkt, i++) {
4072 auio = msgarray[i].uio;
4073 for (m = ml; m != NULL; m = m->m_next) {
4074 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4075 if (error != 0) {
4076 goto out;
4077 }
4078 }
4079 }
4080 out:
4081 m_freem_list(*free_list);
4082
4083 *free_list = NULL;
4084 *resid = 0;
4085
4086 return error;
4087 }
4088
4089 int
4090 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4091 int *flagsp)
4092 {
4093 struct mbuf *m;
4094 struct mbuf *nextrecord;
4095 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4096 int error;
4097 user_ssize_t len, pktlen, delayed_copy_len = 0;
4098 struct protosw *pr = so->so_proto;
4099 user_ssize_t resid;
4100 struct proc *p = current_proc();
4101 struct uio *auio = NULL;
4102 int npkts = 0;
4103 int sblocked = 0;
4104 struct sockaddr **psa = NULL;
4105 struct mbuf **controlp = NULL;
4106 int can_delay;
4107 int flags;
4108 struct mbuf *free_others = NULL;
4109
4110 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4111 so, uiocnt,
4112 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4113
4114 /*
4115 * Sanity checks:
4116 * - Only supports the don't-wait flags
4117 * - Only supports datagram sockets (could be extended to raw)
4118 * - Must be atomic
4119 * - Protocol must support packet chains
4120 * - The uio array must not be NULL (should we panic otherwise?)
4121 */
4122 if (flagsp != NULL) {
4123 flags = *flagsp;
4124 } else {
4125 flags = 0;
4126 }
4127 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4128 MSG_NBIO)) {
4129 printf("%s invalid flags 0x%x\n", __func__, flags);
4130 error = EINVAL;
4131 goto out;
4132 }
4133 if (so->so_type != SOCK_DGRAM) {
4134 error = EINVAL;
4135 goto out;
4136 }
4137 if (sosendallatonce(so) == 0) {
4138 error = EINVAL;
4139 goto out;
4140 }
4141 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4142 error = EPROTONOSUPPORT;
4143 goto out;
4144 }
4145 if (msgarray == NULL) {
4146 printf("%s uioarray is NULL\n", __func__);
4147 error = EINVAL;
4148 goto out;
4149 }
4150 if (uiocnt == 0) {
4151 printf("%s uiocnt is 0\n", __func__);
4152 error = EINVAL;
4153 goto out;
4154 }
4155 /*
4156 * Sanity check on the length passed by caller as we are making 'int'
4157 * comparisons
4158 */
4159 resid = recv_msg_array_resid(msgarray, uiocnt);
4160 if (resid < 0 || resid > INT_MAX) {
4161 error = EINVAL;
4162 goto out;
4163 }
4164
4165 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4166 can_delay = 1;
4167 } else {
4168 can_delay = 0;
4169 }
4170
4171 socket_lock(so, 1);
4172 so_update_last_owner_locked(so, p);
4173 so_update_policy(so);
4174
4175 #if NECP
4176 so_update_necp_policy(so, NULL, NULL);
4177 #endif /* NECP */
4178
4179 /*
4180 * If a recv attempt is made on a previously-accepted socket
4181 * that has been marked as inactive (disconnected), reject
4182 * the request.
4183 */
4184 if (so->so_flags & SOF_DEFUNCT) {
4185 struct sockbuf *sb = &so->so_rcv;
4186
4187 error = ENOTCONN;
4188 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4189 __func__, proc_pid(p), proc_best_name(p),
4190 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4191 SOCK_DOM(so), SOCK_TYPE(so), error);
4192 /*
4193 * This socket should have been disconnected and flushed
4194 * prior to being returned from sodefunct(); there should
4195 * be no data on its receive list, so panic otherwise.
4196 */
4197 if (so->so_state & SS_DEFUNCT) {
4198 sb_empty_assert(sb, __func__);
4199 }
4200 goto release;
4201 }
4202
4203 next:
4204 /*
4205 * The uio may be empty
4206 */
4207 if (npkts >= uiocnt) {
4208 error = 0;
4209 goto release;
4210 }
4211 restart:
4212 /*
4213 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4214 * and if so just return to the caller. This could happen when
4215 * soreceive() is called by a socket upcall function during the
4216 * time the socket is freed. The socket buffer would have been
4217 * locked across the upcall, therefore we cannot put this thread
4218 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4219 * we may livelock), because the lock on the socket buffer will
4220 * only be released when the upcall routine returns to its caller.
4221 * Because the socket has been officially closed, there can be
4222 * no further read on it.
4223 */
4224 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4225 (SS_NOFDREF | SS_CANTRCVMORE)) {
4226 error = 0;
4227 goto release;
4228 }
4229
4230 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4231 if (error) {
4232 goto release;
4233 }
4234 sblocked = 1;
4235
4236 m = so->so_rcv.sb_mb;
4237 /*
4238 * Block awaiting more datagram if needed
4239 */
4240 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4241 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4242 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4243 /*
4244 * Panic if we notice inconsistencies in the socket's
4245 * receive list; both sb_mb and sb_cc should correctly
4246 * reflect the contents of the list, otherwise we may
4247 * end up with false positives during select() or poll()
4248 * which could put the application in a bad state.
4249 */
4250 SB_MB_CHECK(&so->so_rcv);
4251
4252 if (so->so_error) {
4253 error = so->so_error;
4254 if ((flags & MSG_PEEK) == 0) {
4255 so->so_error = 0;
4256 }
4257 goto release;
4258 }
4259 if (so->so_state & SS_CANTRCVMORE) {
4260 goto release;
4261 }
4262 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4263 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4264 error = ENOTCONN;
4265 goto release;
4266 }
4267 if ((so->so_state & SS_NBIO) ||
4268 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4269 error = EWOULDBLOCK;
4270 goto release;
4271 }
4272 /*
4273 * Do not block if we got some data
4274 */
4275 if (free_list != NULL) {
4276 error = 0;
4277 goto release;
4278 }
4279
4280 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4281 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4282
4283 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4284 sblocked = 0;
4285
4286 error = sbwait(&so->so_rcv);
4287 if (error) {
4288 goto release;
4289 }
4290 goto restart;
4291 }
4292
4293 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4294 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4295 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4296
4297 /*
4298 * Consume the current uio index as we have a datagram
4299 */
4300 auio = msgarray[npkts].uio;
4301 resid = uio_resid(auio);
4302 msgarray[npkts].which |= SOCK_MSG_DATA;
4303 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4304 &msgarray[npkts].psa : NULL;
4305 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4306 &msgarray[npkts].controlp : NULL;
4307 npkts += 1;
4308 nextrecord = m->m_nextpkt;
4309
4310 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4311 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4312 if (error == ERESTART) {
4313 goto restart;
4314 } else if (error != 0) {
4315 goto release;
4316 }
4317 }
4318
4319 if (m != NULL && m->m_type == MT_CONTROL) {
4320 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4321 if (error != 0) {
4322 goto release;
4323 }
4324 }
4325
4326 if (m->m_pkthdr.len == 0) {
4327 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4328 __func__, __LINE__,
4329 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4330 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4331 m->m_type);
4332 }
4333
4334 /*
4335 * Loop to copy the mbufs of the current record
4336 * Support zero length packets
4337 */
4338 ml = NULL;
4339 pktlen = 0;
4340 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4341 if (m->m_len == 0) {
4342 panic("%p m_len zero", m);
4343 }
4344 if (m->m_type == 0) {
4345 panic("%p m_type zero", m);
4346 }
4347 /*
4348 * Clip to the residual length
4349 */
4350 if (len > m->m_len) {
4351 len = m->m_len;
4352 }
4353 pktlen += len;
4354 /*
4355 * Copy the mbufs via the uio or delay the copy.
4356 * The sockbuf must be consistent here (sb_mb points to the
4357 * current mbuf, m_nextpkt to the next record) when we drop
4358 * the socket lock; we must note any additions to the sockbuf
4359 * when we reacquire it.
4360 */
4361 if (len > 0 && can_delay == 0) {
4362 socket_unlock(so, 0);
4363 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4364 socket_lock(so, 0);
4365 if (error) {
4366 goto release;
4367 }
4368 } else {
4369 delayed_copy_len += len;
4370 }
4371
4372 if (len == m->m_len) {
4373 /*
4374 * m was entirely copied
4375 */
4376 sbfree(&so->so_rcv, m);
4377 nextrecord = m->m_nextpkt;
4378 m->m_nextpkt = NULL;
4379
4380 /*
4381 * Set the first packet to the head of the free list
4382 */
4383 if (free_list == NULL) {
4384 free_list = m;
4385 }
4386 /*
4387 * Link current packet to tail of free list
4388 */
4389 if (ml == NULL) {
4390 if (free_tail != NULL) {
4391 free_tail->m_nextpkt = m;
4392 }
4393 free_tail = m;
4394 }
4395 /*
4396 * Link current mbuf to last mbuf of current packet
4397 */
4398 if (ml != NULL) {
4399 ml->m_next = m;
4400 }
4401 ml = m;
4402
4403 /*
4404 * Move next buf to head of socket buffer
4405 */
4406 so->so_rcv.sb_mb = m = ml->m_next;
4407 ml->m_next = NULL;
4408
4409 if (m != NULL) {
4410 m->m_nextpkt = nextrecord;
4411 if (nextrecord == NULL) {
4412 so->so_rcv.sb_lastrecord = m;
4413 }
4414 } else {
4415 so->so_rcv.sb_mb = nextrecord;
4416 SB_EMPTY_FIXUP(&so->so_rcv);
4417 }
4418 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4419 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4420 } else {
4421 /*
4422 * Stop the loop on partial copy
4423 */
4424 break;
4425 }
4426 }
4427 #ifdef MORE_LOCKING_DEBUG
4428 if (so->so_usecount <= 1) {
4429 panic("%s: after big while so=%llx ref=%d on socket\n",
4430 __func__,
4431 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4432 /* NOTREACHED */
4433 }
4434 #endif
4435 /*
4436 * Tell the caller we made a partial copy
4437 */
4438 if (m != NULL) {
4439 if (so->so_options & SO_DONTTRUNC) {
4440 /*
4441 * Copyout first the freelist then the partial mbuf
4442 */
4443 socket_unlock(so, 0);
4444 if (delayed_copy_len) {
4445 error = sodelayed_copy_list(so, msgarray,
4446 uiocnt, &free_list, &delayed_copy_len);
4447 }
4448
4449 if (error == 0) {
4450 error = uiomove(mtod(m, caddr_t), (int)len,
4451 auio);
4452 }
4453 socket_lock(so, 0);
4454 if (error) {
4455 goto release;
4456 }
4457
4458 m->m_data += len;
4459 m->m_len -= len;
4460 so->so_rcv.sb_cc -= len;
4461 flags |= MSG_RCVMORE;
4462 } else {
4463 (void) sbdroprecord(&so->so_rcv);
4464 nextrecord = so->so_rcv.sb_mb;
4465 m = NULL;
4466 flags |= MSG_TRUNC;
4467 }
4468 }
4469
4470 if (m == NULL) {
4471 so->so_rcv.sb_mb = nextrecord;
4472 /*
4473 * First part is an inline SB_EMPTY_FIXUP(). Second
4474 * part makes sure sb_lastrecord is up-to-date if
4475 * there is still data in the socket buffer.
4476 */
4477 if (so->so_rcv.sb_mb == NULL) {
4478 so->so_rcv.sb_mbtail = NULL;
4479 so->so_rcv.sb_lastrecord = NULL;
4480 } else if (nextrecord->m_nextpkt == NULL) {
4481 so->so_rcv.sb_lastrecord = nextrecord;
4482 }
4483 SB_MB_CHECK(&so->so_rcv);
4484 }
4485 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4486 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4487
4488 /*
4489 * We can continue to the next packet as long as:
4490 * - We haven't exhausted the uio array
4491 * - There was no error
4492 * - A packet was not truncated
4493 * - We can still receive more data
4494 */
4495 if (npkts < uiocnt && error == 0 &&
4496 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4497 (so->so_state & SS_CANTRCVMORE) == 0) {
4498 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4499 sblocked = 0;
4500
4501 goto next;
4502 }
4503 if (flagsp != NULL) {
4504 *flagsp |= flags;
4505 }
4506
4507 release:
4508 /*
4509 * pru_rcvd may cause more data to be received if the socket lock
4510 * is dropped so we set MSG_HAVEMORE now based on what we know.
4511 * That way the caller won't be surprised if it receives less data
4512 * than requested.
4513 */
4514 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4515 flags |= MSG_HAVEMORE;
4516 }
4517
4518 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4519 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4520 }
4521
4522 if (sblocked) {
4523 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4524 } else {
4525 socket_unlock(so, 1);
4526 }
4527
4528 if (delayed_copy_len) {
4529 error = sodelayed_copy_list(so, msgarray, uiocnt,
4530 &free_list, &delayed_copy_len);
4531 }
4532 out:
4533 /*
4534 * Amortize the cost of freeing the mbufs
4535 */
4536 if (free_list != NULL) {
4537 m_freem_list(free_list);
4538 }
4539 if (free_others != NULL) {
4540 m_freem_list(free_others);
4541 }
4542
4543 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4544 0, 0, 0, 0);
4545 return error;
4546 }
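/*
 * [Editor's sketch -- not part of the original source.] A caller of
 * soreceive_list() inspects the per-element results roughly as below;
 * example_consume_results() is a hypothetical helper. Elements that
 * received a datagram have SOCK_MSG_DATA set in 'which'; when
 * SOCK_MSG_SA or SOCK_MSG_CONTROL were requested, 'psa' and 'controlp'
 * carry the source address and ancillary mbufs, which the caller is
 * responsible for releasing.
 */
#if 0	/* illustration only */
static void
example_consume_results(struct recv_msg_elem *msgarray, u_int uiocnt)
{
	u_int i;

	for (i = 0; i < uiocnt; i++) {
		struct recv_msg_elem *e = &msgarray[i];

		if ((e->which & SOCK_MSG_DATA) == 0) {
			continue;	/* no datagram landed here */
		}
		/* Data was copied into e->uio; uio_resid(e->uio) tells
		 * how much of the caller's buffer is left unused. */
		if (e->controlp != NULL) {
			/* ... consume ancillary data, then free it ... */
			m_freem(e->controlp);
			e->controlp = NULL;
		}
		/* e->psa, if requested, holds the source address and is
		 * likewise released by the caller. */
	}
}
#endif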
4547
4548 static int
4549 so_statistics_event_to_nstat_event(int64_t *input_options,
4550 uint64_t *nstat_event)
4551 {
4552 int error = 0;
4553 switch (*input_options) {
4554 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4555 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4556 break;
4557 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4558 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4559 break;
4560 #if (DEBUG || DEVELOPMENT)
4561 case SO_STATISTICS_EVENT_RESERVED_1:
4562 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4563 break;
4564 case SO_STATISTICS_EVENT_RESERVED_2:
4565 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4566 break;
4567 #endif /* (DEBUG || DEVELOPMENT) */
4568 default:
4569 error = EINVAL;
4570 break;
4571 }
4572 return error;
4573 }
4574
4575 /*
4576 * Returns: 0 Success
4577 * EINVAL
4578 * ENOTCONN
4579 * <pru_shutdown>:EINVAL
4580 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4581 * <pru_shutdown>:ENOBUFS[TCP]
4582 * <pru_shutdown>:EMSGSIZE[TCP]
4583 * <pru_shutdown>:EHOSTUNREACH[TCP]
4584 * <pru_shutdown>:ENETUNREACH[TCP]
4585 * <pru_shutdown>:ENETDOWN[TCP]
4586 * <pru_shutdown>:ENOMEM[TCP]
4587 * <pru_shutdown>:EACCES[TCP]
4588 * <pru_shutdown>:EMSGSIZE[TCP]
4589 * <pru_shutdown>:ENOBUFS[TCP]
4590 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4591 * <pru_shutdown>:??? [other protocol families]
4592 */
4593 int
4594 soshutdown(struct socket *so, int how)
4595 {
4596 int error;
4597
4598 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4599
4600 switch (how) {
4601 case SHUT_RD:
4602 case SHUT_WR:
4603 case SHUT_RDWR:
4604 socket_lock(so, 1);
4605 if ((so->so_state &
4606 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4607 error = ENOTCONN;
4608 } else {
4609 error = soshutdownlock(so, how);
4610 }
4611 socket_unlock(so, 1);
4612 break;
4613 default:
4614 error = EINVAL;
4615 break;
4616 }
4617
4618 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4619
4620 return error;
4621 }
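/*
 * [Editor's sketch -- not part of the original source.] Userspace view
 * of the SHUT_* handling above: a half-close stops the send side while
 * reads continue, and shutting down an unconnected socket fails with
 * ENOTCONN, matching the connected-state check in soshutdown().
 */
#if 0	/* illustration only; userspace code */
#include <sys/socket.h>
#include <stdio.h>

static void
example_half_close(int fd)
{
	/* Stop sending; the peer sees EOF but we can still read. */
	if (shutdown(fd, SHUT_WR) == -1) {
		perror("shutdown(SHUT_WR)");
	}
}
#endif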
4622
4623 int
4624 soshutdownlock_final(struct socket *so, int how)
4625 {
4626 struct protosw *pr = so->so_proto;
4627 int error = 0;
4628
4629 sflt_notify(so, sock_evt_shutdown, &how);
4630
4631 if (how != SHUT_WR) {
4632 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4633 /* read already shut down */
4634 error = ENOTCONN;
4635 goto done;
4636 }
4637 sorflush(so);
4638 postevent(so, 0, EV_RCLOSED);
4639 }
4640 if (how != SHUT_RD) {
4641 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4642 /* write already shut down */
4643 error = ENOTCONN;
4644 goto done;
4645 }
4646 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4647 postevent(so, 0, EV_WCLOSED);
4648 }
4649 done:
4650 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4651 return error;
4652 }
4653
4654 int
4655 soshutdownlock(struct socket *so, int how)
4656 {
4657 int error = 0;
4658
4659 #if CONTENT_FILTER
4660 /*
4661 * A content filter may delay the actual shutdown until it
4662 * has processed the pending data
4663 */
4664 if (so->so_flags & SOF_CONTENT_FILTER) {
4665 error = cfil_sock_shutdown(so, &how);
4666 if (error == EJUSTRETURN) {
4667 error = 0;
4668 goto done;
4669 } else if (error != 0) {
4670 goto done;
4671 }
4672 }
4673 #endif /* CONTENT_FILTER */
4674
4675 error = soshutdownlock_final(so, how);
4676
4677 done:
4678 return error;
4679 }
4680
4681 void
4682 sowflush(struct socket *so)
4683 {
4684 struct sockbuf *sb = &so->so_snd;
4685
4686 /*
4687 * Obtain lock on the socket buffer (SB_LOCK). This is required
4688 * to prevent the socket buffer from being unexpectedly altered
4689 * while it is used by another thread in socket send/receive.
4690 *
4691 * sblock() must not fail here, hence the assertion.
4692 */
4693 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4694 VERIFY(sb->sb_flags & SB_LOCK);
4695
4696 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4697 sb->sb_flags |= SB_DROP;
4698 sb->sb_upcall = NULL;
4699 sb->sb_upcallarg = NULL;
4700
4701 sbunlock(sb, TRUE); /* keep socket locked */
4702
4703 selthreadclear(&sb->sb_sel);
4704 sbrelease(sb);
4705 }
4706
4707 void
4708 sorflush(struct socket *so)
4709 {
4710 struct sockbuf *sb = &so->so_rcv;
4711 struct protosw *pr = so->so_proto;
4712 struct sockbuf asb;
4713 #ifdef notyet
4714 lck_mtx_t *mutex_held;
4715 /*
4716 * XXX: This code is currently commented out, because we may get here
4717 * as part of sofreelastref(), and at that time, pr_getlock() may no
4718 * longer be able to return us the lock; this will be fixed in future.
4719 */
4720 if (so->so_proto->pr_getlock != NULL) {
4721 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4722 } else {
4723 mutex_held = so->so_proto->pr_domain->dom_mtx;
4724 }
4725
4726 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4727 #endif /* notyet */
4728
4729 sflt_notify(so, sock_evt_flush_read, NULL);
4730
4731 socantrcvmore(so);
4732
4733 /*
4734 * Obtain lock on the socket buffer (SB_LOCK). This is required
4735 * to prevent the socket buffer from being unexpectedly altered
4736 * while it is used by another thread in socket send/receive.
4737 *
4738 * sblock() must not fail here, hence the assertion.
4739 */
4740 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4741 VERIFY(sb->sb_flags & SB_LOCK);
4742
4743 /*
4744 * Copy only the relevant fields from "sb" to "asb" which we
4745 * need for sbrelease() to function. In particular, skip
4746 * sb_sel as it contains the wait queue linkage, which would
4747 * wreak havoc if we were to issue selthreadclear() on "asb".
4748 * Make sure to not carry over SB_LOCK in "asb", as we need
4749 * to acquire it later as part of sbrelease().
4750 */
4751 bzero(&asb, sizeof(asb));
4752 asb.sb_cc = sb->sb_cc;
4753 asb.sb_hiwat = sb->sb_hiwat;
4754 asb.sb_mbcnt = sb->sb_mbcnt;
4755 asb.sb_mbmax = sb->sb_mbmax;
4756 asb.sb_ctl = sb->sb_ctl;
4757 asb.sb_lowat = sb->sb_lowat;
4758 asb.sb_mb = sb->sb_mb;
4759 asb.sb_mbtail = sb->sb_mbtail;
4760 asb.sb_lastrecord = sb->sb_lastrecord;
4761 asb.sb_so = sb->sb_so;
4762 asb.sb_flags = sb->sb_flags;
4763 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4764 asb.sb_flags |= SB_DROP;
4765
4766 /*
4767 * Ideally we'd bzero() these and preserve the ones we need;
4768 * but to do that we'd need to shuffle things around in the
4769 * sockbuf, and we can't do it now because there are KEXTS
4770 * that are directly referring to the socket structure.
4771 *
4772 * Setting SB_DROP acts as a barrier to prevent further appends.
4773 * Clearing SB_SEL is done for selthreadclear() below.
4774 */
4775 sb->sb_cc = 0;
4776 sb->sb_hiwat = 0;
4777 sb->sb_mbcnt = 0;
4778 sb->sb_mbmax = 0;
4779 sb->sb_ctl = 0;
4780 sb->sb_lowat = 0;
4781 sb->sb_mb = NULL;
4782 sb->sb_mbtail = NULL;
4783 sb->sb_lastrecord = NULL;
4784 sb->sb_timeo.tv_sec = 0;
4785 sb->sb_timeo.tv_usec = 0;
4786 sb->sb_upcall = NULL;
4787 sb->sb_upcallarg = NULL;
4788 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4789 sb->sb_flags |= SB_DROP;
4790
4791 sbunlock(sb, TRUE); /* keep socket locked */
4792
4793 /*
4794 * Note that selthreadclear() is called on the original "sb" and
4795 * not the local "asb" because of the way wait queue linkage is
4796 * implemented. Given that selwakeup() may be triggered, SB_SEL
4797 * should no longer be set (cleared above.)
4798 */
4799 selthreadclear(&sb->sb_sel);
4800
4801 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4802 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4803 }
4804
4805 sbrelease(&asb);
4806 }
4807
4808 /*
4809 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4810 * an additional variant to handle the case where the option value needs
4811 * to be some kind of integer, but not a specific size.
4812 * In addition to their use here, these functions are also called by the
4813 * protocol-level pr_ctloutput() routines.
4814 *
4815 * Returns: 0 Success
4816 * EINVAL
4817 * copyin:EFAULT
4818 */
4819 int
4820 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4821 {
4822 size_t valsize;
4823
4824 /*
4825 * If the user gives us more than we wanted, we ignore it,
4826 * but if we don't get the minimum length the caller
4827 * wants, we return EINVAL. On success, sopt->sopt_valsize
4828 * is set to however much we actually retrieved.
4829 */
4830 if ((valsize = sopt->sopt_valsize) < minlen) {
4831 return EINVAL;
4832 }
4833 if (valsize > len) {
4834 sopt->sopt_valsize = valsize = len;
4835 }
4836
4837 if (sopt->sopt_p != kernproc) {
4838 return copyin(sopt->sopt_val, buf, valsize);
4839 }
4840
4841 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4842 return 0;
4843 }
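/*
 * [Editor's sketch -- not part of the original source.] The typical
 * caller pattern for sooptcopyin(): a setsockopt handler pulls a
 * fixed-size value out of the sockopt, rejecting short user buffers
 * (EINVAL) and silently truncating long ones, exactly as described
 * above. example_set_bool_opt() and SOF_EXAMPLE_FLAG are hypothetical.
 */
#if 0	/* illustration only */
static int
example_set_bool_opt(struct socket *so, struct sockopt *sopt)
{
	int optval, error;

	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error != 0) {
		return error;
	}
	if (optval != 0) {
		so->so_flags |= SOF_EXAMPLE_FLAG;
	} else {
		so->so_flags &= ~SOF_EXAMPLE_FLAG;
	}
	return 0;
}
#endif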
4844
4845 /*
4846 * sooptcopyin_timeval
4847 * Copy in a timeval value into tv_p, taking into account whether
4848 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4849 * code here so that we can verify the 64-bit tv_sec value before we lose
4850 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4851 */
4852 static int
4853 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4854 {
4855 int error;
4856
4857 if (proc_is64bit(sopt->sopt_p)) {
4858 struct user64_timeval tv64;
4859
4860 if (sopt->sopt_valsize < sizeof(tv64)) {
4861 return EINVAL;
4862 }
4863
4864 sopt->sopt_valsize = sizeof(tv64);
4865 if (sopt->sopt_p != kernproc) {
4866 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4867 if (error != 0) {
4868 return error;
4869 }
4870 } else {
4871 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4872 sizeof(tv64));
4873 }
4874 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4875 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4876 return EDOM;
4877 }
4878
4879 tv_p->tv_sec = tv64.tv_sec;
4880 tv_p->tv_usec = tv64.tv_usec;
4881 } else {
4882 struct user32_timeval tv32;
4883
4884 if (sopt->sopt_valsize < sizeof(tv32)) {
4885 return EINVAL;
4886 }
4887
4888 sopt->sopt_valsize = sizeof(tv32);
4889 if (sopt->sopt_p != kernproc) {
4890 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4891 if (error != 0) {
4892 return error;
4893 }
4894 } else {
4895 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4896 sizeof(tv32));
4897 }
4898 #ifndef __LP64__
4899 /*
4900 * K64todo "comparison is always false due to
4901 * limited range of data type"
4902 */
4903 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4904 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4905 return EDOM;
4906 }
4907 #endif
4908 tv_p->tv_sec = tv32.tv_sec;
4909 tv_p->tv_usec = tv32.tv_usec;
4910 }
4911 return 0;
4912 }
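/*
 * [Editor's sketch -- not part of the original source.] The value
 * parsed above arrives via setsockopt(2); the separate user64/user32
 * paths exist because struct timeval has a different layout in LP64
 * and ILP32 processes. A tv_usec outside [0, 1000000) is rejected
 * with EDOM, per the range check above.
 */
#if 0	/* illustration only; userspace code */
#include <sys/socket.h>
#include <sys/time.h>
#include <stdio.h>

static void
example_set_recv_timeout(int fd)
{
	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };

	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1) {
		perror("setsockopt(SO_RCVTIMEO)");
	}
}
#endif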
4913
4914 int
4915 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4916 boolean_t ignore_delegate)
4917 {
4918 kauth_cred_t cred = NULL;
4919 proc_t ep = PROC_NULL;
4920 uid_t uid;
4921 int error = 0;
4922
4923 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4924 ep = proc_find(so->e_pid);
4925 if (ep) {
4926 cred = kauth_cred_proc_ref(ep);
4927 }
4928 }
4929
4930 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4931
4932 /* uid is 0 for root */
4933 if (uid != 0 || !allow_root) {
4934 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4935 }
4936 if (cred) {
4937 kauth_cred_unref(&cred);
4938 }
4939 if (ep != PROC_NULL) {
4940 proc_rele(ep);
4941 }
4942
4943 return error;
4944 }
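/*
 * [Editor's sketch -- not part of the original source.] Privileged
 * socket options gate on soopt_cred_check() against the socket owner's
 * credential (or the delegated process, unless ignore_delegate is set);
 * root is exempted only when allow_root is true. The
 * SO_AWDL_UNRESTRICTED handler further below follows this pattern:
 */
#if 0	/* illustration only */
	if (optval != 0) {
		error = soopt_cred_check(so, PRIV_NET_RESTRICTED_AWDL,
		    false, false);
		if (error == 0) {
			inp_set_awdl_unrestricted(sotoinpcb(so));
		}
	}
#endif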
4945
4946 /*
4947 * Returns: 0 Success
4948 * EINVAL
4949 * ENOPROTOOPT
4950 * ENOBUFS
4951 * EDOM
4952 * sooptcopyin:EINVAL
4953 * sooptcopyin:EFAULT
4954 * sooptcopyin_timeval:EINVAL
4955 * sooptcopyin_timeval:EFAULT
4956 * sooptcopyin_timeval:EDOM
4957 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4958 * <pr_ctloutput>:???w
4959 * sflt_attach_private:??? [whatever a filter author chooses]
4960 * <sf_setoption>:??? [whatever a filter author chooses]
4961 *
4962 * Notes: Other <pr_ctloutput> returns depend on the protocol family;
4963 * all <sf_setoption> returns depend on what the filter author
4964 * causes their filter to return.
4965 */
4966 int
4967 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4968 {
4969 int error, optval;
4970 int64_t long_optval;
4971 struct linger l;
4972 struct timeval tv;
4973 #if CONFIG_MACF_SOCKET
4974 struct mac extmac;
4975 #endif /* MAC_SOCKET */
4976
4977 if (sopt->sopt_dir != SOPT_SET) {
4978 sopt->sopt_dir = SOPT_SET;
4979 }
4980
4981 if (dolock) {
4982 socket_lock(so, 1);
4983 }
4984
4985 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4986 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4987 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4988 /* the socket has been shutdown, no more sockopt's */
4989 error = EINVAL;
4990 goto out;
4991 }
4992
4993 error = sflt_setsockopt(so, sopt);
4994 if (error != 0) {
4995 if (error == EJUSTRETURN) {
4996 error = 0;
4997 }
4998 goto out;
4999 }
5000
5001 if (sopt->sopt_level != SOL_SOCKET) {
5002 if (so->so_proto != NULL &&
5003 so->so_proto->pr_ctloutput != NULL) {
5004 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5005 goto out;
5006 }
5007 error = ENOPROTOOPT;
5008 } else {
5009 /*
5010 * Allow socket-level (SOL_SOCKET) options to be filtered by
5011 * the protocol layer, if needed. A zero value returned from
5012 * the handler means use default socket-level processing as
5013 * done by the rest of this routine. Otherwise, any other
5014 * return value indicates that the option is unsupported.
5015 */
5016 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5017 pru_socheckopt(so, sopt)) != 0) {
5018 goto out;
5019 }
5020
5021 error = 0;
5022 switch (sopt->sopt_name) {
5023 case SO_LINGER:
5024 case SO_LINGER_SEC:
5025 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5026 if (error != 0) {
5027 goto out;
5028 }
5029
5030 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5031 l.l_linger : l.l_linger * hz;
5032 if (l.l_onoff != 0) {
5033 so->so_options |= SO_LINGER;
5034 } else {
5035 so->so_options &= ~SO_LINGER;
5036 }
5037 break;
5038
5039 case SO_DEBUG:
5040 case SO_KEEPALIVE:
5041 case SO_DONTROUTE:
5042 case SO_USELOOPBACK:
5043 case SO_BROADCAST:
5044 case SO_REUSEADDR:
5045 case SO_REUSEPORT:
5046 case SO_OOBINLINE:
5047 case SO_TIMESTAMP:
5048 case SO_TIMESTAMP_MONOTONIC:
5049 case SO_TIMESTAMP_CONTINUOUS:
5050 case SO_DONTTRUNC:
5051 case SO_WANTMORE:
5052 case SO_WANTOOBFLAG:
5053 case SO_NOWAKEFROMSLEEP:
5054 case SO_NOAPNFALLBK:
5055 error = sooptcopyin(sopt, &optval, sizeof(optval),
5056 sizeof(optval));
5057 if (error != 0) {
5058 goto out;
5059 }
5060 if (optval) {
5061 so->so_options |= sopt->sopt_name;
5062 } else {
5063 so->so_options &= ~sopt->sopt_name;
5064 }
5065 break;
5066
5067 case SO_SNDBUF:
5068 case SO_RCVBUF:
5069 case SO_SNDLOWAT:
5070 case SO_RCVLOWAT:
5071 error = sooptcopyin(sopt, &optval, sizeof(optval),
5072 sizeof(optval));
5073 if (error != 0) {
5074 goto out;
5075 }
5076
5077 /*
5078 * Values < 1 make no sense for any of these
5079 * options, so disallow them.
5080 */
5081 if (optval < 1) {
5082 error = EINVAL;
5083 goto out;
5084 }
5085
5086 switch (sopt->sopt_name) {
5087 case SO_SNDBUF:
5088 case SO_RCVBUF: {
5089 struct sockbuf *sb =
5090 (sopt->sopt_name == SO_SNDBUF) ?
5091 &so->so_snd : &so->so_rcv;
5092 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5093 error = ENOBUFS;
5094 goto out;
5095 }
5096 sb->sb_flags |= SB_USRSIZE;
5097 sb->sb_flags &= ~SB_AUTOSIZE;
5098 sb->sb_idealsize = (u_int32_t)optval;
5099 break;
5100 }
5101 /*
5102 * Make sure the low-water is never greater than
5103 * the high-water.
5104 */
5105 case SO_SNDLOWAT: {
5106 int space = sbspace(&so->so_snd);
5107 u_int32_t hiwat = so->so_snd.sb_hiwat;
5108
5109 if (so->so_snd.sb_flags & SB_UNIX) {
5110 struct unpcb *unp =
5111 (struct unpcb *)(so->so_pcb);
5112 if (unp != NULL &&
5113 unp->unp_conn != NULL) {
5114 hiwat += unp->unp_conn->unp_cc;
5115 }
5116 }
5117
5118 so->so_snd.sb_lowat =
5119 (optval > hiwat) ?
5120 hiwat : optval;
5121
5122 if (space >= so->so_snd.sb_lowat) {
5123 sowwakeup(so);
5124 }
5125 break;
5126 }
5127 case SO_RCVLOWAT: {
5128 int64_t data_len;
5129 so->so_rcv.sb_lowat =
5130 (optval > so->so_rcv.sb_hiwat) ?
5131 so->so_rcv.sb_hiwat : optval;
5132 data_len = so->so_rcv.sb_cc
5133 - so->so_rcv.sb_ctl;
5134 if (data_len >= so->so_rcv.sb_lowat) {
5135 sorwakeup(so);
5136 }
5137 break;
5138 }
5139 }
5140 break;
5141
5142 case SO_SNDTIMEO:
5143 case SO_RCVTIMEO:
5144 error = sooptcopyin_timeval(sopt, &tv);
5145 if (error != 0) {
5146 goto out;
5147 }
5148
5149 switch (sopt->sopt_name) {
5150 case SO_SNDTIMEO:
5151 so->so_snd.sb_timeo = tv;
5152 break;
5153 case SO_RCVTIMEO:
5154 so->so_rcv.sb_timeo = tv;
5155 break;
5156 }
5157 break;
5158
5159 case SO_NKE: {
5160 struct so_nke nke;
5161
5162 error = sooptcopyin(sopt, &nke, sizeof(nke),
5163 sizeof(nke));
5164 if (error != 0) {
5165 goto out;
5166 }
5167
5168 error = sflt_attach_internal(so, nke.nke_handle);
5169 break;
5170 }
5171
5172 case SO_NOSIGPIPE:
5173 error = sooptcopyin(sopt, &optval, sizeof(optval),
5174 sizeof(optval));
5175 if (error != 0) {
5176 goto out;
5177 }
5178 if (optval != 0) {
5179 so->so_flags |= SOF_NOSIGPIPE;
5180 } else {
5181 so->so_flags &= ~SOF_NOSIGPIPE;
5182 }
5183 break;
5184
5185 case SO_NOADDRERR:
5186 error = sooptcopyin(sopt, &optval, sizeof(optval),
5187 sizeof(optval));
5188 if (error != 0) {
5189 goto out;
5190 }
5191 if (optval != 0) {
5192 so->so_flags |= SOF_NOADDRAVAIL;
5193 } else {
5194 so->so_flags &= ~SOF_NOADDRAVAIL;
5195 }
5196 break;
5197
5198 case SO_REUSESHAREUID:
5199 error = sooptcopyin(sopt, &optval, sizeof(optval),
5200 sizeof(optval));
5201 if (error != 0) {
5202 goto out;
5203 }
5204 if (optval != 0) {
5205 so->so_flags |= SOF_REUSESHAREUID;
5206 } else {
5207 so->so_flags &= ~SOF_REUSESHAREUID;
5208 }
5209 break;
5210
5211 case SO_NOTIFYCONFLICT:
5212 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5213 error = EPERM;
5214 goto out;
5215 }
5216 error = sooptcopyin(sopt, &optval, sizeof(optval),
5217 sizeof(optval));
5218 if (error != 0) {
5219 goto out;
5220 }
5221 if (optval != 0) {
5222 so->so_flags |= SOF_NOTIFYCONFLICT;
5223 } else {
5224 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5225 }
5226 break;
5227
5228 case SO_RESTRICTIONS:
5229 error = sooptcopyin(sopt, &optval, sizeof(optval),
5230 sizeof(optval));
5231 if (error != 0) {
5232 goto out;
5233 }
5234
5235 error = so_set_restrictions(so, optval);
5236 break;
5237
5238 case SO_AWDL_UNRESTRICTED:
5239 if (SOCK_DOM(so) != PF_INET &&
5240 SOCK_DOM(so) != PF_INET6) {
5241 error = EOPNOTSUPP;
5242 goto out;
5243 }
5244 error = sooptcopyin(sopt, &optval, sizeof(optval),
5245 sizeof(optval));
5246 if (error != 0) {
5247 goto out;
5248 }
5249 if (optval != 0) {
5250 error = soopt_cred_check(so,
5251 PRIV_NET_RESTRICTED_AWDL, false, false);
5252 if (error == 0) {
5253 inp_set_awdl_unrestricted(
5254 sotoinpcb(so));
5255 }
5256 } else {
5257 inp_clear_awdl_unrestricted(sotoinpcb(so));
5258 }
5259 break;
5260 case SO_INTCOPROC_ALLOW:
5261 if (SOCK_DOM(so) != PF_INET6) {
5262 error = EOPNOTSUPP;
5263 goto out;
5264 }
5265 error = sooptcopyin(sopt, &optval, sizeof(optval),
5266 sizeof(optval));
5267 if (error != 0) {
5268 goto out;
5269 }
5270 if (optval != 0 &&
5271 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5272 error = soopt_cred_check(so,
5273 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5274 if (error == 0) {
5275 inp_set_intcoproc_allowed(
5276 sotoinpcb(so));
5277 }
5278 } else if (optval == 0) {
5279 inp_clear_intcoproc_allowed(sotoinpcb(so));
5280 }
5281 break;
5282
5283 case SO_LABEL:
5284 #if CONFIG_MACF_SOCKET
5285 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5286 sizeof(extmac))) != 0) {
5287 goto out;
5288 }
5289
5290 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5291 so, &extmac);
5292 #else
5293 error = EOPNOTSUPP;
5294 #endif /* MAC_SOCKET */
5295 break;
5296
5297 case SO_UPCALLCLOSEWAIT:
5298 error = sooptcopyin(sopt, &optval, sizeof(optval),
5299 sizeof(optval));
5300 if (error != 0) {
5301 goto out;
5302 }
5303 if (optval != 0) {
5304 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5305 } else {
5306 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5307 }
5308 break;
5309
5310 case SO_RANDOMPORT:
5311 error = sooptcopyin(sopt, &optval, sizeof(optval),
5312 sizeof(optval));
5313 if (error != 0) {
5314 goto out;
5315 }
5316 if (optval != 0) {
5317 so->so_flags |= SOF_BINDRANDOMPORT;
5318 } else {
5319 so->so_flags &= ~SOF_BINDRANDOMPORT;
5320 }
5321 break;
5322
5323 case SO_NP_EXTENSIONS: {
5324 struct so_np_extensions sonpx;
5325
5326 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5327 sizeof(sonpx));
5328 if (error != 0) {
5329 goto out;
5330 }
5331 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5332 error = EINVAL;
5333 goto out;
5334 }
5335 /*
5336 * Only one bit defined for now
5337 */
5338 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5339 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5340 so->so_flags |= SOF_NPX_SETOPTSHUT;
5341 } else {
5342 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5343 }
5344 }
5345 break;
5346 }
5347
5348 case SO_TRAFFIC_CLASS: {
5349 error = sooptcopyin(sopt, &optval, sizeof(optval),
5350 sizeof(optval));
5351 if (error != 0) {
5352 goto out;
5353 }
5354 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5355 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5356 error = so_set_net_service_type(so, netsvc);
5357 goto out;
5358 }
5359 error = so_set_traffic_class(so, optval);
5360 if (error != 0) {
5361 goto out;
5362 }
5363 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5364 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5365 break;
5366 }
5367
5368 case SO_RECV_TRAFFIC_CLASS: {
5369 error = sooptcopyin(sopt, &optval, sizeof(optval),
5370 sizeof(optval));
5371 if (error != 0) {
5372 goto out;
5373 }
5374 if (optval == 0) {
5375 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5376 } else {
5377 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5378 }
5379 break;
5380 }
5381
5382 #if (DEVELOPMENT || DEBUG)
5383 case SO_TRAFFIC_CLASS_DBG: {
5384 struct so_tcdbg so_tcdbg;
5385
5386 error = sooptcopyin(sopt, &so_tcdbg,
5387 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5388 if (error != 0) {
5389 goto out;
5390 }
5391 error = so_set_tcdbg(so, &so_tcdbg);
5392 if (error != 0) {
5393 goto out;
5394 }
5395 break;
5396 }
5397 #endif /* (DEVELOPMENT || DEBUG) */
5398
5399 case SO_PRIVILEGED_TRAFFIC_CLASS:
5400 error = priv_check_cred(kauth_cred_get(),
5401 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5402 if (error != 0) {
5403 goto out;
5404 }
5405 error = sooptcopyin(sopt, &optval, sizeof(optval),
5406 sizeof(optval));
5407 if (error != 0) {
5408 goto out;
5409 }
5410 if (optval == 0) {
5411 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5412 } else {
5413 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5414 }
5415 break;
5416
5417 #if (DEVELOPMENT || DEBUG)
5418 case SO_DEFUNCTIT:
5419 error = sosetdefunct(current_proc(), so, 0, FALSE);
5420 if (error == 0) {
5421 error = sodefunct(current_proc(), so, 0);
5422 }
5423
5424 break;
5425 #endif /* (DEVELOPMENT || DEBUG) */
5426
5427 case SO_DEFUNCTOK:
5428 error = sooptcopyin(sopt, &optval, sizeof(optval),
5429 sizeof(optval));
5430 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5431 if (error == 0) {
5432 error = EBADF;
5433 }
5434 goto out;
5435 }
5436 /*
5437 * Any process can set SO_DEFUNCTOK (clear
5438 * SOF_NODEFUNCT), but only root can clear
5439 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5440 */
5441 if (optval == 0 &&
5442 kauth_cred_issuser(kauth_cred_get()) == 0) {
5443 error = EPERM;
5444 goto out;
5445 }
5446 if (optval) {
5447 so->so_flags &= ~SOF_NODEFUNCT;
5448 } else {
5449 so->so_flags |= SOF_NODEFUNCT;
5450 }
5451
5452 if (SOCK_DOM(so) == PF_INET ||
5453 SOCK_DOM(so) == PF_INET6) {
5454 char s[MAX_IPv6_STR_LEN];
5455 char d[MAX_IPv6_STR_LEN];
5456 struct inpcb *inp = sotoinpcb(so);
5457
5458 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5459 "[%s %s:%d -> %s:%d] is now marked "
5460 "as %seligible for "
5461 "defunct\n", __func__, proc_selfpid(),
5462 proc_best_name(current_proc()),
5463 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5464 (SOCK_TYPE(so) == SOCK_STREAM) ?
5465 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5466 ((SOCK_DOM(so) == PF_INET) ?
5467 (void *)&inp->inp_laddr.s_addr :
5468 (void *)&inp->in6p_laddr), s, sizeof(s)),
5469 ntohs(inp->in6p_lport),
5470 inet_ntop(SOCK_DOM(so),
5471 (SOCK_DOM(so) == PF_INET) ?
5472 (void *)&inp->inp_faddr.s_addr :
5473 (void *)&inp->in6p_faddr, d, sizeof(d)),
5474 ntohs(inp->in6p_fport),
5475 (so->so_flags & SOF_NODEFUNCT) ?
5476 "not " : "");
5477 } else {
5478 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5479 "is now marked as %seligible for "
5480 "defunct\n",
5481 __func__, proc_selfpid(),
5482 proc_best_name(current_proc()),
5483 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5484 SOCK_DOM(so), SOCK_TYPE(so),
5485 (so->so_flags & SOF_NODEFUNCT) ?
5486 "not " : "");
5487 }
5488 break;
5489
5490 case SO_ISDEFUNCT:
5491 /* This option is not settable */
5492 error = EINVAL;
5493 break;
5494
5495 case SO_OPPORTUNISTIC:
5496 error = sooptcopyin(sopt, &optval, sizeof(optval),
5497 sizeof(optval));
5498 if (error == 0) {
5499 error = so_set_opportunistic(so, optval);
5500 }
5501 break;
5502
5503 case SO_FLUSH:
5504 /* This option is handled by lower layer(s) */
5505 error = 0;
5506 break;
5507
5508 case SO_RECV_ANYIF:
5509 error = sooptcopyin(sopt, &optval, sizeof(optval),
5510 sizeof(optval));
5511 if (error == 0) {
5512 error = so_set_recv_anyif(so, optval);
5513 }
5514 break;
5515
5516 case SO_TRAFFIC_MGT_BACKGROUND: {
5517 /* This option is handled by lower layer(s) */
5518 error = 0;
5519 break;
5520 }
5521
5522 #if FLOW_DIVERT
5523 case SO_FLOW_DIVERT_TOKEN:
5524 error = flow_divert_token_set(so, sopt);
5525 break;
5526 #endif /* FLOW_DIVERT */
5527
5528
5529 case SO_DELEGATED:
5530 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5531 sizeof(optval))) != 0) {
5532 break;
5533 }
5534
5535 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5536 break;
5537
5538 case SO_DELEGATED_UUID: {
5539 uuid_t euuid;
5540
5541 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5542 sizeof(euuid))) != 0) {
5543 break;
5544 }
5545
5546 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5547 break;
5548 }
5549
5550 #if NECP
5551 case SO_NECP_ATTRIBUTES:
5552 error = necp_set_socket_attributes(so, sopt);
5553 break;
5554
5555 case SO_NECP_CLIENTUUID: {
5556 if (SOCK_DOM(so) == PF_MULTIPATH) {
5557 /* Handled by MPTCP itself */
5558 break;
5559 }
5560
5561 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5562 error = EINVAL;
5563 goto out;
5564 }
5565
5566 struct inpcb *inp = sotoinpcb(so);
5567 if (!uuid_is_null(inp->necp_client_uuid)) {
5568 // Clear out the old client UUID if present
5569 necp_inpcb_remove_cb(inp);
5570 }
5571
5572 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5573 sizeof(uuid_t), sizeof(uuid_t));
5574 if (error != 0) {
5575 goto out;
5576 }
5577
5578 if (uuid_is_null(inp->necp_client_uuid)) {
5579 error = EINVAL;
5580 goto out;
5581 }
5582
5583 pid_t current_pid = proc_pid(current_proc());
5584 error = necp_client_register_socket_flow(current_pid,
5585 inp->necp_client_uuid, inp);
5586 if (error != 0) {
5587 uuid_clear(inp->necp_client_uuid);
5588 goto out;
5589 }
5590
5591 if (inp->inp_lport != 0) {
5592 // There is a bound local port, so this is not
5593 // a fresh socket. Assign to the client.
5594 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5595 }
5596
5597 break;
5598 }
5599 case SO_NECP_LISTENUUID: {
5600 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5601 error = EINVAL;
5602 goto out;
5603 }
5604
5605 struct inpcb *inp = sotoinpcb(so);
5606 if (!uuid_is_null(inp->necp_client_uuid)) {
5607 error = EINVAL;
5608 goto out;
5609 }
5610
5611 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5612 sizeof(uuid_t), sizeof(uuid_t));
5613 if (error != 0) {
5614 goto out;
5615 }
5616
5617 if (uuid_is_null(inp->necp_client_uuid)) {
5618 error = EINVAL;
5619 goto out;
5620 }
5621
5622 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5623 inp->necp_client_uuid, inp);
5624 if (error != 0) {
5625 uuid_clear(inp->necp_client_uuid);
5626 goto out;
5627 }
5628
5629 // Mark that the port registration is held by NECP
5630 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5631
5632 break;
5633 }
5634 #endif /* NECP */
5635
5636 case SO_EXTENDED_BK_IDLE:
5637 error = sooptcopyin(sopt, &optval, sizeof(optval),
5638 sizeof(optval));
5639 if (error == 0) {
5640 error = so_set_extended_bk_idle(so, optval);
5641 }
5642 break;
5643
5644 case SO_MARK_CELLFALLBACK:
5645 error = sooptcopyin(sopt, &optval, sizeof(optval),
5646 sizeof(optval));
5647 if (error != 0) {
5648 goto out;
5649 }
5650 if (optval < 0) {
5651 error = EINVAL;
5652 goto out;
5653 }
5654 if (optval == 0) {
5655 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5656 } else {
5657 so->so_flags1 |= SOF1_CELLFALLBACK;
5658 }
5659 break;
5660
5661 case SO_STATISTICS_EVENT:
5662 error = sooptcopyin(sopt, &long_optval,
5663 sizeof(long_optval), sizeof(long_optval));
5664 if (error != 0) {
5665 goto out;
5666 }
5667 u_int64_t nstat_event = 0;
5668 error = so_statistics_event_to_nstat_event(
5669 &long_optval, &nstat_event);
5670 if (error != 0) {
5671 goto out;
5672 }
5673 nstat_pcb_event(sotoinpcb(so), nstat_event);
5674 break;
5675
5676 case SO_NET_SERVICE_TYPE: {
5677 error = sooptcopyin(sopt, &optval, sizeof(optval),
5678 sizeof(optval));
5679 if (error != 0) {
5680 goto out;
5681 }
5682 error = so_set_net_service_type(so, optval);
5683 break;
5684 }
5685
5686 case SO_QOSMARKING_POLICY_OVERRIDE:
5687 error = priv_check_cred(kauth_cred_get(),
5688 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5689 if (error != 0) {
5690 goto out;
5691 }
5692 error = sooptcopyin(sopt, &optval, sizeof(optval),
5693 sizeof(optval));
5694 if (error != 0) {
5695 goto out;
5696 }
5697 if (optval == 0) {
5698 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5699 } else {
5700 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5701 }
5702 break;
5703
5704 case SO_MPKL_SEND_INFO: {
5705 struct so_mpkl_send_info so_mpkl_send_info;
5706
5707 error = sooptcopyin(sopt, &so_mpkl_send_info,
5708 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5709 if (error != 0) {
5710 goto out;
5711 }
5712 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5713 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5714
5715 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5716 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5717 } else {
5718 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5719 }
5720 break;
5721 }
5722 default:
5723 error = ENOPROTOOPT;
5724 break;
5725 }
5726 if (error == 0 && so->so_proto != NULL &&
5727 so->so_proto->pr_ctloutput != NULL) {
5728 (void) so->so_proto->pr_ctloutput(so, sopt);
5729 }
5730 }
5731 out:
5732 if (dolock) {
5733 socket_unlock(so, 1);
5734 }
5735 return error;
5736 }
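/*
 * [Editor's sketch -- not part of the original source.] Userspace view
 * of two of the SOL_SOCKET options handled above. Per the code,
 * SO_LINGER stores l_linger as given while the Apple-specific
 * SO_LINGER_SEC scales it by hz; SO_NOSIGPIPE takes the generic
 * int-valued path and sets SOF_NOSIGPIPE.
 */
#if 0	/* illustration only; userspace code */
#include <sys/socket.h>
#include <stdio.h>

static void
example_set_options(int fd)
{
	struct linger l = { .l_onoff = 1, .l_linger = 5 };
	int on = 1;

	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l)) == -1) {
		perror("setsockopt(SO_LINGER)");
	}
	if (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof(on)) == -1) {
		perror("setsockopt(SO_NOSIGPIPE)");
	}
}
#endif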
5737
5738 /* Helper routines for getsockopt */
5739 int
5740 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5741 {
5742 int error;
5743 size_t valsize;
5744
5745 error = 0;
5746
5747 /*
5748 * Documented get behavior is that we always return a value,
5749 * possibly truncated to fit in the user's buffer.
5750 * Traditional behavior is that we always tell the user
5751 * precisely how much we copied, rather than something useful
5752 * like the total amount we had available for her.
5753 * Note that this interface is not idempotent; the entire answer must
5754 * be generated ahead of time.
5755 */
5756 valsize = min(len, sopt->sopt_valsize);
5757 sopt->sopt_valsize = valsize;
5758 if (sopt->sopt_val != USER_ADDR_NULL) {
5759 if (sopt->sopt_p != kernproc) {
5760 error = copyout(buf, sopt->sopt_val, valsize);
5761 } else {
5762 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5763 }
5764 }
5765 return error;
5766 }
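/*
 * [Editor's sketch -- not part of the original source.] The getsockopt
 * side mirrors sooptcopyin(): a handler builds the answer in a local,
 * then lets sooptcopyout() truncate it to the user's buffer and record
 * the copied size in sopt->sopt_valsize. example_get_bool_opt() is a
 * hypothetical helper.
 */
#if 0	/* illustration only */
static int
example_get_bool_opt(struct socket *so, struct sockopt *sopt)
{
	int optval = (so->so_options & SO_KEEPALIVE) ? 1 : 0;

	return sooptcopyout(sopt, &optval, sizeof(optval));
}
#endif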
5767
5768 static int
5769 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5770 {
5771 int error;
5772 size_t len;
5773 struct user64_timeval tv64 = {};
5774 struct user32_timeval tv32 = {};
5775 const void * val;
5776 size_t valsize;
5777
5778 error = 0;
5779 if (proc_is64bit(sopt->sopt_p)) {
5780 len = sizeof(tv64);
5781 tv64.tv_sec = tv_p->tv_sec;
5782 tv64.tv_usec = tv_p->tv_usec;
5783 val = &tv64;
5784 } else {
5785 len = sizeof(tv32);
5786 tv32.tv_sec = tv_p->tv_sec;
5787 tv32.tv_usec = tv_p->tv_usec;
5788 val = &tv32;
5789 }
5790 valsize = min(len, sopt->sopt_valsize);
5791 sopt->sopt_valsize = valsize;
5792 if (sopt->sopt_val != USER_ADDR_NULL) {
5793 if (sopt->sopt_p != kernproc) {
5794 error = copyout(val, sopt->sopt_val, valsize);
5795 } else {
5796 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5797 }
5798 }
5799 return error;
5800 }
5801
5802 /*
5803 * Return: 0 Success
5804 * ENOPROTOOPT
5805 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5806 * <pr_ctloutput>:???
5807 * <sf_getoption>:???
5808 */
5809 int
5810 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5811 {
5812 int error, optval;
5813 struct linger l;
5814 struct timeval tv;
5815 #if CONFIG_MACF_SOCKET
5816 struct mac extmac;
5817 #endif /* MAC_SOCKET */
5818
5819 if (sopt->sopt_dir != SOPT_GET) {
5820 sopt->sopt_dir = SOPT_GET;
5821 }
5822
5823 if (dolock) {
5824 socket_lock(so, 1);
5825 }
5826
5827 error = sflt_getsockopt(so, sopt);
5828 if (error != 0) {
5829 if (error == EJUSTRETURN) {
5830 error = 0;
5831 }
5832 goto out;
5833 }
5834
5835 if (sopt->sopt_level != SOL_SOCKET) {
5836 if (so->so_proto != NULL &&
5837 so->so_proto->pr_ctloutput != NULL) {
5838 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5839 goto out;
5840 }
5841 error = ENOPROTOOPT;
5842 } else {
5843 /*
5844 * Allow socket-level (SOL_SOCKET) options to be filtered by
5845 * the protocol layer, if needed. A zero value returned from
5846 * the handler means use default socket-level processing as
5847 * done by the rest of this routine. Otherwise, any other
5848 * return value indicates that the option is unsupported.
5849 */
5850 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5851 pru_socheckopt(so, sopt)) != 0) {
5852 goto out;
5853 }
5854
5855 error = 0;
5856 switch (sopt->sopt_name) {
5857 case SO_LINGER:
5858 case SO_LINGER_SEC:
5859 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5860 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5861 so->so_linger : so->so_linger / hz;
5862 error = sooptcopyout(sopt, &l, sizeof(l));
5863 break;
5864
5865 case SO_USELOOPBACK:
5866 case SO_DONTROUTE:
5867 case SO_DEBUG:
5868 case SO_KEEPALIVE:
5869 case SO_REUSEADDR:
5870 case SO_REUSEPORT:
5871 case SO_BROADCAST:
5872 case SO_OOBINLINE:
5873 case SO_TIMESTAMP:
5874 case SO_TIMESTAMP_MONOTONIC:
5875 case SO_TIMESTAMP_CONTINUOUS:
5876 case SO_DONTTRUNC:
5877 case SO_WANTMORE:
5878 case SO_WANTOOBFLAG:
5879 case SO_NOWAKEFROMSLEEP:
5880 case SO_NOAPNFALLBK:
5881 optval = so->so_options & sopt->sopt_name;
5882 integer:
5883 error = sooptcopyout(sopt, &optval, sizeof(optval));
5884 break;
5885
5886 case SO_TYPE:
5887 optval = so->so_type;
5888 goto integer;
5889
5890 case SO_NREAD:
5891 if (so->so_proto->pr_flags & PR_ATOMIC) {
5892 int pkt_total;
5893 struct mbuf *m1;
5894
5895 pkt_total = 0;
5896 m1 = so->so_rcv.sb_mb;
5897 while (m1 != NULL) {
5898 if (m1->m_type == MT_DATA ||
5899 m1->m_type == MT_HEADER ||
5900 m1->m_type == MT_OOBDATA) {
5901 pkt_total += m1->m_len;
5902 }
5903 m1 = m1->m_next;
5904 }
5905 optval = pkt_total;
5906 } else {
5907 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5908 }
5909 goto integer;
5910
5911 case SO_NUMRCVPKT:
5912 if (so->so_proto->pr_flags & PR_ATOMIC) {
5913 int cnt = 0;
5914 struct mbuf *m1;
5915
5916 m1 = so->so_rcv.sb_mb;
5917 while (m1 != NULL) {
5918 cnt += 1;
5919 m1 = m1->m_nextpkt;
5920 }
5921 optval = cnt;
5922 goto integer;
5923 } else {
5924 error = ENOPROTOOPT;
5925 break;
5926 }
5927
5928 case SO_NWRITE:
5929 optval = so->so_snd.sb_cc;
5930 goto integer;
5931
5932 case SO_ERROR:
5933 optval = so->so_error;
5934 so->so_error = 0;
5935 goto integer;
5936
5937 case SO_SNDBUF: {
5938 u_int32_t hiwat = so->so_snd.sb_hiwat;
5939
5940 if (so->so_snd.sb_flags & SB_UNIX) {
5941 struct unpcb *unp =
5942 (struct unpcb *)(so->so_pcb);
5943 if (unp != NULL && unp->unp_conn != NULL) {
5944 hiwat += unp->unp_conn->unp_cc;
5945 }
5946 }
5947
5948 optval = hiwat;
5949 goto integer;
5950 }
5951 case SO_RCVBUF:
5952 optval = so->so_rcv.sb_hiwat;
5953 goto integer;
5954
5955 case SO_SNDLOWAT:
5956 optval = so->so_snd.sb_lowat;
5957 goto integer;
5958
5959 case SO_RCVLOWAT:
5960 optval = so->so_rcv.sb_lowat;
5961 goto integer;
5962
5963 case SO_SNDTIMEO:
5964 case SO_RCVTIMEO:
5965 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5966 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5967
5968 error = sooptcopyout_timeval(sopt, &tv);
5969 break;
5970
5971 case SO_NOSIGPIPE:
5972 optval = (so->so_flags & SOF_NOSIGPIPE);
5973 goto integer;
5974
5975 case SO_NOADDRERR:
5976 optval = (so->so_flags & SOF_NOADDRAVAIL);
5977 goto integer;
5978
5979 case SO_REUSESHAREUID:
5980 optval = (so->so_flags & SOF_REUSESHAREUID);
5981 goto integer;
5982
5983
5984 case SO_NOTIFYCONFLICT:
5985 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5986 goto integer;
5987
5988 case SO_RESTRICTIONS:
5989 optval = so_get_restrictions(so);
5990 goto integer;
5991
5992 case SO_AWDL_UNRESTRICTED:
5993 if (SOCK_DOM(so) == PF_INET ||
5994 SOCK_DOM(so) == PF_INET6) {
5995 optval = inp_get_awdl_unrestricted(
5996 sotoinpcb(so));
5997 goto integer;
5998 } else {
5999 error = EOPNOTSUPP;
6000 }
6001 break;
6002
6003 case SO_INTCOPROC_ALLOW:
6004 if (SOCK_DOM(so) == PF_INET6) {
6005 optval = inp_get_intcoproc_allowed(
6006 sotoinpcb(so));
6007 goto integer;
6008 } else {
6009 error = EOPNOTSUPP;
6010 }
6011 break;
6012
6013 case SO_LABEL:
6014 #if CONFIG_MACF_SOCKET
6015 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6016 sizeof(extmac))) != 0 ||
6017 (error = mac_socket_label_get(proc_ucred(
6018 sopt->sopt_p), so, &extmac)) != 0) {
6019 break;
6020 }
6021
6022 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6023 #else
6024 error = EOPNOTSUPP;
6025 #endif /* MAC_SOCKET */
6026 break;
6027
6028 case SO_PEERLABEL:
6029 #if CONFIG_MACF_SOCKET
6030 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6031 sizeof(extmac))) != 0 ||
6032 (error = mac_socketpeer_label_get(proc_ucred(
6033 sopt->sopt_p), so, &extmac)) != 0) {
6034 break;
6035 }
6036
6037 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
6038 #else
6039 error = EOPNOTSUPP;
6040 #endif /* MAC_SOCKET */
6041 break;
6042
6043 #ifdef __APPLE_API_PRIVATE
6044 case SO_UPCALLCLOSEWAIT:
6045 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6046 goto integer;
6047 #endif
6048 case SO_RANDOMPORT:
6049 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6050 goto integer;
6051
6052 case SO_NP_EXTENSIONS: {
6053 struct so_np_extensions sonpx = {};
6054
6055 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6056 SONPX_SETOPTSHUT : 0;
6057 sonpx.npx_mask = SONPX_MASK_VALID;
6058
6059 error = sooptcopyout(sopt, &sonpx,
6060 sizeof(struct so_np_extensions));
6061 break;
6062 }
6063
6064 case SO_TRAFFIC_CLASS:
6065 optval = so->so_traffic_class;
6066 goto integer;
6067
6068 case SO_RECV_TRAFFIC_CLASS:
6069 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6070 goto integer;
6071
6072 case SO_TRAFFIC_CLASS_STATS:
6073 error = sooptcopyout(sopt, &so->so_tc_stats,
6074 sizeof(so->so_tc_stats));
6075 break;
6076
6077 #if (DEVELOPMENT || DEBUG)
6078 case SO_TRAFFIC_CLASS_DBG:
6079 error = sogetopt_tcdbg(so, sopt);
6080 break;
6081 #endif /* (DEVELOPMENT || DEBUG) */
6082
6083 case SO_PRIVILEGED_TRAFFIC_CLASS:
6084 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6085 goto integer;
6086
6087 case SO_DEFUNCTOK:
6088 optval = !(so->so_flags & SOF_NODEFUNCT);
6089 goto integer;
6090
6091 case SO_ISDEFUNCT:
6092 optval = (so->so_flags & SOF_DEFUNCT);
6093 goto integer;
6094
6095 case SO_OPPORTUNISTIC:
6096 optval = so_get_opportunistic(so);
6097 goto integer;
6098
6099 case SO_FLUSH:
6100 /* This option is not gettable */
6101 error = EINVAL;
6102 break;
6103
6104 case SO_RECV_ANYIF:
6105 optval = so_get_recv_anyif(so);
6106 goto integer;
6107
6108 case SO_TRAFFIC_MGT_BACKGROUND:
6109 /* This option is handled by lower layer(s) */
6110 if (so->so_proto != NULL &&
6111 so->so_proto->pr_ctloutput != NULL) {
6112 (void) so->so_proto->pr_ctloutput(so, sopt);
6113 }
6114 break;
6115
6116 #if FLOW_DIVERT
6117 case SO_FLOW_DIVERT_TOKEN:
6118 error = flow_divert_token_get(so, sopt);
6119 break;
6120 #endif /* FLOW_DIVERT */
6121
6122 #if NECP
6123 case SO_NECP_ATTRIBUTES:
6124 error = necp_get_socket_attributes(so, sopt);
6125 break;
6126
6127 case SO_NECP_CLIENTUUID: {
6128 uuid_t *ncu;
6129
6130 if (SOCK_DOM(so) == PF_MULTIPATH) {
6131 ncu = &mpsotomppcb(so)->necp_client_uuid;
6132 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6133 ncu = &sotoinpcb(so)->necp_client_uuid;
6134 } else {
6135 error = EINVAL;
6136 goto out;
6137 }
6138
6139 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6140 break;
6141 }
6142
6143 case SO_NECP_LISTENUUID: {
6144 uuid_t *nlu;
6145
6146 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6147 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6148 nlu = &sotoinpcb(so)->necp_client_uuid;
6149 } else {
6150 error = ENOENT;
6151 goto out;
6152 }
6153 } else {
6154 error = EINVAL;
6155 goto out;
6156 }
6157
6158 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6159 break;
6160 }
6161 #endif /* NECP */
6162
6163 #if CONTENT_FILTER
6164 case SO_CFIL_SOCK_ID: {
6165 cfil_sock_id_t sock_id;
6166
6167 sock_id = cfil_sock_id_from_socket(so);
6168
6169 error = sooptcopyout(sopt, &sock_id,
6170 sizeof(cfil_sock_id_t));
6171 break;
6172 }
6173 #endif /* CONTENT_FILTER */
6174
6175 case SO_EXTENDED_BK_IDLE:
6176 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6177 goto integer;
6178 case SO_MARK_CELLFALLBACK:
6179 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6180 ? 1 : 0;
6181 goto integer;
6182 case SO_NET_SERVICE_TYPE: {
6183 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6184 optval = so->so_netsvctype;
6185 } else {
6186 optval = NET_SERVICE_TYPE_BE;
6187 }
6188 goto integer;
6189 }
6190 case SO_NETSVC_MARKING_LEVEL:
6191 optval = so_get_netsvc_marking_level(so);
6192 goto integer;
6193
6194 case SO_MPKL_SEND_INFO: {
6195 struct so_mpkl_send_info so_mpkl_send_info;
6196
6197 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6198 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6199 error = sooptcopyout(sopt, &so_mpkl_send_info,
6200 sizeof(struct so_mpkl_send_info));
6201 break;
6202 }
6203 default:
6204 error = ENOPROTOOPT;
6205 break;
6206 }
6207 }
6208 out:
6209 if (dolock) {
6210 socket_unlock(so, 1);
6211 }
6212 return error;
6213 }
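/*
 * A minimal user-space sketch of one branch of the getsockopt() switch above:
 * SO_NET_SERVICE_TYPE reports NET_SERVICE_TYPE_BE until an explicit service
 * type has been set on the socket.  The descriptor, helper name and printf
 * reporting are assumptions made purely for illustration.
 */
#if 0   /* illustrative sketch only -- not part of the build */
#include <sys/socket.h>
#include <stdio.h>

static void
example_get_net_service_type(int sock_fd)
{
	int svc = 0;
	socklen_t len = sizeof(svc);

	if (getsockopt(sock_fd, SOL_SOCKET, SO_NET_SERVICE_TYPE,
	    &svc, &len) == 0) {
		/* NET_SERVICE_TYPE_BE (best effort) is the default */
		printf("network service type: %d\n", svc);
	}
}
#endif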
6214
6215 /*
6216 * The size limit on our soopt_getm() is different from that on FreeBSD.
6217 * We limit the size of options to MCLBYTES. This will have to change
6218 * if we need to define options that need more space than MCLBYTES.
6219 */
6220 int
6221 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6222 {
6223 struct mbuf *m, *m_prev;
6224 int sopt_size = sopt->sopt_valsize;
6225 int how;
6226
6227 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6228 return EMSGSIZE;
6229 }
6230
6231 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6232 MGET(m, how, MT_DATA);
6233 if (m == NULL) {
6234 return ENOBUFS;
6235 }
6236 if (sopt_size > MLEN) {
6237 MCLGET(m, how);
6238 if ((m->m_flags & M_EXT) == 0) {
6239 m_free(m);
6240 return ENOBUFS;
6241 }
6242 m->m_len = min(MCLBYTES, sopt_size);
6243 } else {
6244 m->m_len = min(MLEN, sopt_size);
6245 }
6246 sopt_size -= m->m_len;
6247 *mp = m;
6248 m_prev = m;
6249
6250 while (sopt_size > 0) {
6251 MGET(m, how, MT_DATA);
6252 if (m == NULL) {
6253 m_freem(*mp);
6254 return ENOBUFS;
6255 }
6256 if (sopt_size > MLEN) {
6257 MCLGET(m, how);
6258 if ((m->m_flags & M_EXT) == 0) {
6259 m_freem(*mp);
6260 m_freem(m);
6261 return ENOBUFS;
6262 }
6263 m->m_len = min(MCLBYTES, sopt_size);
6264 } else {
6265 m->m_len = min(MLEN, sopt_size);
6266 }
6267 sopt_size -= m->m_len;
6268 m_prev->m_next = m;
6269 m_prev = m;
6270 }
6271 return 0;
6272 }
6273
6274 /* copyin sopt data into mbuf chain */
6275 int
6276 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6277 {
6278 struct mbuf *m0 = m;
6279
6280 if (sopt->sopt_val == USER_ADDR_NULL) {
6281 return 0;
6282 }
6283 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6284 if (sopt->sopt_p != kernproc) {
6285 int error;
6286
6287 error = copyin(sopt->sopt_val, mtod(m, char *),
6288 m->m_len);
6289 if (error != 0) {
6290 m_freem(m0);
6291 return error;
6292 }
6293 } else {
6294 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6295 mtod(m, char *), m->m_len);
6296 }
6297 sopt->sopt_valsize -= m->m_len;
6298 sopt->sopt_val += m->m_len;
6299 m = m->m_next;
6300 }
6301 /* the chain should have been allocated with enough space at ip6_sooptmcopyin() */
6302 if (m != NULL) {
6303 panic("soopt_mcopyin");
6304 /* NOTREACHED */
6305 }
6306 return 0;
6307 }
6308
6309 /* copyout mbuf chain data into soopt */
6310 int
6311 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6312 {
6313 struct mbuf *m0 = m;
6314 size_t valsize = 0;
6315
6316 if (sopt->sopt_val == USER_ADDR_NULL) {
6317 return 0;
6318 }
6319 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6320 if (sopt->sopt_p != kernproc) {
6321 int error;
6322
6323 error = copyout(mtod(m, char *), sopt->sopt_val,
6324 m->m_len);
6325 if (error != 0) {
6326 m_freem(m0);
6327 return error;
6328 }
6329 } else {
6330 bcopy(mtod(m, char *),
6331 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6332 }
6333 sopt->sopt_valsize -= m->m_len;
6334 sopt->sopt_val += m->m_len;
6335 valsize += m->m_len;
6336 m = m->m_next;
6337 }
6338 if (m != NULL) {
6339 /* a large enough sopt buffer should have been supplied from user-land */
6340 m_freem(m0);
6341 return EINVAL;
6342 }
6343 sopt->sopt_valsize = valsize;
6344 return 0;
6345 }
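/*
 * A minimal sketch of how soopt_getm() and soopt_mcopyin() are meant to be
 * paired: size-check and allocate an mbuf chain for the option, then copy the
 * caller's option bytes into it.  The wrapper name and ownership comment are
 * assumptions; the real callers live in the protocol ctloutput paths.
 */
#if 0   /* illustrative sketch only -- not part of the build */
static int
example_sopt_to_mbuf(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m = NULL;
	int error;

	/* Rejects options larger than MCLBYTES, allocates a chain to fit */
	error = soopt_getm(sopt, &m);
	if (error != 0) {
		return error;
	}
	/*
	 * Copy the option bytes from the caller into the chain; the chain is
	 * freed by soopt_mcopyin() itself if the copyin fails.
	 */
	error = soopt_mcopyin(sopt, m);
	if (error != 0) {
		return error;
	}
	*mp = m;	/* caller now owns the chain */
	return 0;
}
#endif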
6346
6347 void
6348 sohasoutofband(struct socket *so)
6349 {
6350 if (so->so_pgid < 0) {
6351 gsignal(-so->so_pgid, SIGURG);
6352 } else if (so->so_pgid > 0) {
6353 proc_signal(so->so_pgid, SIGURG);
6354 }
6355 selwakeup(&so->so_rcv.sb_sel);
6356 if (so->so_rcv.sb_flags & SB_KNOTE) {
6357 KNOTE(&so->so_rcv.sb_sel.si_note,
6358 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6359 }
6360 }
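/*
 * A user-space sketch of the other side of sohasoutofband(): a process that
 * has claimed ownership of the socket receives SIGURG (or sees POLLPRI) and
 * can then pull the out-of-band byte with MSG_OOB.  The descriptor and the
 * helper name are assumptions for illustration.
 */
#if 0   /* illustrative sketch only -- not part of the build */
#include <sys/socket.h>
#include <fcntl.h>
#include <unistd.h>

static void
example_receive_oob(int sock_fd)
{
	char oob;

	/* Direct SIGURG for this socket at the calling process */
	(void) fcntl(sock_fd, F_SETOWN, getpid());

	/* After SIGURG (or POLLPRI), read the single out-of-band byte */
	(void) recv(sock_fd, &oob, 1, MSG_OOB);
}
#endif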
6361
6362 int
6363 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6364 {
6365 #pragma unused(cred)
6366 struct proc *p = current_proc();
6367 int revents = 0;
6368
6369 socket_lock(so, 1);
6370 so_update_last_owner_locked(so, PROC_NULL);
6371 so_update_policy(so);
6372
6373 if (events & (POLLIN | POLLRDNORM)) {
6374 if (soreadable(so)) {
6375 revents |= events & (POLLIN | POLLRDNORM);
6376 }
6377 }
6378
6379 if (events & (POLLOUT | POLLWRNORM)) {
6380 if (sowriteable(so)) {
6381 revents |= events & (POLLOUT | POLLWRNORM);
6382 }
6383 }
6384
6385 if (events & (POLLPRI | POLLRDBAND)) {
6386 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6387 revents |= events & (POLLPRI | POLLRDBAND);
6388 }
6389 }
6390
6391 if (revents == 0) {
6392 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6393 /*
6394 * Darwin sets the flag first,
6395 * BSD calls selrecord first
6396 */
6397 so->so_rcv.sb_flags |= SB_SEL;
6398 selrecord(p, &so->so_rcv.sb_sel, wql);
6399 }
6400
6401 if (events & (POLLOUT | POLLWRNORM)) {
6402 /*
6403 * Darwin sets the flag first,
6404 * BSD calls selrecord first
6405 */
6406 so->so_snd.sb_flags |= SB_SEL;
6407 selrecord(p, &so->so_snd.sb_sel, wql);
6408 }
6409 }
6410
6411 socket_unlock(so, 1);
6412 return revents;
6413 }
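/*
 * A user-space view of the event bits sopoll() evaluates above: POLLIN and
 * POLLRDNORM when the socket is readable, POLLOUT and POLLWRNORM when it is
 * writable, and POLLPRI/POLLRDBAND at the out-of-band mark.  The descriptor,
 * timeout and printf reporting are assumptions for illustration.
 */
#if 0   /* illustrative sketch only -- not part of the build */
#include <poll.h>
#include <stdio.h>

static void
example_poll_socket(int sock_fd)
{
	struct pollfd pfd = {
		.fd = sock_fd,
		.events = POLLIN | POLLOUT | POLLPRI,
	};

	if (poll(&pfd, 1, 1000 /* ms */) > 0) {
		if (pfd.revents & POLLIN) {
			printf("readable\n");
		}
		if (pfd.revents & POLLOUT) {
			printf("writable\n");
		}
		if (pfd.revents & POLLPRI) {
			printf("at out-of-band mark\n");
		}
	}
}
#endif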
6414
6415 int
6416 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6417 {
6418 struct socket *so = (struct socket *)fp->f_fglob->fg_data;
6419 int result;
6420
6421 socket_lock(so, 1);
6422 so_update_last_owner_locked(so, PROC_NULL);
6423 so_update_policy(so);
6424
6425 #if CONFIG_MACF_SOCKET
6426 proc_t p = knote_get_kq(kn)->kq_p;
6427 if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
6428 socket_unlock(so, 1);
6429 knote_set_error(kn, EPERM);
6430 return 0;
6431 }
6432 #endif /* CONFIG_MACF_SOCKET */
6433
6434 switch (kn->kn_filter) {
6435 case EVFILT_READ:
6436 kn->kn_filtid = EVFILTID_SOREAD;
6437 break;
6438 case EVFILT_WRITE:
6439 kn->kn_filtid = EVFILTID_SOWRITE;
6440 break;
6441 case EVFILT_SOCK:
6442 kn->kn_filtid = EVFILTID_SCK;
6443 break;
6444 case EVFILT_EXCEPT:
6445 kn->kn_filtid = EVFILTID_SOEXCEPT;
6446 break;
6447 default:
6448 socket_unlock(so, 1);
6449 knote_set_error(kn, EINVAL);
6450 return 0;
6451 }
6452
6453 /*
6454 * call the appropriate sub-filter attach
6455 * with the socket still locked
6456 */
6457 result = knote_fops(kn)->f_attach(kn, kev);
6458
6459 socket_unlock(so, 1);
6460
6461 return result;
6462 }
6463
6464 static int
6465 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6466 {
6467 int retval = 0;
6468 int64_t data = 0;
6469
6470 if (so->so_options & SO_ACCEPTCONN) {
6471 /*
6472 * Radar 6615193: handle the listen case dynamically for the
6473 * kqueue read filter. This allows listen() to be called after
6474 * the kqueue EVFILT_READ filter has been registered.
6475 */
6476
6477 retval = !TAILQ_EMPTY(&so->so_comp);
6478 data = so->so_qlen;
6479 goto out;
6480 }
6481
6482 /* socket isn't a listener */
6483 /*
6484 * NOTE_LOWAT specifies a new low water mark in data, i.e.
6485 * the bytes of protocol data. We therefore exclude any
6486 * control bytes.
6487 */
6488 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6489
6490 if (kn->kn_sfflags & NOTE_OOB) {
6491 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6492 kn->kn_fflags |= NOTE_OOB;
6493 data -= so->so_oobmark;
6494 retval = 1;
6495 goto out;
6496 }
6497 }
6498
6499 if ((so->so_state & SS_CANTRCVMORE)
6500 #if CONTENT_FILTER
6501 && cfil_sock_data_pending(&so->so_rcv) == 0
6502 #endif /* CONTENT_FILTER */
6503 ) {
6504 kn->kn_flags |= EV_EOF;
6505 kn->kn_fflags = so->so_error;
6506 retval = 1;
6507 goto out;
6508 }
6509
6510 if (so->so_error) { /* temporary udp error */
6511 retval = 1;
6512 goto out;
6513 }
6514
6515 int64_t lowwat = so->so_rcv.sb_lowat;
6516 /*
6517 * Ensure that when NOTE_LOWAT is used, the derived
6518 * low water mark is bounded by the receive buffer's
6519 * high and low water marks.
6520 */
6521 if (kn->kn_sfflags & NOTE_LOWAT) {
6522 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6523 lowwat = so->so_rcv.sb_hiwat;
6524 } else if (kn->kn_sdata > lowwat) {
6525 lowwat = kn->kn_sdata;
6526 }
6527 }
6528
6529 retval = (data >= lowwat);
6530
6531 out:
6532 if (retval && kev) {
6533 knote_fill_kevent(kn, kev, data);
6534 }
6535 return retval;
6536 }
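/*
 * A user-space sketch of the knote interface serviced by filt_soread_common():
 * an EVFILT_READ registration with NOTE_LOWAT only fires once at least the
 * requested number of bytes of protocol data are buffered, and the returned
 * data field carries the byte count.  The descriptor and the 512-byte
 * threshold are assumptions for illustration.
 */
#if 0   /* illustrative sketch only -- not part of the build */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stddef.h>

static void
example_kqueue_read_lowat(int sock_fd)
{
	int kq = kqueue();
	struct kevent change, event;

	/* Fire only once at least 512 bytes are available to read */
	EV_SET(&change, sock_fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 512, NULL);
	(void) kevent(kq, &change, 1, NULL, 0, NULL);

	/* event.data reports the bytes of protocol data ready to read */
	(void) kevent(kq, NULL, 0, &event, 1, NULL);
}
#endif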
6537
6538 static int
6539 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6540 {
6541 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6542
6543 /* socket locked */
6544
6545 /*
6546 * If the caller explicitly asked for OOB results (e.g. poll())
6547 * from EVFILT_READ, then save that off in the kn_hook32 field
6548 * and reserve the kn_flags EV_OOBAND bit for output only.
6549 */
6550 if (kn->kn_filter == EVFILT_READ &&
6551 kn->kn_flags & EV_OOBAND) {
6552 kn->kn_flags &= ~EV_OOBAND;
6553 kn->kn_hook32 = EV_OOBAND;
6554 } else {
6555 kn->kn_hook32 = 0;
6556 }
6557 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6558 so->so_rcv.sb_flags |= SB_KNOTE;
6559 }
6560
6561 /* indicate if event is already fired */
6562 return filt_soread_common(kn, NULL, so);
6563 }
6564
6565 static void
6566 filt_sordetach(struct knote *kn)
6567 {
6568 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6569
6570 socket_lock(so, 1);
6571 if (so->so_rcv.sb_flags & SB_KNOTE) {
6572 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6573 so->so_rcv.sb_flags &= ~SB_KNOTE;
6574 }
6575 }
6576 socket_unlock(so, 1);
6577 }
6578
6579 /*ARGSUSED*/
6580 static int
6581 filt_soread(struct knote *kn, long hint)
6582 {
6583 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6584 int retval;
6585
6586 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6587 socket_lock(so, 1);
6588 }
6589
6590 retval = filt_soread_common(kn, NULL, so);
6591
6592 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6593 socket_unlock(so, 1);
6594 }
6595
6596 return retval;
6597 }
6598
6599 static int
6600 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6601 {
6602 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6603 int retval;
6604
6605 socket_lock(so, 1);
6606
6607 /* save off the new input fflags and data */
6608 kn->kn_sfflags = kev->fflags;
6609 kn->kn_sdata = kev->data;
6610
6611 /* determine if changes result in fired events */
6612 retval = filt_soread_common(kn, NULL, so);
6613
6614 socket_unlock(so, 1);
6615
6616 return retval;
6617 }
6618
6619 static int
6620 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6621 {
6622 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6623 int retval;
6624
6625 socket_lock(so, 1);
6626 retval = filt_soread_common(kn, kev, so);
6627 socket_unlock(so, 1);
6628
6629 return retval;
6630 }
6631
6632 int
6633 so_wait_for_if_feedback(struct socket *so)
6634 {
6635 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6636 (so->so_state & SS_ISCONNECTED)) {
6637 struct inpcb *inp = sotoinpcb(so);
6638 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6639 return 1;
6640 }
6641 }
6642 return 0;
6643 }
6644
6645 static int
6646 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6647 {
6648 int ret = 0;
6649 int64_t data = sbspace(&so->so_snd);
6650
6651 if (so->so_state & SS_CANTSENDMORE) {
6652 kn->kn_flags |= EV_EOF;
6653 kn->kn_fflags = so->so_error;
6654 ret = 1;
6655 goto out;
6656 }
6657
6658 if (so->so_error) { /* temporary udp error */
6659 ret = 1;
6660 goto out;
6661 }
6662
6663 if (!socanwrite(so)) {
6664 ret = 0;
6665 goto out;
6666 }
6667
6668 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6669 ret = 1;
6670 goto out;
6671 }
6672
6673 int64_t lowwat = so->so_snd.sb_lowat;
6674
6675 if (kn->kn_sfflags & NOTE_LOWAT) {
6676 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6677 lowwat = so->so_snd.sb_hiwat;
6678 } else if (kn->kn_sdata > lowwat) {
6679 lowwat = kn->kn_sdata;
6680 }
6681 }
6682
6683 if (data >= lowwat) {
6684 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6685 #if (DEBUG || DEVELOPMENT)
6686 && so_notsent_lowat_check == 1
6687 #endif /* DEBUG || DEVELOPMENT */
6688 ) {
6689 if ((SOCK_DOM(so) == PF_INET ||
6690 SOCK_DOM(so) == PF_INET6) &&
6691 so->so_type == SOCK_STREAM) {
6692 ret = tcp_notsent_lowat_check(so);
6693 }
6694 #if MPTCP
6695 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6696 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6697 ret = mptcp_notsent_lowat_check(so);
6698 }
6699 #endif
6700 else {
6701 ret = 1;
6702 goto out;
6703 }
6704 } else {
6705 ret = 1;
6706 }
6707 }
6708 if (so_wait_for_if_feedback(so)) {
6709 ret = 0;
6710 }
6711
6712 out:
6713 if (ret && kev) {
6714 knote_fill_kevent(kn, kev, data);
6715 }
6716 return ret;
6717 }
6718
6719 static int
6720 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6721 {
6722 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6723
6724 /* socket locked */
6725 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6726 so->so_snd.sb_flags |= SB_KNOTE;
6727 }
6728
6729 /* determine if it's already fired */
6730 return filt_sowrite_common(kn, NULL, so);
6731 }
6732
6733 static void
6734 filt_sowdetach(struct knote *kn)
6735 {
6736 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6737 socket_lock(so, 1);
6738
6739 if (so->so_snd.sb_flags & SB_KNOTE) {
6740 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6741 so->so_snd.sb_flags &= ~SB_KNOTE;
6742 }
6743 }
6744 socket_unlock(so, 1);
6745 }
6746
6747 /*ARGSUSED*/
6748 static int
6749 filt_sowrite(struct knote *kn, long hint)
6750 {
6751 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6752 int ret;
6753
6754 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6755 socket_lock(so, 1);
6756 }
6757
6758 ret = filt_sowrite_common(kn, NULL, so);
6759
6760 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6761 socket_unlock(so, 1);
6762 }
6763
6764 return ret;
6765 }
6766
6767 static int
6768 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6769 {
6770 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6771 int ret;
6772
6773 socket_lock(so, 1);
6774
6775 /* save off the new input fflags and data */
6776 kn->kn_sfflags = kev->fflags;
6777 kn->kn_sdata = kev->data;
6778
6779 /* determine if these changes result in a triggered event */
6780 ret = filt_sowrite_common(kn, NULL, so);
6781
6782 socket_unlock(so, 1);
6783
6784 return ret;
6785 }
6786
6787 static int
6788 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6789 {
6790 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6791 int ret;
6792
6793 socket_lock(so, 1);
6794 ret = filt_sowrite_common(kn, kev, so);
6795 socket_unlock(so, 1);
6796
6797 return ret;
6798 }
6799
6800 static int
6801 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6802 struct socket *so, long ev_hint)
6803 {
6804 int ret = 0;
6805 int64_t data = 0;
6806 uint32_t level_trigger = 0;
6807
6808 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6809 kn->kn_fflags |= NOTE_CONNRESET;
6810 }
6811 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6812 kn->kn_fflags |= NOTE_TIMEOUT;
6813 }
6814 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6815 kn->kn_fflags |= NOTE_NOSRCADDR;
6816 }
6817 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6818 kn->kn_fflags |= NOTE_IFDENIED;
6819 }
6820 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6821 kn->kn_fflags |= NOTE_KEEPALIVE;
6822 }
6823 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6824 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6825 }
6826 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6827 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6828 }
6829 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6830 (so->so_state & SS_ISCONNECTED)) {
6831 kn->kn_fflags |= NOTE_CONNECTED;
6832 level_trigger |= NOTE_CONNECTED;
6833 }
6834 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6835 (so->so_state & SS_ISDISCONNECTED)) {
6836 kn->kn_fflags |= NOTE_DISCONNECTED;
6837 level_trigger |= NOTE_DISCONNECTED;
6838 }
6839 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6840 if (so->so_proto != NULL &&
6841 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6842 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6843 }
6844 }
6845
6846 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6847 tcp_notify_ack_active(so)) {
6848 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6849 }
6850
6851 if ((so->so_state & SS_CANTRCVMORE)
6852 #if CONTENT_FILTER
6853 && cfil_sock_data_pending(&so->so_rcv) == 0
6854 #endif /* CONTENT_FILTER */
6855 ) {
6856 kn->kn_fflags |= NOTE_READCLOSED;
6857 level_trigger |= NOTE_READCLOSED;
6858 }
6859
6860 if (so->so_state & SS_CANTSENDMORE) {
6861 kn->kn_fflags |= NOTE_WRITECLOSED;
6862 level_trigger |= NOTE_WRITECLOSED;
6863 }
6864
6865 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6866 (so->so_flags & SOF_SUSPENDED)) {
6867 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6868
6869 /* If resume event was delivered before, reset it */
6870 kn->kn_hook32 &= ~NOTE_RESUME;
6871
6872 kn->kn_fflags |= NOTE_SUSPEND;
6873 level_trigger |= NOTE_SUSPEND;
6874 }
6875
6876 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6877 (so->so_flags & SOF_SUSPENDED) == 0) {
6878 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6879
6880 /* If suspend event was delivered before, reset it */
6881 kn->kn_hook32 &= ~NOTE_SUSPEND;
6882
6883 kn->kn_fflags |= NOTE_RESUME;
6884 level_trigger |= NOTE_RESUME;
6885 }
6886
6887 if (so->so_error != 0) {
6888 ret = 1;
6889 data = so->so_error;
6890 kn->kn_flags |= EV_EOF;
6891 } else {
6892 u_int32_t data32;
6893 get_sockev_state(so, &data32);
6894 data = data32;
6895 }
6896
6897 /* Reset any events that are not requested on this knote */
6898 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6899 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6900
6901 /* Find the level-triggered events that are already delivered */
6902 level_trigger &= kn->kn_hook32;
6903 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6904
6905 /* Do not deliver level-triggered events more than once */
6906 if ((kn->kn_fflags & ~level_trigger) != 0) {
6907 ret = 1;
6908 }
6909
6910 if (ret && kev) {
6911 /*
6912 * Store the state of the events being delivered. This
6913 * state can be used to deliver level-triggered events
6914 * at least once and still avoid waking up the application
6915 * multiple times as long as the event is active.
6916 */
6917 if (kn->kn_fflags != 0) {
6918 kn->kn_hook32 |= (kn->kn_fflags &
6919 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6920 }
6921
6922 /*
6923 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6924 * only one of them, and remember which one was delivered
6925 * last
6926 */
6927 if (kn->kn_fflags & NOTE_SUSPEND) {
6928 kn->kn_hook32 &= ~NOTE_RESUME;
6929 }
6930 if (kn->kn_fflags & NOTE_RESUME) {
6931 kn->kn_hook32 &= ~NOTE_SUSPEND;
6932 }
6933
6934 knote_fill_kevent(kn, kev, data);
6935 }
6936 return ret;
6937 }
6938
6939 static int
6940 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6941 {
6942 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6943
6944 /* socket locked */
6945 kn->kn_hook32 = 0;
6946 if (KNOTE_ATTACH(&so->so_klist, kn)) {
6947 so->so_flags |= SOF_KNOTE;
6948 }
6949
6950 /* determine if event already fired */
6951 return filt_sockev_common(kn, NULL, so, 0);
6952 }
6953
6954 static void
6955 filt_sockdetach(struct knote *kn)
6956 {
6957 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6958 socket_lock(so, 1);
6959
6960 if ((so->so_flags & SOF_KNOTE) != 0) {
6961 if (KNOTE_DETACH(&so->so_klist, kn)) {
6962 so->so_flags &= ~SOF_KNOTE;
6963 }
6964 }
6965 socket_unlock(so, 1);
6966 }
6967
6968 static int
6969 filt_sockev(struct knote *kn, long hint)
6970 {
6971 int ret = 0, locked = 0;
6972 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6973 long ev_hint = (hint & SO_FILT_HINT_EV);
6974
6975 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6976 socket_lock(so, 1);
6977 locked = 1;
6978 }
6979
6980 ret = filt_sockev_common(kn, NULL, so, ev_hint);
6981
6982 if (locked) {
6983 socket_unlock(so, 1);
6984 }
6985
6986 return ret;
6987 }
6988
6989
6990
6991 /*
6992 * filt_socktouch - update event state
6993 */
6994 static int
6995 filt_socktouch(
6996 struct knote *kn,
6997 struct kevent_qos_s *kev)
6998 {
6999 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7000 uint32_t changed_flags;
7001 int ret;
7002
7003 socket_lock(so, 1);
7004
7005 /* save off the [result] data and fflags */
7006 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7007
7008 /* save off the new input fflags and data */
7009 kn->kn_sfflags = kev->fflags;
7010 kn->kn_sdata = kev->data;
7011
7012 /* restrict the current results to the (smaller?) set of new interest */
7013 /*
7014 * For compatibility with previous implementations, we leave kn_fflags
7015 * as they were before.
7016 */
7017 //kn->kn_fflags &= kev->fflags;
7018
7019 /*
7020 * Since we keep track of events that are already
7021 * delivered, if any of those events are not requested
7022 * anymore the state related to them can be reset
7023 */
7024 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7025
7026 /* determine if we have events to deliver */
7027 ret = filt_sockev_common(kn, NULL, so, 0);
7028
7029 socket_unlock(so, 1);
7030
7031 return ret;
7032 }
7033
7034 /*
7035 * filt_sockprocess - query event fired state and return data
7036 */
7037 static int
7038 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7039 {
7040 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7041 int ret = 0;
7042
7043 socket_lock(so, 1);
7044
7045 ret = filt_sockev_common(kn, kev, so, 0);
7046
7047 socket_unlock(so, 1);
7048
7049 return ret;
7050 }
7051
7052 void
7053 get_sockev_state(struct socket *so, u_int32_t *statep)
7054 {
7055 u_int32_t state = *(statep);
7056
7057 /*
7058 * If the state variable already holds a value from a previous
7059 * event, leave it as is.
7060 */
7061 if (state != 0) {
7062 return;
7063 }
7064
7065 if (so->so_state & SS_ISCONNECTED) {
7066 state |= SOCKEV_CONNECTED;
7067 } else {
7068 state &= ~(SOCKEV_CONNECTED);
7069 }
7070 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7071 *(statep) = state;
7072 }
7073
7074 #define SO_LOCK_HISTORY_STR_LEN \
7075 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7076
7077 __private_extern__ const char *
7078 solockhistory_nr(struct socket *so)
7079 {
7080 size_t n = 0;
7081 int i;
7082 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7083
7084 bzero(lock_history_str, sizeof(lock_history_str));
7085 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7086 n += snprintf(lock_history_str + n,
7087 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7088 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7089 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7090 }
7091 return lock_history_str;
7092 }
7093
7094 lck_mtx_t *
7095 socket_getlock(struct socket *so, int flags)
7096 {
7097 if (so->so_proto->pr_getlock != NULL) {
7098 return (*so->so_proto->pr_getlock)(so, flags);
7099 } else {
7100 return so->so_proto->pr_domain->dom_mtx;
7101 }
7102 }
7103
7104 void
7105 socket_lock(struct socket *so, int refcount)
7106 {
7107 void *lr_saved;
7108
7109 lr_saved = __builtin_return_address(0);
7110
7111 if (so->so_proto->pr_lock) {
7112 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7113 } else {
7114 #ifdef MORE_LOCKING_DEBUG
7115 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7116 LCK_MTX_ASSERT_NOTOWNED);
7117 #endif
7118 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7119 if (refcount) {
7120 so->so_usecount++;
7121 }
7122 so->lock_lr[so->next_lock_lr] = lr_saved;
7123 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7124 }
7125 }
7126
7127 void
7128 socket_lock_assert_owned(struct socket *so)
7129 {
7130 lck_mtx_t *mutex_held;
7131
7132 if (so->so_proto->pr_getlock != NULL) {
7133 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7134 } else {
7135 mutex_held = so->so_proto->pr_domain->dom_mtx;
7136 }
7137
7138 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7139 }
7140
7141 int
7142 socket_try_lock(struct socket *so)
7143 {
7144 lck_mtx_t *mtx;
7145
7146 if (so->so_proto->pr_getlock != NULL) {
7147 mtx = (*so->so_proto->pr_getlock)(so, 0);
7148 } else {
7149 mtx = so->so_proto->pr_domain->dom_mtx;
7150 }
7151
7152 return lck_mtx_try_lock(mtx);
7153 }
7154
7155 void
7156 socket_unlock(struct socket *so, int refcount)
7157 {
7158 void *lr_saved;
7159 lck_mtx_t *mutex_held;
7160
7161 lr_saved = __builtin_return_address(0);
7162
7163 if (so == NULL || so->so_proto == NULL) {
7164 panic("%s: null so_proto so=%p\n", __func__, so);
7165 /* NOTREACHED */
7166 }
7167
7168 if (so->so_proto->pr_unlock) {
7169 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7170 } else {
7171 mutex_held = so->so_proto->pr_domain->dom_mtx;
7172 #ifdef MORE_LOCKING_DEBUG
7173 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7174 #endif
7175 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7176 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7177
7178 if (refcount) {
7179 if (so->so_usecount <= 0) {
7180 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7181 "lrh=%s", __func__, so->so_usecount, so,
7182 SOCK_DOM(so), so->so_type,
7183 SOCK_PROTO(so), solockhistory_nr(so));
7184 /* NOTREACHED */
7185 }
7186
7187 so->so_usecount--;
7188 if (so->so_usecount == 0) {
7189 sofreelastref(so, 1);
7190 }
7191 }
7192 lck_mtx_unlock(mutex_held);
7193 }
7194 }
7195
7196 /* Called with socket locked, will unlock socket */
7197 void
7198 sofree(struct socket *so)
7199 {
7200 lck_mtx_t *mutex_held;
7201
7202 if (so->so_proto->pr_getlock != NULL) {
7203 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7204 } else {
7205 mutex_held = so->so_proto->pr_domain->dom_mtx;
7206 }
7207 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7208
7209 sofreelastref(so, 0);
7210 }
7211
7212 void
7213 soreference(struct socket *so)
7214 {
7215 socket_lock(so, 1); /* lock & take one reference on the socket */
7216 socket_unlock(so, 0); /* unlock only */
7217 }
7218
7219 void
7220 sodereference(struct socket *so)
7221 {
7222 socket_lock(so, 0);
7223 socket_unlock(so, 1);
7224 }
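/*
 * A minimal sketch of the locking and use-count convention implemented above:
 * socket_lock(so, 1) takes the lock and a use count, and the matching
 * socket_unlock(so, 1) drops both, freeing the socket on the last release.
 * The helper name is hypothetical.
 */
#if 0   /* illustrative sketch only -- not part of the build */
static void
example_with_socket_locked(struct socket *so)
{
	socket_lock(so, 1);		/* lock + hold a use count */
	socket_lock_assert_owned(so);

	/* ... operate on the socket while it cannot go away ... */

	socket_unlock(so, 1);		/* drop the use count and unlock */
}
#endif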
7225
7226 /*
7227 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7228 * possibility of using jumbo clusters. The caller must hold
7229 * the socket lock.
7230 */
7231 void
7232 somultipages(struct socket *so, boolean_t set)
7233 {
7234 if (set) {
7235 so->so_flags |= SOF_MULTIPAGES;
7236 } else {
7237 so->so_flags &= ~SOF_MULTIPAGES;
7238 }
7239 }
7240
7241 void
7242 soif2kcl(struct socket *so, boolean_t set)
7243 {
7244 if (set) {
7245 so->so_flags1 |= SOF1_IF_2KCL;
7246 } else {
7247 so->so_flags1 &= ~SOF1_IF_2KCL;
7248 }
7249 }
7250
7251 int
7252 so_isdstlocal(struct socket *so)
7253 {
7254 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7255
7256 if (SOCK_DOM(so) == PF_INET) {
7257 return inaddr_local(inp->inp_faddr);
7258 } else if (SOCK_DOM(so) == PF_INET6) {
7259 return in6addr_local(&inp->in6p_faddr);
7260 }
7261
7262 return 0;
7263 }
7264
7265 int
7266 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7267 {
7268 struct sockbuf *rcv, *snd;
7269 int err = 0, defunct;
7270
7271 rcv = &so->so_rcv;
7272 snd = &so->so_snd;
7273
7274 defunct = (so->so_flags & SOF_DEFUNCT);
7275 if (defunct) {
7276 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7277 panic("%s: SB_DROP not set", __func__);
7278 /* NOTREACHED */
7279 }
7280 goto done;
7281 }
7282
7283 if (so->so_flags & SOF_NODEFUNCT) {
7284 if (noforce) {
7285 err = EOPNOTSUPP;
7286 if (p != PROC_NULL) {
7287 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7288 "name %s level %d) so 0x%llx [%d,%d] "
7289 "is not eligible for defunct "
7290 "(%d)\n", __func__, proc_selfpid(),
7291 proc_best_name(current_proc()), proc_pid(p),
7292 proc_best_name(p), level,
7293 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7294 SOCK_DOM(so), SOCK_TYPE(so), err);
7295 }
7296 return err;
7297 }
7298 so->so_flags &= ~SOF_NODEFUNCT;
7299 if (p != PROC_NULL) {
7300 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7301 "name %s level %d) so 0x%llx [%d,%d] "
7302 "defunct by force "
7303 "(%d)\n", __func__, proc_selfpid(),
7304 proc_best_name(current_proc()), proc_pid(p),
7305 proc_best_name(p), level,
7306 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7307 SOCK_DOM(so), SOCK_TYPE(so), err);
7308 }
7309 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7310 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7311 struct ifnet *ifp = inp->inp_last_outifp;
7312
7313 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7314 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7315 } else if (so->so_flags & SOF_DELEGATED) {
7316 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7317 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7318 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7319 } else if (noforce && p != PROC_NULL) {
7320 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7321
7322 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7323 so->so_extended_bk_start = net_uptime();
7324 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7325
7326 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7327
7328 err = EOPNOTSUPP;
7329 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7330 "name %s level %d) so 0x%llx [%d,%d] "
7331 "extend bk idle "
7332 "(%d)\n", __func__, proc_selfpid(),
7333 proc_best_name(current_proc()), proc_pid(p),
7334 proc_best_name(p), level,
7335 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7336 SOCK_DOM(so), SOCK_TYPE(so), err);
7337 return err;
7338 } else {
7339 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7340 }
7341 }
7342
7343 so->so_flags |= SOF_DEFUNCT;
7344
7345 /* Prevent further data from being appended to the socket buffers */
7346 snd->sb_flags |= SB_DROP;
7347 rcv->sb_flags |= SB_DROP;
7348
7349 /* Flush any existing data in the socket buffers */
7350 if (rcv->sb_cc != 0) {
7351 rcv->sb_flags &= ~SB_SEL;
7352 selthreadclear(&rcv->sb_sel);
7353 sbrelease(rcv);
7354 }
7355 if (snd->sb_cc != 0) {
7356 snd->sb_flags &= ~SB_SEL;
7357 selthreadclear(&snd->sb_sel);
7358 sbrelease(snd);
7359 }
7360
7361 done:
7362 if (p != PROC_NULL) {
7363 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7364 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7365 proc_selfpid(), proc_best_name(current_proc()),
7366 proc_pid(p), proc_best_name(p), level,
7367 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7368 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7369 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7370 " extbkidle" : "");
7371 }
7372 return err;
7373 }
7374
7375 int
7376 sodefunct(struct proc *p, struct socket *so, int level)
7377 {
7378 struct sockbuf *rcv, *snd;
7379
7380 if (!(so->so_flags & SOF_DEFUNCT)) {
7381 panic("%s improperly called", __func__);
7382 /* NOTREACHED */
7383 }
7384 if (so->so_state & SS_DEFUNCT) {
7385 goto done;
7386 }
7387
7388 rcv = &so->so_rcv;
7389 snd = &so->so_snd;
7390
7391 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7392 char s[MAX_IPv6_STR_LEN];
7393 char d[MAX_IPv6_STR_LEN];
7394 struct inpcb *inp = sotoinpcb(so);
7395
7396 if (p != PROC_NULL) {
7397 SODEFUNCTLOG(
7398 "%s[%d, %s]: (target pid %d name %s level %d) "
7399 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7400 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7401 " snd_fl 0x%x]\n", __func__,
7402 proc_selfpid(), proc_best_name(current_proc()),
7403 proc_pid(p), proc_best_name(p), level,
7404 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7405 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7406 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7407 (void *)&inp->inp_laddr.s_addr :
7408 (void *)&inp->in6p_laddr),
7409 s, sizeof(s)), ntohs(inp->in6p_lport),
7410 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7411 (void *)&inp->inp_faddr.s_addr :
7412 (void *)&inp->in6p_faddr,
7413 d, sizeof(d)), ntohs(inp->in6p_fport),
7414 (uint32_t)rcv->sb_sel.si_flags,
7415 (uint32_t)snd->sb_sel.si_flags,
7416 rcv->sb_flags, snd->sb_flags);
7417 }
7418 } else if (p != PROC_NULL) {
7419 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7420 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7421 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7422 proc_selfpid(), proc_best_name(current_proc()),
7423 proc_pid(p), proc_best_name(p), level,
7424 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7425 SOCK_DOM(so), SOCK_TYPE(so),
7426 (uint32_t)rcv->sb_sel.si_flags,
7427 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7428 snd->sb_flags);
7429 }
7430
7431 /*
7432 * Unwedge threads blocked on sbwait() and sb_lock().
7433 */
7434 sbwakeup(rcv);
7435 sbwakeup(snd);
7436
7437 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7438 if (rcv->sb_flags & SB_LOCK) {
7439 sbunlock(rcv, TRUE); /* keep socket locked */
7440 }
7441 if (snd->sb_flags & SB_LOCK) {
7442 sbunlock(snd, TRUE); /* keep socket locked */
7443 }
7444 /*
7445 * Flush the buffers and disconnect. We explicitly call shutdown
7446 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7447 * states are set for the socket. This would also flush out data
7448 * hanging off the receive list of this socket.
7449 */
7450 (void) soshutdownlock_final(so, SHUT_RD);
7451 (void) soshutdownlock_final(so, SHUT_WR);
7452 (void) sodisconnectlocked(so);
7453
7454 /*
7455 * Explicitly handle connectionless-protocol disconnection
7456 * and release any remaining data in the socket buffers.
7457 */
7458 if (!(so->so_state & SS_ISDISCONNECTED)) {
7459 (void) soisdisconnected(so);
7460 }
7461
7462 if (so->so_error == 0) {
7463 so->so_error = EBADF;
7464 }
7465
7466 if (rcv->sb_cc != 0) {
7467 rcv->sb_flags &= ~SB_SEL;
7468 selthreadclear(&rcv->sb_sel);
7469 sbrelease(rcv);
7470 }
7471 if (snd->sb_cc != 0) {
7472 snd->sb_flags &= ~SB_SEL;
7473 selthreadclear(&snd->sb_sel);
7474 sbrelease(snd);
7475 }
7476 so->so_state |= SS_DEFUNCT;
7477 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7478
7479 done:
7480 return 0;
7481 }
7482
7483 int
7484 soresume(struct proc *p, struct socket *so, int locked)
7485 {
7486 if (locked == 0) {
7487 socket_lock(so, 1);
7488 }
7489
7490 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7491 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7492 "[%d,%d] resumed from bk idle\n",
7493 __func__, proc_selfpid(), proc_best_name(current_proc()),
7494 proc_pid(p), proc_best_name(p),
7495 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7496 SOCK_DOM(so), SOCK_TYPE(so));
7497
7498 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7499 so->so_extended_bk_start = 0;
7500 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7501
7502 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7503 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7504 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7505 }
7506 if (locked == 0) {
7507 socket_unlock(so, 1);
7508 }
7509
7510 return 0;
7511 }
7512
7513 /*
7514 * Does not attempt to account for sockets that are delegated from
7515 * the current process
7516 */
7517 int
7518 so_set_extended_bk_idle(struct socket *so, int optval)
7519 {
7520 int error = 0;
7521
7522 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7523 SOCK_PROTO(so) != IPPROTO_TCP) {
7524 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7525 error = EOPNOTSUPP;
7526 } else if (optval == 0) {
7527 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7528
7529 soresume(current_proc(), so, 1);
7530 } else {
7531 struct proc *p = current_proc();
7532 int i;
7533 struct filedesc *fdp;
7534 int count = 0;
7535
7536 /*
7537 * Unlock the socket to avoid a lock ordering issue with
7538 * the proc fd table lock
7539 */
7540 socket_unlock(so, 0);
7541
7542 proc_fdlock(p);
7543
7544 fdp = p->p_fd;
7545 for (i = 0; i < fdp->fd_nfiles; i++) {
7546 struct fileproc *fp = fdp->fd_ofiles[i];
7547 struct socket *so2;
7548
7549 if (fp == NULL ||
7550 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7551 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7552 continue;
7553 }
7554
7555 so2 = (struct socket *)fp->f_fglob->fg_data;
7556 if (so != so2 &&
7557 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7558 count++;
7559 }
7560 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7561 break;
7562 }
7563 }
7564 proc_fdunlock(p);
7565
7566 socket_lock(so, 0);
7567
7568 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7569 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7570 error = EBUSY;
7571 } else if (so->so_flags & SOF_DELEGATED) {
7572 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7573 error = EBUSY;
7574 } else {
7575 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7576 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7577 }
7578 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7579 "%s marked for extended bk idle\n",
7580 __func__, proc_selfpid(), proc_best_name(current_proc()),
7581 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7582 SOCK_DOM(so), SOCK_TYPE(so),
7583 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7584 "is" : "not");
7585 }
7586
7587 return error;
7588 }
7589
7590 static void
7591 so_stop_extended_bk_idle(struct socket *so)
7592 {
7593 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7594 so->so_extended_bk_start = 0;
7595
7596 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7597 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7598 /*
7599 * Force defunct
7600 */
7601 sosetdefunct(current_proc(), so,
7602 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7603 if (so->so_flags & SOF_DEFUNCT) {
7604 sodefunct(current_proc(), so,
7605 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7606 }
7607 }
7608
7609 void
7610 so_drain_extended_bk_idle(struct socket *so)
7611 {
7612 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7613 /*
7614 * Only penalize sockets that have outstanding data
7615 */
7616 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7617 so_stop_extended_bk_idle(so);
7618
7619 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7620 }
7621 }
7622 }
7623
7624 /*
7625 * The return value tells whether the socket is still in extended background idle
7626 */
7627 int
7628 so_check_extended_bk_idle_time(struct socket *so)
7629 {
7630 int ret = 1;
7631
7632 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7633 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7634 __func__, proc_selfpid(), proc_best_name(current_proc()),
7635 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7636 SOCK_DOM(so), SOCK_TYPE(so));
7637 if (net_uptime() - so->so_extended_bk_start >
7638 soextbkidlestat.so_xbkidle_time) {
7639 so_stop_extended_bk_idle(so);
7640
7641 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7642
7643 ret = 0;
7644 } else {
7645 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7646
7647 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7648 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7649 }
7650 }
7651
7652 return ret;
7653 }
7654
7655 void
7656 resume_proc_sockets(proc_t p)
7657 {
7658 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7659 struct filedesc *fdp;
7660 int i;
7661
7662 proc_fdlock(p);
7663 fdp = p->p_fd;
7664 for (i = 0; i < fdp->fd_nfiles; i++) {
7665 struct fileproc *fp;
7666 struct socket *so;
7667
7668 fp = fdp->fd_ofiles[i];
7669 if (fp == NULL ||
7670 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7671 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
7672 continue;
7673 }
7674
7675 so = (struct socket *)fp->f_fglob->fg_data;
7676 (void) soresume(p, so, 0);
7677 }
7678 proc_fdunlock(p);
7679
7680 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7681 }
7682 }
7683
7684 __private_extern__ int
7685 so_set_recv_anyif(struct socket *so, int optval)
7686 {
7687 int ret = 0;
7688
7689 #if INET6
7690 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7691 #else
7692 if (SOCK_DOM(so) == PF_INET) {
7693 #endif /* !INET6 */
7694 if (optval) {
7695 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7696 } else {
7697 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7698 }
7699 }
7700
7701
7702 return ret;
7703 }
7704
7705 __private_extern__ int
7706 so_get_recv_anyif(struct socket *so)
7707 {
7708 int ret = 0;
7709
7710 #if INET6
7711 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7712 #else
7713 if (SOCK_DOM(so) == PF_INET) {
7714 #endif /* !INET6 */
7715 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7716 }
7717
7718 return ret;
7719 }
7720
7721 int
7722 so_set_restrictions(struct socket *so, uint32_t vals)
7723 {
7724 int nocell_old, nocell_new;
7725 int noexpensive_old, noexpensive_new;
7726 int noconstrained_old, noconstrained_new;
7727
7728 /*
7729 * Deny-type restrictions are trapdoors; once set they cannot be
7730 * unset for the lifetime of the socket. This allows them to be
7731 * issued by a framework on behalf of the application without
7732 * having to worry that they can be undone.
7733 *
7734 * Note here that socket-level restrictions override any protocol-
7735 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7736 * restriction issued on the socket has a higher precedence
7737 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7738 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7739 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7740 */
7741 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7742 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7743 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7744 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7745 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7746 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7747 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7748 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7749 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7750
7751 /* we can only set, not clear restrictions */
7752 if ((nocell_new - nocell_old) == 0 &&
7753 (noexpensive_new - noexpensive_old) == 0 &&
7754 (noconstrained_new - noconstrained_old) == 0) {
7755 return 0;
7756 }
7757 #if INET6
7758 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7759 #else
7760 if (SOCK_DOM(so) == PF_INET) {
7761 #endif /* !INET6 */
7762 if (nocell_new - nocell_old != 0) {
7763 /*
7764 * if deny cellular is now set, do what's needed
7765 * for INPCB
7766 */
7767 inp_set_nocellular(sotoinpcb(so));
7768 }
7769 if (noexpensive_new - noexpensive_old != 0) {
7770 inp_set_noexpensive(sotoinpcb(so));
7771 }
7772 if (noconstrained_new - noconstrained_old != 0) {
7773 inp_set_noconstrained(sotoinpcb(so));
7774 }
7775 }
7776
7777 if (SOCK_DOM(so) == PF_MULTIPATH) {
7778 mptcp_set_restrictions(so);
7779 }
7780
7781 return 0;
7782 }
7783
7784 uint32_t
7785 so_get_restrictions(struct socket *so)
7786 {
7787 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7788 SO_RESTRICT_DENY_OUT |
7789 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7790 }
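/*
 * A minimal sketch of the "trapdoor" behaviour described in
 * so_set_restrictions(): the deny bits can only be ORed in, so a later call
 * that omits a bit does not clear it.  The socket pointer is an assumption
 * for the example and the socket lock is assumed to be held.
 */
#if 0   /* illustrative sketch only -- not part of the build */
static void
example_restriction_is_sticky(struct socket *so)
{
	(void) so_set_restrictions(so, SO_RESTRICT_DENY_CELLULAR);

	/* Passing 0 later leaves the deny-cellular bit set */
	(void) so_set_restrictions(so, 0);

	VERIFY(so_get_restrictions(so) & SO_RESTRICT_DENY_CELLULAR);
}
#endif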
7791
7792 int
7793 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7794 {
7795 struct proc *ep = PROC_NULL;
7796 int error = 0;
7797
7798 /* pid 0 is reserved for kernel */
7799 if (epid == 0) {
7800 error = EINVAL;
7801 goto done;
7802 }
7803
7804 /*
7805 * If this is an in-kernel socket, prevent its delegate
7806 * association from changing unless the socket option is
7807 * coming from within the kernel itself.
7808 */
7809 if (so->last_pid == 0 && p != kernproc) {
7810 error = EACCES;
7811 goto done;
7812 }
7813
7814 /*
7815 * If this is issued by a process that's recorded as the
7816 * real owner of the socket, or if the pid is the same as
7817 * the process's own pid, then proceed. Otherwise ensure
7818 * that the issuing process has the necessary privileges.
7819 */
7820 if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7821 if ((error = priv_check_cred(kauth_cred_get(),
7822 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7823 error = EACCES;
7824 goto done;
7825 }
7826 }
7827
7828 /* Find the process that corresponds to the effective pid */
7829 if ((ep = proc_find(epid)) == PROC_NULL) {
7830 error = ESRCH;
7831 goto done;
7832 }
7833
7834 /*
7835 * If a process tries to delegate the socket to itself, then
7836 * there's really nothing to do; treat it as a way for the
7837 * delegate association to be cleared. Note that we check
7838 * the passed-in proc rather than calling proc_selfpid(),
7839 * as we need to check the process issuing the socket option
7840 * which could be kernproc. Given that we don't allow 0 for
7841 * effective pid, it means that a delegated in-kernel socket
7842 * stays delegated during its lifetime (which is probably OK.)
7843 */
7844 if (epid == proc_pid(p)) {
7845 so->so_flags &= ~SOF_DELEGATED;
7846 so->e_upid = 0;
7847 so->e_pid = 0;
7848 uuid_clear(so->e_uuid);
7849 } else {
7850 so->so_flags |= SOF_DELEGATED;
7851 so->e_upid = proc_uniqueid(ep);
7852 so->e_pid = proc_pid(ep);
7853 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7854 }
7855 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7856 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
7857 }
7858 done:
7859 if (error == 0 && net_io_policy_log) {
7860 uuid_string_t buf;
7861
7862 uuid_unparse(so->e_uuid, buf);
7863 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7864 "euuid %s%s\n", __func__, proc_name_address(p),
7865 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7866 SOCK_DOM(so), SOCK_TYPE(so),
7867 so->e_pid, proc_name_address(ep), buf,
7868 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7869 } else if (error != 0 && net_io_policy_log) {
7870 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7871 "ERROR (%d)\n", __func__, proc_name_address(p),
7872 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7873 SOCK_DOM(so), SOCK_TYPE(so),
7874 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7875 proc_name_address(ep), error);
7876 }
7877
7878 /* Update this socket's policy upon success */
7879 if (error == 0) {
7880 so->so_policy_gencnt *= -1;
7881 so_update_policy(so);
7882 #if NECP
7883 so_update_necp_policy(so, NULL, NULL);
7884 #endif /* NECP */
7885 }
7886
7887 if (ep != PROC_NULL) {
7888 proc_rele(ep);
7889 }
7890
7891 return error;
7892 }
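/*
 * A minimal sketch of delegating a socket's effective owner by pid using
 * so_set_effective_pid().  As the comments above note, delegating to the
 * issuing process's own pid simply clears any existing delegation, and the
 * call can fail with EACCES without PRIV_NET_PRIVILEGED_SOCKET_DELEGATE.
 * The socket/proc pointers and the helper name are assumptions.
 */
#if 0   /* illustrative sketch only -- not part of the build */
static void
example_delegate_socket(struct socket *so, struct proc *p, pid_t target_pid)
{
	int error = so_set_effective_pid(so, target_pid, p, TRUE);

	if (error == 0 && target_pid == proc_pid(p)) {
		/* Delegation to self just clears SOF_DELEGATED */
		VERIFY((so->so_flags & SOF_DELEGATED) == 0);
	}
}
#endif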
7893
7894 int
7895 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
7896 {
7897 uuid_string_t buf;
7898 uuid_t uuid;
7899 int error = 0;
7900
7901 /* UUID must not be all-zeroes (reserved for kernel) */
7902 if (uuid_is_null(euuid)) {
7903 error = EINVAL;
7904 goto done;
7905 }
7906
7907 /*
7908 * If this is an in-kernel socket, prevent its delegate
7909 * association from changing unless the socket option is
7910 * coming from within the kernel itself.
7911 */
7912 if (so->last_pid == 0 && p != kernproc) {
7913 error = EACCES;
7914 goto done;
7915 }
7916
7917 /* Get the UUID of the issuing process */
7918 proc_getexecutableuuid(p, uuid, sizeof(uuid));
7919
7920 /*
7921 * If this is issued by a process that's recorded as the
7922 * real owner of the socket, or if the uuid is the same as
7923 * the process's own uuid, then proceed. Otherwise ensure
7924 * that the issuing process has the necessary privileges.
7925 */
7926 if (check_cred &&
7927 (uuid_compare(euuid, so->last_uuid) != 0 ||
7928 uuid_compare(euuid, uuid) != 0)) {
7929 if ((error = priv_check_cred(kauth_cred_get(),
7930 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7931 error = EACCES;
7932 goto done;
7933 }
7934 }
7935
7936 /*
7937 * If a process tries to delegate the socket to itself, then
7938 * there's really nothing to do; treat it as a way for the
7939 * delegate association to be cleared. Note that we check
7940 * the uuid of the passed-in proc rather than that of the
7941 * current process, as we need to check the process issuing
7942 * the socket option which could be kernproc itself. Given
7943 * that we don't allow 0 for effective uuid, it means that
7944 * a delegated in-kernel socket stays delegated during its
7945 * lifetime (which is okay.)
7946 */
7947 if (uuid_compare(euuid, uuid) == 0) {
7948 so->so_flags &= ~SOF_DELEGATED;
7949 so->e_upid = 0;
7950 so->e_pid = 0;
7951 uuid_clear(so->e_uuid);
7952 } else {
7953 so->so_flags |= SOF_DELEGATED;
7954 /*
7955 * Unlike so_set_effective_pid(), we only have the UUID
7956 * here and the process ID is not known. Inherit the
7957 * real {pid,upid} of the socket.
7958 */
7959 so->e_upid = so->last_upid;
7960 so->e_pid = so->last_pid;
7961 uuid_copy(so->e_uuid, euuid);
7962 }
7963 /*
7964 * The following will clear the effective process name as it's the same
7965 * as the real process
7966 */
7967 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7968 (*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
7969 }
7970 done:
7971 if (error == 0 && net_io_policy_log) {
7972 uuid_unparse(so->e_uuid, buf);
7973 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7974 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7975 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7976 SOCK_TYPE(so), so->e_pid, buf,
7977 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7978 } else if (error != 0 && net_io_policy_log) {
7979 uuid_unparse(euuid, buf);
7980 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7981 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7982 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7983 SOCK_TYPE(so), buf, error);
7984 }
7985
7986 /* Update this socket's policy upon success */
7987 if (error == 0) {
7988 so->so_policy_gencnt *= -1;
7989 so_update_policy(so);
7990 #if NECP
7991 so_update_necp_policy(so, NULL, NULL);
7992 #endif /* NECP */
7993 }
7994
7995 return error;
7996 }
7997
7998 void
7999 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8000 uint32_t ev_datalen)
8001 {
8002 struct kev_msg ev_msg;
8003
8004 /*
8005 * A netpolicy event always starts with a netpolicy_event_data
8006 * structure, but the caller can provide for a longer event
8007 * structure to post, depending on the event code.
8008 */
8009 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
8010
8011 bzero(&ev_msg, sizeof(ev_msg));
8012 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8013 ev_msg.kev_class = KEV_NETWORK_CLASS;
8014 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8015 ev_msg.event_code = ev_code;
8016
8017 ev_msg.dv[0].data_ptr = ev_data;
8018 ev_msg.dv[0].data_length = ev_datalen;
8019
8020 kev_post_msg(&ev_msg);
8021 }
8022
8023 void
8024 socket_post_kev_msg(uint32_t ev_code,
8025 struct kev_socket_event_data *ev_data,
8026 uint32_t ev_datalen)
8027 {
8028 struct kev_msg ev_msg;
8029
8030 bzero(&ev_msg, sizeof(ev_msg));
8031 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8032 ev_msg.kev_class = KEV_NETWORK_CLASS;
8033 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8034 ev_msg.event_code = ev_code;
8035
8036 ev_msg.dv[0].data_ptr = ev_data;
8037 ev_msg.dv[0].data_length = ev_datalen;
8038
8039 kev_post_msg(&ev_msg);
8040 }
8041
8042 void
8043 socket_post_kev_msg_closed(struct socket *so)
8044 {
8045 struct kev_socket_closed ev;
8046 struct sockaddr *socksa = NULL, *peersa = NULL;
8047 int err;
8048 bzero(&ev, sizeof(ev));
8049 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8050 if (err == 0) {
8051 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8052 &peersa);
8053 if (err == 0) {
8054 memcpy(&ev.ev_data.kev_sockname, socksa,
8055 min(socksa->sa_len,
8056 sizeof(ev.ev_data.kev_sockname)));
8057 memcpy(&ev.ev_data.kev_peername, peersa,
8058 min(peersa->sa_len,
8059 sizeof(ev.ev_data.kev_peername)));
8060 socket_post_kev_msg(KEV_SOCKET_CLOSED,
8061 &ev.ev_data, sizeof(ev));
8062 }
8063 }
8064 if (socksa != NULL) {
8065 FREE(socksa, M_SONAME);
8066 }
8067 if (peersa != NULL) {
8068 FREE(peersa, M_SONAME);
8069 }
8070 }