1 /*
2 * Copyright (c) 1998-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
120
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
125
126 #if CONFIG_MACF
127 #include <security/mac_framework.h>
128 #endif /* MAC */
129
130 #if MULTIPATH
131 #include <netinet/mp_pcb.h>
132 #include <netinet/mptcp_var.h>
133 #endif /* MULTIPATH */
134
135 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
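/*
 * Worked example (editorial note): ROUNDUP(0x1005, 8) == 0x1008 and
 * ROUNDUP(0x1000, 8) == 0x1000.  The add-then-mask trick assumes that
 * 'b' is a power of two.
 */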
136
137 #if DEBUG || DEVELOPMENT
138 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
139 #else
140 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
141 #endif
142
143 /* TODO: this should be in a header file somewhere */
144 extern char *proc_name_address(void *p);
145
146 static u_int32_t so_cache_hw; /* High water mark for socache */
147 static u_int32_t so_cache_timeouts; /* number of timeouts */
148 static u_int32_t so_cache_max_freed; /* max freed per timeout */
149 static u_int32_t cached_sock_count = 0;
150 STAILQ_HEAD(, socket) so_cache_head;
151 int max_cached_sock_count = MAX_CACHED_SOCKETS;
152 static u_int32_t so_cache_time;
153 static int socketinit_done;
154 static struct zone *so_cache_zone;
155
156 static lck_grp_t *so_cache_mtx_grp;
157 static lck_attr_t *so_cache_mtx_attr;
158 static lck_grp_attr_t *so_cache_mtx_grp_attr;
159 static lck_mtx_t *so_cache_mtx;
160
161 #include <machine/limits.h>
162
163 static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
164 static void filt_sordetach(struct knote *kn);
165 static int filt_soread(struct knote *kn, long hint);
166 static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
167 static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
168
169 static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
170 static void filt_sowdetach(struct knote *kn);
171 static int filt_sowrite(struct knote *kn, long hint);
172 static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
173 static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
174
175 static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
176 static void filt_sockdetach(struct knote *kn);
177 static int filt_sockev(struct knote *kn, long hint);
178 static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
179 static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
180
181 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
182 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
183
184 SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
185 .f_isfd = 1,
186 .f_attach = filt_sorattach,
187 .f_detach = filt_sordetach,
188 .f_event = filt_soread,
189 .f_touch = filt_sortouch,
190 .f_process = filt_sorprocess,
191 };
192
193 SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
194 .f_isfd = 1,
195 .f_attach = filt_sowattach,
196 .f_detach = filt_sowdetach,
197 .f_event = filt_sowrite,
198 .f_touch = filt_sowtouch,
199 .f_process = filt_sowprocess,
200 };
201
202 SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
203 .f_isfd = 1,
204 .f_attach = filt_sockattach,
205 .f_detach = filt_sockdetach,
206 .f_event = filt_sockev,
207 .f_touch = filt_socktouch,
208 .f_process = filt_sockprocess,
209 };
210
211 SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
212 .f_isfd = 1,
213 .f_attach = filt_sorattach,
214 .f_detach = filt_sordetach,
215 .f_event = filt_soread,
216 .f_touch = filt_sortouch,
217 .f_process = filt_sorprocess,
218 };
219
220 SYSCTL_DECL(_kern_ipc);
221
222 #define EVEN_MORE_LOCKING_DEBUG 0
223
224 int socket_debug = 0;
225 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
226 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
227
228 static unsigned long sodefunct_calls = 0;
229 SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
230 &sodefunct_calls, "");
231
232 ZONE_DECLARE(socket_zone, "socket", sizeof(struct socket), ZC_ZFREE_CLEARMEM);
233 so_gen_t so_gencnt; /* generation count for sockets */
234
235 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
236 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
237
238 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
239 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
240 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
241 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
242 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
243 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
244 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
245 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
246 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
247
248 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
249
250 int somaxconn = SOMAXCONN;
251 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
252 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
253
254 /* Should we get a maximum also ??? */
255 static int sosendmaxchain = 65536;
256 static int sosendminchain = 16384;
257 static int sorecvmincopy = 16384;
258 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
259 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
260 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
261 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
262
263 /*
264 * Set to enable jumbo clusters (if available) for large writes when
265 * the socket is marked with SOF_MULTIPAGES; see below.
266 */
267 int sosendjcl = 1;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
269 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
270
271 /*
272 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
273 * writes on the socket for all protocols on any network interfaces,
274 * depending upon sosendjcl above. Be extra careful when setting this
275 * to 1, because sending down packets that cross physical pages down to
276 * broken drivers (those that falsely assume that the physical pages
277 * are contiguous) might lead to system panics or silent data corruption.
278 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
279 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
280 * capable. Set this to 1 only for testing/debugging purposes.
281 */
282 int sosendjcl_ignore_capab = 0;
283 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
284 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
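/*
 * Illustrative sketch (editorial addition, not part of the original file):
 * the tunables above are exported under kern.ipc, so a userland tool can
 * inspect or (with privilege) toggle them via sysctlbyname(3), e.g.:
 *
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int val = 0;
 *	size_t len = sizeof(val);
 *	if (sysctlbyname("kern.ipc.sosendjcl", &val, &len, NULL, 0) == 0)
 *		printf("sosendjcl=%d\n", val);
 *
 *	int one = 1;
 *	(void) sysctlbyname("kern.ipc.sosendjcl_ignore_capab",
 *	    NULL, NULL, &one, sizeof(one));	<- writing requires root
 *
 * The sysctl names follow directly from the SYSCTL_INT declarations above.
 */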
285
286 /*
287 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
288 * writes on the socket for all protocols on any network interfaces.
289 * Be extra careful when setting this to 1, because sending down packets with
290 * clusters larger than 2 KB might lead to system panics or data corruption.
291 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
292 * on the outgoing interface.
293 * Set this to 1 for testing/debugging purposes only.
294 */
295 int sosendbigcl_ignore_capab = 0;
296 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
297 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
298
299 int sodefunctlog = 0;
300 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
301 &sodefunctlog, 0, "");
302
303 int sothrottlelog = 0;
304 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
305 &sothrottlelog, 0, "");
306
307 int sorestrictrecv = 1;
308 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
309 &sorestrictrecv, 0, "Enable inbound interface restrictions");
310
311 int sorestrictsend = 1;
312 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
313 &sorestrictsend, 0, "Enable outbound interface restrictions");
314
315 int soreserveheadroom = 1;
316 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
317 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
318
319 #if (DEBUG || DEVELOPMENT)
320 int so_notsent_lowat_check = 1;
321 SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
322 &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
323 #endif /* DEBUG || DEVELOPMENT */
324
325 int so_accept_list_waits = 0;
326 #if (DEBUG || DEVELOPMENT)
327 SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
328 &so_accept_list_waits, 0, "number of waits for listener incomp list");
329 #endif /* DEBUG || DEVELOPMENT */
330
331 extern struct inpcbinfo tcbinfo;
332
333 /* TODO: these should be in a header file */
334 extern int get_inpcb_str_size(void);
335 extern int get_tcp_str_size(void);
336
337 vm_size_t so_cache_zone_element_size;
338
339 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
340 user_ssize_t *);
341 static void cached_sock_alloc(struct socket **, zalloc_flags_t);
342 static void cached_sock_free(struct socket *);
343
344 /*
345 * Maximum number of extended background idle sockets per process
346 * Set to zero to disable further setting of the option
347 */
348
349 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
350 #define SO_IDLE_BK_IDLE_TIME 600
351 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
352
353 struct soextbkidlestat soextbkidlestat;
354
355 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
356 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
357 "Maximum of extended background idle sockets per process");
358
359 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
360 &soextbkidlestat.so_xbkidle_time, 0,
361 "Time in seconds to keep extended background idle sockets");
362
363 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
364 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
365 "High water mark for extended background idle sockets");
366
367 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
368 &soextbkidlestat, soextbkidlestat, "");
369
370 int so_set_extended_bk_idle(struct socket *, int);
371
372
373 /*
374 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
375 * setting the DSCP code on the packet based on the service class; see
376 * <rdar://problem/11277343> for details.
377 */
378 __private_extern__ u_int32_t sotcdb = 0;
379 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
380 &sotcdb, 0, "");
381
382 void
383 socketinit(void)
384 {
385 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
386 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
387
388 #ifdef __LP64__
389 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
391 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
392 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
393 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
394 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
395 #else
396 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
398 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
399 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
400 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
401 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
402 #endif
403
404 if (socketinit_done) {
405 printf("socketinit: already called...\n");
406 return;
407 }
408 socketinit_done = 1;
409
410 PE_parse_boot_argn("socket_debug", &socket_debug,
411 sizeof(socket_debug));
412
413 /*
414 * allocate lock group attribute and group for socket cache mutex
415 */
416 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
417 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
418 so_cache_mtx_grp_attr);
419
420 /*
421 * allocate the lock attribute for socket cache mutex
422 */
423 so_cache_mtx_attr = lck_attr_alloc_init();
424
425 /* cached sockets mutex */
426 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
427 if (so_cache_mtx == NULL) {
428 panic("%s: unable to allocate so_cache_mtx\n", __func__);
429 /* NOTREACHED */
430 }
431 STAILQ_INIT(&so_cache_head);
432
433 so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
434 + get_inpcb_str_size() + 4 + get_tcp_str_size());
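/*
 * Editorial note: the two extra 4-byte pads appear to leave slack so that
 * cached_sock_alloc() below can ALIGN() the saved inpcb and tcpcb areas
 * on longword boundaries within this single allocation.
 */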
435
436 so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
437 ZC_ZFREE_CLEARMEM | ZC_NOENCRYPT);
438
439 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
440 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
441 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
442 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
443
444 in_pcbinit();
445 sflt_init();
446 socket_tclass_init();
447 #if MULTIPATH
448 mp_pcbinit();
449 #endif /* MULTIPATH */
450 }
451
452 static void
453 cached_sock_alloc(struct socket **so, zalloc_flags_t how)
454 {
455 caddr_t temp;
456 uintptr_t offset;
457
458 lck_mtx_lock(so_cache_mtx);
459
460 if (!STAILQ_EMPTY(&so_cache_head)) {
461 VERIFY(cached_sock_count > 0);
462
463 *so = STAILQ_FIRST(&so_cache_head);
464 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
465 STAILQ_NEXT((*so), so_cache_ent) = NULL;
466
467 cached_sock_count--;
468 lck_mtx_unlock(so_cache_mtx);
469
470 temp = (*so)->so_saved_pcb;
471 bzero((caddr_t)*so, sizeof(struct socket));
472
473 (*so)->so_saved_pcb = temp;
474 } else {
475 lck_mtx_unlock(so_cache_mtx);
476
477 *so = zalloc_flags(so_cache_zone, how | Z_ZERO);
478
479 /*
480 * Define offsets for extra structures into our
481 * single block of memory. Align extra structures
482 * on longword boundaries.
483 */
484
485 offset = (uintptr_t)*so;
486 offset += sizeof(struct socket);
487
488 offset = ALIGN(offset);
489
490 (*so)->so_saved_pcb = (caddr_t)offset;
491 offset += get_inpcb_str_size();
492
493 offset = ALIGN(offset);
494
495 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
496 (caddr_t)offset;
497 }
498
499 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
500 }
501
502 static void
503 cached_sock_free(struct socket *so)
504 {
505 lck_mtx_lock(so_cache_mtx);
506
507 so_cache_time = net_uptime();
508 if (++cached_sock_count > max_cached_sock_count) {
509 --cached_sock_count;
510 lck_mtx_unlock(so_cache_mtx);
511 zfree(so_cache_zone, so);
512 } else {
513 if (so_cache_hw < cached_sock_count) {
514 so_cache_hw = cached_sock_count;
515 }
516
517 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
518
519 so->cache_timestamp = so_cache_time;
520 lck_mtx_unlock(so_cache_mtx);
521 }
522 }
523
524 void
525 so_update_last_owner_locked(struct socket *so, proc_t self)
526 {
527 if (so->last_pid != 0) {
528 /*
529 * last_pid and last_upid should remain zero for sockets
530 * created using sock_socket. The check above achieves that.
531 */
532 if (self == PROC_NULL) {
533 self = current_proc();
534 }
535
536 if (so->last_upid != proc_uniqueid(self) ||
537 so->last_pid != proc_pid(self)) {
538 so->last_upid = proc_uniqueid(self);
539 so->last_pid = proc_pid(self);
540 proc_getexecutableuuid(self, so->last_uuid,
541 sizeof(so->last_uuid));
542 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
543 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
544 }
545 }
546 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
547 }
548 }
549
550 void
551 so_update_policy(struct socket *so)
552 {
553 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
554 (void) inp_update_policy(sotoinpcb(so));
555 }
556 }
557
558 #if NECP
559 static void
560 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
561 struct sockaddr *override_remote_addr)
562 {
563 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
564 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
565 override_remote_addr, 0);
566 }
567 }
568 #endif /* NECP */
569
570 boolean_t
571 so_cache_timer(void)
572 {
573 struct socket *p;
574 int n_freed = 0;
575 boolean_t rc = FALSE;
576
577 lck_mtx_lock(so_cache_mtx);
578 so_cache_timeouts++;
579 so_cache_time = net_uptime();
580
581 while (!STAILQ_EMPTY(&so_cache_head)) {
582 VERIFY(cached_sock_count > 0);
583 p = STAILQ_FIRST(&so_cache_head);
584 if ((so_cache_time - p->cache_timestamp) <
585 SO_CACHE_TIME_LIMIT) {
586 break;
587 }
588
589 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
590 --cached_sock_count;
591
592 zfree(so_cache_zone, p);
593
594 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
595 so_cache_max_freed++;
596 break;
597 }
598 }
599
600 /* Schedule again if there is more to clean up */
601 if (!STAILQ_EMPTY(&so_cache_head)) {
602 rc = TRUE;
603 }
604
605 lck_mtx_unlock(so_cache_mtx);
606 return rc;
607 }
608
609 /*
610 * Get a socket structure from our zone, and initialize it.
611 * We don't implement `waitok' yet (see comments in uipc_domain.c).
612 * Note that it would probably be better to allocate socket
613 * and PCB at the same time, but I'm not convinced that all
614 * the protocols can be easily modified to do this.
615 */
616 struct socket *
617 soalloc(int waitok, int dom, int type)
618 {
619 zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
620 struct socket *so;
621
622 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
623 cached_sock_alloc(&so, how);
624 } else {
625 so = zalloc_flags(socket_zone, how | Z_ZERO);
626 }
627 if (so != NULL) {
628 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
629
630 /*
631 * Increment the socket allocation statistics
632 */
633 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
634 }
635
636 return so;
637 }
638
639 int
640 socreate_internal(int dom, struct socket **aso, int type, int proto,
641 struct proc *p, uint32_t flags, struct proc *ep)
642 {
643 struct protosw *prp;
644 struct socket *so;
645 int error = 0;
646 #if defined(XNU_TARGET_OS_OSX)
647 pid_t rpid = -1;
648 #endif
649
650 #if TCPDEBUG
651 extern int tcpconsdebug;
652 #endif
653
654 VERIFY(aso != NULL);
655 *aso = NULL;
656
657 if (proto != 0) {
658 prp = pffindproto(dom, proto, type);
659 } else {
660 prp = pffindtype(dom, type);
661 }
662
663 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
664 if (pffinddomain(dom) == NULL) {
665 return EAFNOSUPPORT;
666 }
667 if (proto != 0) {
668 if (pffindprotonotype(dom, proto) != NULL) {
669 return EPROTOTYPE;
670 }
671 }
672 return EPROTONOSUPPORT;
673 }
674 if (prp->pr_type != type) {
675 return EPROTOTYPE;
676 }
677 so = soalloc(1, dom, type);
678 if (so == NULL) {
679 return ENOBUFS;
680 }
681
682 switch (dom) {
683 case PF_LOCAL:
684 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
685 break;
686 case PF_INET:
687 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
688 if (type == SOCK_STREAM) {
689 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
690 } else {
691 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
692 }
693 break;
694 case PF_ROUTE:
695 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
696 break;
697 case PF_NDRV:
698 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
699 break;
700 case PF_KEY:
701 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
702 break;
703 case PF_INET6:
704 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
705 if (type == SOCK_STREAM) {
706 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
707 } else {
708 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
709 }
710 break;
711 case PF_SYSTEM:
712 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
713 break;
714 case PF_MULTIPATH:
715 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
716 break;
717 default:
718 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
719 break;
720 }
721
722 if (flags & SOCF_MPTCP) {
723 so->so_state |= SS_NBIO;
724 }
725
726 TAILQ_INIT(&so->so_incomp);
727 TAILQ_INIT(&so->so_comp);
728 so->so_type = type;
729 so->last_upid = proc_uniqueid(p);
730 so->last_pid = proc_pid(p);
731 proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
732 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
733
734 if (ep != PROC_NULL && ep != p) {
735 so->e_upid = proc_uniqueid(ep);
736 so->e_pid = proc_pid(ep);
737 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
738 so->so_flags |= SOF_DELEGATED;
739 #if defined(XNU_TARGET_OS_OSX)
740 if (ep->p_responsible_pid != so->e_pid) {
741 rpid = ep->p_responsible_pid;
742 }
743 #endif
744 }
745
746 #if defined(XNU_TARGET_OS_OSX)
747 if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
748 rpid = p->p_responsible_pid;
749 }
750
751 so->so_rpid = -1;
752 uuid_clear(so->so_ruuid);
753 if (rpid >= 0) {
754 proc_t rp = proc_find(rpid);
755 if (rp != PROC_NULL) {
756 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
757 so->so_rpid = rpid;
758 proc_rele(rp);
759 }
760 }
761 #endif
762
763 so->so_cred = kauth_cred_proc_ref(p);
764 if (!suser(kauth_cred_get(), NULL)) {
765 so->so_state |= SS_PRIV;
766 }
767
768 so->so_proto = prp;
769 so->so_rcv.sb_flags |= SB_RECV;
770 so->so_rcv.sb_so = so->so_snd.sb_so = so;
771 so->next_lock_lr = 0;
772 so->next_unlock_lr = 0;
773
774 /*
775 * Attachment will create the per-pcb lock if necessary and
776 * increase the refcount for creation; make sure this is done
777 * before the socket is inserted in any lists.
778 */
779 so->so_usecount++;
780
781 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
782 if (error != 0) {
783 /*
784 * Warning:
785 * If so_pcb is not zero, the socket will be leaked,
786 * so the protocol attachment handler must be coded carefully
787 */
788 so->so_state |= SS_NOFDREF;
789 VERIFY(so->so_usecount > 0);
790 so->so_usecount--;
791 sofreelastref(so, 1); /* will deallocate the socket */
792 return error;
793 }
794
795 /*
796 * Note: needs so_pcb to be set after pru_attach
797 */
798 if (prp->pr_update_last_owner != NULL) {
799 (*prp->pr_update_last_owner)(so, p, ep);
800 }
801
802 atomic_add_32(&prp->pr_domain->dom_refs, 1);
803
804 /* Attach socket filters for this protocol */
805 sflt_initsock(so);
806 #if TCPDEBUG
807 if (tcpconsdebug == 2) {
808 so->so_options |= SO_DEBUG;
809 }
810 #endif
811 so_set_default_traffic_class(so);
812
813 /*
814 * If this thread or task is marked to create backgrounded sockets,
815 * mark the socket as background.
816 */
817 if (!(flags & SOCF_MPTCP) &&
818 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
819 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
820 so->so_background_thread = current_thread();
821 }
822
823 switch (dom) {
824 /*
825 * Don't mark Unix domain or system sockets
826 * as eligible for defunct by default.
827 */
828 case PF_LOCAL:
829 case PF_SYSTEM:
830 so->so_flags |= SOF_NODEFUNCT;
831 break;
832 default:
833 break;
834 }
835
836 /*
837 * Entitlements can't be checked at socket creation time except if the
838 * application requested a feature guarded by a privilege (cf. socket
839 * delegation).
840 * The priv(9) and the Sandboxing APIs are designed with the idea that
841 * a privilege check should only be triggered by a userland request.
842 * A privilege check at socket creation time is time consuming and
843 * could trigger many authorisation error messages from the security
844 * APIs.
845 */
846
847 *aso = so;
848
849 return 0;
850 }
851
852 /*
853 * Returns: 0 Success
854 * EAFNOSUPPORT
855 * EPROTOTYPE
856 * EPROTONOSUPPORT
857 * ENOBUFS
858 * <pru_attach>:ENOBUFS[AF_UNIX]
859 * <pru_attach>:ENOBUFS[TCP]
860 * <pru_attach>:ENOMEM[TCP]
861 * <pru_attach>:??? [other protocol families, IPSEC]
862 */
863 int
864 socreate(int dom, struct socket **aso, int type, int proto)
865 {
866 return socreate_internal(dom, aso, type, proto, current_proc(), 0,
867 PROC_NULL);
868 }
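/*
 * Illustrative mapping (editorial addition): a userland call such as
 *
 *	int fd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP);
 *
 * reaches socreate() through the socket(2) system call with dom, type and
 * proto passed through unchanged; the PF_INET + SOCK_STREAM combination is
 * also the one that takes the cached_sock_alloc() fast path in soalloc().
 */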
869
870 int
871 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
872 {
873 int error = 0;
874 struct proc *ep = PROC_NULL;
875
876 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
877 error = ESRCH;
878 goto done;
879 }
880
881 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
882
883 /*
884 * It might not be wise to hold the proc reference when calling
885 * socreate_internal since it calls soalloc with M_WAITOK
886 */
887 done:
888 if (ep != PROC_NULL) {
889 proc_rele(ep);
890 }
891
892 return error;
893 }
894
895 /*
896 * Returns: 0 Success
897 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
898 * <pru_bind>:EAFNOSUPPORT Address family not supported
899 * <pru_bind>:EADDRNOTAVAIL Address not available.
900 * <pru_bind>:EINVAL Invalid argument
901 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
902 * <pru_bind>:EACCES Permission denied
903 * <pru_bind>:EADDRINUSE Address in use
904 * <pru_bind>:EAGAIN Resource unavailable, try again
905 * <pru_bind>:EPERM Operation not permitted
906 * <pru_bind>:???
907 * <sf_bind>:???
908 *
909 * Notes: It's not possible to fully enumerate the return codes above,
910 * since socket filter authors and protocol family authors may
911 * not choose to limit their error returns to those listed, even
912 * though this may result in some software operating incorrectly.
913 *
914 * The error codes which are enumerated above are those known to
915 * be returned by the tcp_usr_bind function supplied.
916 */
917 int
918 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
919 {
920 struct proc *p = current_proc();
921 int error = 0;
922
923 if (dolock) {
924 socket_lock(so, 1);
925 }
926
927 so_update_last_owner_locked(so, p);
928 so_update_policy(so);
929
930 #if NECP
931 so_update_necp_policy(so, nam, NULL);
932 #endif /* NECP */
933
934 /*
935 * If this is a bind request on a socket that has been marked
936 * as inactive, reject it now before we go any further.
937 */
938 if (so->so_flags & SOF_DEFUNCT) {
939 error = EINVAL;
940 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
941 __func__, proc_pid(p), proc_best_name(p),
942 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
943 SOCK_DOM(so), SOCK_TYPE(so), error);
944 goto out;
945 }
946
947 /* Socket filter */
948 error = sflt_bind(so, nam);
949
950 if (error == 0) {
951 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
952 }
953 out:
954 if (dolock) {
955 socket_unlock(so, 1);
956 }
957
958 if (error == EJUSTRETURN) {
959 error = 0;
960 }
961
962 return error;
963 }
964
965 void
966 sodealloc(struct socket *so)
967 {
968 kauth_cred_unref(&so->so_cred);
969
970 /* Remove any filters */
971 sflt_termsock(so);
972
973 #if CONTENT_FILTER
974 cfil_sock_detach(so);
975 #endif /* CONTENT_FILTER */
976
977 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
978
979 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
980 cached_sock_free(so);
981 } else {
982 zfree(socket_zone, so);
983 }
984 }
985
986 /*
987 * Returns: 0 Success
988 * EINVAL
989 * EOPNOTSUPP
990 * <pru_listen>:EINVAL[AF_UNIX]
991 * <pru_listen>:EINVAL[TCP]
992 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
993 * <pru_listen>:EINVAL[TCP] Invalid argument
994 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
995 * <pru_listen>:EACCES[TCP] Permission denied
996 * <pru_listen>:EADDRINUSE[TCP] Address in use
997 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
998 * <pru_listen>:EPERM[TCP] Operation not permitted
999 * <sf_listen>:???
1000 *
1001 * Notes: Other <pru_listen> returns depend on the protocol family; all
1002 * <sf_listen> returns depend on what the filter author causes
1003 * their filter to return.
1004 */
1005 int
1006 solisten(struct socket *so, int backlog)
1007 {
1008 struct proc *p = current_proc();
1009 int error = 0;
1010
1011 socket_lock(so, 1);
1012
1013 so_update_last_owner_locked(so, p);
1014 so_update_policy(so);
1015
1016 #if NECP
1017 so_update_necp_policy(so, NULL, NULL);
1018 #endif /* NECP */
1019
1020 if (so->so_proto == NULL) {
1021 error = EINVAL;
1022 goto out;
1023 }
1024 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1025 error = EOPNOTSUPP;
1026 goto out;
1027 }
1028
1029 /*
1030 * If the listen request is made on a socket that is not fully
1031 * disconnected, or on a socket that has been marked as inactive,
1032 * reject the request now.
1033 */
1034 if ((so->so_state &
1035 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1036 (so->so_flags & SOF_DEFUNCT)) {
1037 error = EINVAL;
1038 if (so->so_flags & SOF_DEFUNCT) {
1039 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1040 "(%d)\n", __func__, proc_pid(p),
1041 proc_best_name(p),
1042 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1043 SOCK_DOM(so), SOCK_TYPE(so), error);
1044 }
1045 goto out;
1046 }
1047
1048 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1049 error = EPERM;
1050 goto out;
1051 }
1052
1053 error = sflt_listen(so);
1054 if (error == 0) {
1055 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1056 }
1057
1058 if (error) {
1059 if (error == EJUSTRETURN) {
1060 error = 0;
1061 }
1062 goto out;
1063 }
1064
1065 if (TAILQ_EMPTY(&so->so_comp)) {
1066 so->so_options |= SO_ACCEPTCONN;
1067 }
1068 /*
1069 * POSIX: The implementation may have an upper limit on the length of
1070 * the listen queue, either global or per accepting socket. If backlog
1071 * exceeds this limit, the length of the listen queue is set to the
1072 * limit.
1073 *
1074 * If listen() is called with a backlog argument value that is less
1075 * than 0, the function behaves as if it had been called with a backlog
1076 * argument value of 0.
1077 *
1078 * A backlog argument of 0 may allow the socket to accept connections,
1079 * in which case the length of the listen queue may be set to an
1080 * implementation-defined minimum value.
1081 */
1082 if (backlog <= 0 || backlog > somaxconn) {
1083 backlog = somaxconn;
1084 }
1085
1086 so->so_qlimit = backlog;
1087 out:
1088 socket_unlock(so, 1);
1089 return error;
1090 }
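/*
 * Worked example (editorial note): with the clamping above, listen(fd, 0),
 * listen(fd, -1) and any backlog greater than somaxconn all end up with
 * so_qlimit == somaxconn (sysctl kern.ipc.somaxconn, default SOMAXCONN);
 * only backlog values in the range 1..somaxconn are kept as given.
 */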
1091
1092 /*
1093 * The "accept list lock" protects the fields related to the listener queues
1094 * because we can unlock a socket to respect the lock ordering between
1095 * the listener socket and its clients sockets. The lock ordering is first to
1096 * acquire the client socket before the listener socket.
1097 *
1098 * The accept list lock serializes access to the following fields:
1099 * - of the listener socket:
1100 * - so_comp
1101 * - so_incomp
1102 * - so_qlen
1103 * - so_inqlen
1104 * - of client sockets that are in so_comp or so_incomp:
1105 * - so_head
1106 * - so_list
1107 *
1108 * As one can see, the accept list lock protects the consistency of the
1109 * linkage of the client sockets.
1110 *
1111 * Note that those fields may be read without holding the accept list lock
1112 * for a preflight provided the accept list lock is taken when committing
1113 * to take an action based on the result of the preflight. The preflight
1114 * saves the cost of doing the unlock/lock dance.
1115 */
1116 void
1117 so_acquire_accept_list(struct socket *head, struct socket *so)
1118 {
1119 lck_mtx_t *mutex_held;
1120
1121 if (head->so_proto->pr_getlock == NULL) {
1122 return;
1123 }
1124 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1125 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1126
1127 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1128 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1129 return;
1130 }
1131 if (so != NULL) {
1132 socket_unlock(so, 0);
1133 }
1134 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1135 so_accept_list_waits += 1;
1136 msleep((caddr_t)&head->so_incomp, mutex_held,
1137 PSOCK | PCATCH, __func__, NULL);
1138 }
1139 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1140 if (so != NULL) {
1141 socket_unlock(head, 0);
1142 socket_lock(so, 0);
1143 socket_lock(head, 0);
1144 }
1145 }
1146
1147 void
1148 so_release_accept_list(struct socket *head)
1149 {
1150 if (head->so_proto->pr_getlock != NULL) {
1151 lck_mtx_t *mutex_held;
1152
1153 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1154 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1155
1156 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1157 wakeup((caddr_t)&head->so_incomp);
1158 }
1159 }
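/*
 * Typical usage pattern (editorial sketch, mirroring sofreelastref() and
 * soclose_locked() below): with the listener locked,
 *
 *	socket_lock(head, 1);
 *	so_acquire_accept_list(head, so);   <- may drop and retake locks
 *	... walk or edit head->so_incomp / head->so_comp ...
 *	so_release_accept_list(head);
 *	socket_unlock(head, 1);
 *
 * Because so_acquire_accept_list() can temporarily unlock both sockets,
 * callers must revalidate any state cached across the call.
 */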
1160
1161 void
1162 sofreelastref(struct socket *so, int dealloc)
1163 {
1164 struct socket *head = so->so_head;
1165
1166 /* Assume socket is locked */
1167
1168 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1169 selthreadclear(&so->so_snd.sb_sel);
1170 selthreadclear(&so->so_rcv.sb_sel);
1171 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1172 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1173 so->so_event = sonullevent;
1174 return;
1175 }
1176 if (head != NULL) {
1177 /*
1178 * Need to lock the listener when the protocol has
1179 * per socket locks
1180 */
1181 if (head->so_proto->pr_getlock != NULL) {
1182 socket_lock(head, 1);
1183 so_acquire_accept_list(head, so);
1184 }
1185 if (so->so_state & SS_INCOMP) {
1186 so->so_state &= ~SS_INCOMP;
1187 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1188 head->so_incqlen--;
1189 head->so_qlen--;
1190 so->so_head = NULL;
1191
1192 if (head->so_proto->pr_getlock != NULL) {
1193 so_release_accept_list(head);
1194 socket_unlock(head, 1);
1195 }
1196 } else if (so->so_state & SS_COMP) {
1197 if (head->so_proto->pr_getlock != NULL) {
1198 so_release_accept_list(head);
1199 socket_unlock(head, 1);
1200 }
1201 /*
1202 * We must not decommission a socket that's
1203 * on the accept(2) queue. If we do, then
1204 * accept(2) may hang after select(2) indicated
1205 * that the listening socket was ready.
1206 */
1207 selthreadclear(&so->so_snd.sb_sel);
1208 selthreadclear(&so->so_rcv.sb_sel);
1209 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1210 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1211 so->so_event = sonullevent;
1212 return;
1213 } else {
1214 if (head->so_proto->pr_getlock != NULL) {
1215 so_release_accept_list(head);
1216 socket_unlock(head, 1);
1217 }
1218 printf("sofree: not queued\n");
1219 }
1220 }
1221 sowflush(so);
1222 sorflush(so);
1223
1224 #if FLOW_DIVERT
1225 if (so->so_flags & SOF_FLOW_DIVERT) {
1226 flow_divert_detach(so);
1227 }
1228 #endif /* FLOW_DIVERT */
1229
1230 /* 3932268: disable upcall */
1231 so->so_rcv.sb_flags &= ~SB_UPCALL;
1232 so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1233 so->so_event = sonullevent;
1234
1235 if (dealloc) {
1236 sodealloc(so);
1237 }
1238 }
1239
1240 void
1241 soclose_wait_locked(struct socket *so)
1242 {
1243 lck_mtx_t *mutex_held;
1244
1245 if (so->so_proto->pr_getlock != NULL) {
1246 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1247 } else {
1248 mutex_held = so->so_proto->pr_domain->dom_mtx;
1249 }
1250 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1251
1252 /*
1253 * Double check here and return if there's no outstanding upcall;
1254 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1255 */
1256 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1257 return;
1258 }
1259 so->so_rcv.sb_flags &= ~SB_UPCALL;
1260 so->so_snd.sb_flags &= ~SB_UPCALL;
1261 so->so_flags |= SOF_CLOSEWAIT;
1262
1263 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1264 "soclose_wait_locked", NULL);
1265 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1266 so->so_flags &= ~SOF_CLOSEWAIT;
1267 }
1268
1269 /*
1270 * Close a socket on last file table reference removal.
1271 * Initiate disconnect if connected.
1272 * Free socket when disconnect complete.
1273 */
1274 int
1275 soclose_locked(struct socket *so)
1276 {
1277 int error = 0;
1278 struct timespec ts;
1279
1280 if (so->so_usecount == 0) {
1281 panic("soclose: so=%p refcount=0\n", so);
1282 /* NOTREACHED */
1283 }
1284
1285 sflt_notify(so, sock_evt_closing, NULL);
1286
1287 if (so->so_upcallusecount) {
1288 soclose_wait_locked(so);
1289 }
1290
1291 #if CONTENT_FILTER
1292 /*
1293 * We have to wait until the content filters are done
1294 */
1295 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1296 cfil_sock_close_wait(so);
1297 cfil_sock_is_closed(so);
1298 cfil_sock_detach(so);
1299 }
1300 #endif /* CONTENT_FILTER */
1301
1302 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1303 soresume(current_proc(), so, 1);
1304 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1305 }
1306
1307 if ((so->so_options & SO_ACCEPTCONN)) {
1308 struct socket *sp, *sonext;
1309 int persocklock = 0;
1310 int incomp_overflow_only;
1311
1312 /*
1313 * We do not want new connections to be added
1314 * to the connection queues.
1315 */
1316 so->so_options &= ~SO_ACCEPTCONN;
1317
1318 /*
1319 * We can drop the lock on the listener once
1320 * we've acquired the incoming list
1321 */
1322 if (so->so_proto->pr_getlock != NULL) {
1323 persocklock = 1;
1324 so_acquire_accept_list(so, NULL);
1325 socket_unlock(so, 0);
1326 }
1327 again:
1328 incomp_overflow_only = 1;
1329
1330 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1331 /*
1332 * Radar 5350314
1333 * Skip sockets thrown away by tcpdropdropblreq;
1334 * they will get cleaned up by the garbage collection.
1335 * Otherwise, remove the incomp socket from the queue
1336 * and let soabort trigger the appropriate cleanup.
1337 */
1338 if (sp->so_flags & SOF_OVERFLOW) {
1339 continue;
1340 }
1341
1342 if (persocklock != 0) {
1343 socket_lock(sp, 1);
1344 }
1345
1346 /*
1347 * Radar 27945981
1348 * The extra reference for the list ensures the
1349 * validity of the socket pointer when we perform the
1350 * unlock of the head above.
1351 */
1352 if (sp->so_state & SS_INCOMP) {
1353 sp->so_state &= ~SS_INCOMP;
1354 sp->so_head = NULL;
1355 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1356 so->so_incqlen--;
1357 so->so_qlen--;
1358
1359 (void) soabort(sp);
1360 } else {
1361 panic("%s sp %p in so_incomp but !SS_INCOMP",
1362 __func__, sp);
1363 }
1364
1365 if (persocklock != 0) {
1366 socket_unlock(sp, 1);
1367 }
1368 }
1369
1370 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1371 /* Dequeue from so_comp since sofree() won't do it */
1372 if (persocklock != 0) {
1373 socket_lock(sp, 1);
1374 }
1375
1376 if (sp->so_state & SS_COMP) {
1377 sp->so_state &= ~SS_COMP;
1378 sp->so_head = NULL;
1379 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1380 so->so_qlen--;
1381
1382 (void) soabort(sp);
1383 } else {
1384 panic("%s sp %p in so_comp but !SS_COMP",
1385 __func__, sp);
1386 }
1387
1388 if (persocklock) {
1389 socket_unlock(sp, 1);
1390 }
1391 }
1392
1393 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1394 #if (DEBUG | DEVELOPMENT)
1395 panic("%s head %p so_comp not empty\n", __func__, so);
1396 #endif /* (DEVELOPMENT || DEBUG) */
1397
1398 goto again;
1399 }
1400
1401 if (!TAILQ_EMPTY(&so->so_comp)) {
1402 #if (DEBUG | DEVELOPMENT)
1403 panic("%s head %p so_comp not empty\n", __func__, so);
1404 #endif /* (DEVELOPMENT || DEBUG) */
1405
1406 goto again;
1407 }
1408
1409 if (persocklock) {
1410 socket_lock(so, 0);
1411 so_release_accept_list(so);
1412 }
1413 }
1414 if (so->so_pcb == NULL) {
1415 /* 3915887: mark the socket as ready for dealloc */
1416 so->so_flags |= SOF_PCBCLEARING;
1417 goto discard;
1418 }
1419 if (so->so_state & SS_ISCONNECTED) {
1420 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1421 error = sodisconnectlocked(so);
1422 if (error) {
1423 goto drop;
1424 }
1425 }
1426 if (so->so_options & SO_LINGER) {
1427 lck_mtx_t *mutex_held;
1428
1429 if ((so->so_state & SS_ISDISCONNECTING) &&
1430 (so->so_state & SS_NBIO)) {
1431 goto drop;
1432 }
1433 if (so->so_proto->pr_getlock != NULL) {
1434 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1435 } else {
1436 mutex_held = so->so_proto->pr_domain->dom_mtx;
1437 }
1438 while (so->so_state & SS_ISCONNECTED) {
1439 ts.tv_sec = (so->so_linger / 100);
1440 ts.tv_nsec = (so->so_linger % 100) *
1441 NSEC_PER_USEC * 1000 * 10;
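/*
 * Editorial note: the conversion above treats so_linger as a count of
 * 1/100-second ticks: NSEC_PER_USEC * 1000 * 10 == 10^7 ns == 10 ms per
 * tick, so e.g. so_linger == 250 yields a 2.5 second linger timeout.
 */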
1442 error = msleep((caddr_t)&so->so_timeo,
1443 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1444 if (error) {
1445 /*
1446 * It's OK when the timer fires;
1447 * don't report an error.
1448 */
1449 if (error == EWOULDBLOCK) {
1450 error = 0;
1451 }
1452 break;
1453 }
1454 }
1455 }
1456 }
1457 drop:
1458 if (so->so_usecount == 0) {
1459 panic("soclose: usecount is zero so=%p\n", so);
1460 /* NOTREACHED */
1461 }
1462 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1463 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1464 if (error == 0) {
1465 error = error2;
1466 }
1467 }
1468 if (so->so_usecount <= 0) {
1469 panic("soclose: usecount is zero so=%p\n", so);
1470 /* NOTREACHED */
1471 }
1472 discard:
1473 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1474 (so->so_state & SS_NOFDREF)) {
1475 panic("soclose: NOFDREF");
1476 /* NOTREACHED */
1477 }
1478 so->so_state |= SS_NOFDREF;
1479
1480 if ((so->so_flags & SOF_KNOTE) != 0) {
1481 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1482 }
1483
1484 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1485
1486 VERIFY(so->so_usecount > 0);
1487 so->so_usecount--;
1488 sofree(so);
1489 return error;
1490 }
1491
1492 int
1493 soclose(struct socket *so)
1494 {
1495 int error = 0;
1496 socket_lock(so, 1);
1497
1498 if (so->so_retaincnt == 0) {
1499 error = soclose_locked(so);
1500 } else {
1501 /*
1502 * if the FD is going away, but the socket is
1503 * retained in the kernel, remove its reference
1504 */
1505 so->so_usecount--;
1506 if (so->so_usecount < 2) {
1507 panic("soclose: retaincnt non null and so=%p "
1508 "usecount=%d\n", so, so->so_usecount);
1509 }
1510 }
1511 socket_unlock(so, 1);
1512 return error;
1513 }
1514
1515 /*
1516 * Must be called at splnet...
1517 */
1518 /* Should already be locked */
1519 int
1520 soabort(struct socket *so)
1521 {
1522 int error;
1523
1524 #ifdef MORE_LOCKING_DEBUG
1525 lck_mtx_t *mutex_held;
1526
1527 if (so->so_proto->pr_getlock != NULL) {
1528 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1529 } else {
1530 mutex_held = so->so_proto->pr_domain->dom_mtx;
1531 }
1532 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1533 #endif
1534
1535 if ((so->so_flags & SOF_ABORTED) == 0) {
1536 so->so_flags |= SOF_ABORTED;
1537 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1538 if (error) {
1539 sofree(so);
1540 return error;
1541 }
1542 }
1543 return 0;
1544 }
1545
1546 int
1547 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1548 {
1549 int error;
1550
1551 if (dolock) {
1552 socket_lock(so, 1);
1553 }
1554
1555 so_update_last_owner_locked(so, PROC_NULL);
1556 so_update_policy(so);
1557 #if NECP
1558 so_update_necp_policy(so, NULL, NULL);
1559 #endif /* NECP */
1560
1561 if ((so->so_state & SS_NOFDREF) == 0) {
1562 panic("soaccept: !NOFDREF");
1563 }
1564 so->so_state &= ~SS_NOFDREF;
1565 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1566
1567 if (dolock) {
1568 socket_unlock(so, 1);
1569 }
1570 return error;
1571 }
1572
1573 int
1574 soaccept(struct socket *so, struct sockaddr **nam)
1575 {
1576 return soacceptlock(so, nam, 1);
1577 }
1578
1579 int
1580 soacceptfilter(struct socket *so, struct socket *head)
1581 {
1582 struct sockaddr *local = NULL, *remote = NULL;
1583 int error = 0;
1584
1585 /*
1586 * Hold the lock even if this socket has not been made visible
1587 * to the filter(s). For sockets with global locks, this protects
1588 * against the head or peer going away
1589 */
1590 socket_lock(so, 1);
1591 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1592 sogetaddr_locked(so, &local, 0) != 0) {
1593 so->so_state &= ~SS_NOFDREF;
1594 socket_unlock(so, 1);
1595 soclose(so);
1596 /* Out of resources; try it again next time */
1597 error = ECONNABORTED;
1598 goto done;
1599 }
1600
1601 error = sflt_accept(head, so, local, remote);
1602
1603 /*
1604 * If we get EJUSTRETURN from one of the filters, mark this socket
1605 * as inactive and return it anyway. This newly accepted socket
1606 * will be disconnected later before we hand it off to the caller.
1607 */
1608 if (error == EJUSTRETURN) {
1609 error = 0;
1610 (void) sosetdefunct(current_proc(), so,
1611 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1612 }
1613
1614 if (error != 0) {
1615 /*
1616 * This may seem like a duplication to the above error
1617 * handling part when we return ECONNABORTED, except
1618 * the following is done while holding the lock since
1619 * the socket has been exposed to the filter(s) earlier.
1620 */
1621 so->so_state &= ~SS_NOFDREF;
1622 socket_unlock(so, 1);
1623 soclose(so);
1624 /* Propagate socket filter's error code to the caller */
1625 } else {
1626 socket_unlock(so, 1);
1627 }
1628 done:
1629 /* Callee checks for NULL pointer */
1630 sock_freeaddr(remote);
1631 sock_freeaddr(local);
1632 return error;
1633 }
1634
1635 /*
1636 * Returns: 0 Success
1637 * EOPNOTSUPP Operation not supported on socket
1638 * EISCONN Socket is connected
1639 * <pru_connect>:EADDRNOTAVAIL Address not available.
1640 * <pru_connect>:EINVAL Invalid argument
1641 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1642 * <pru_connect>:EACCES Permission denied
1643 * <pru_connect>:EADDRINUSE Address in use
1644 * <pru_connect>:EAGAIN Resource unavailable, try again
1645 * <pru_connect>:EPERM Operation not permitted
1646 * <sf_connect_out>:??? [anything a filter writer might set]
1647 */
1648 int
1649 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1650 {
1651 int error;
1652 struct proc *p = current_proc();
1653
1654 if (dolock) {
1655 socket_lock(so, 1);
1656 }
1657
1658 so_update_last_owner_locked(so, p);
1659 so_update_policy(so);
1660
1661 #if NECP
1662 so_update_necp_policy(so, NULL, nam);
1663 #endif /* NECP */
1664
1665 /*
1666 * If this is a listening socket or if this is a previously-accepted
1667 * socket that has been marked as inactive, reject the connect request.
1668 */
1669 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1670 error = EOPNOTSUPP;
1671 if (so->so_flags & SOF_DEFUNCT) {
1672 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1673 "(%d)\n", __func__, proc_pid(p),
1674 proc_best_name(p),
1675 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1676 SOCK_DOM(so), SOCK_TYPE(so), error);
1677 }
1678 if (dolock) {
1679 socket_unlock(so, 1);
1680 }
1681 return error;
1682 }
1683
1684 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1685 if (dolock) {
1686 socket_unlock(so, 1);
1687 }
1688 return EPERM;
1689 }
1690
1691 /*
1692 * If protocol is connection-based, can only connect once.
1693 * Otherwise, if connected, try to disconnect first.
1694 * This allows user to disconnect by connecting to, e.g.,
1695 * a null address.
1696 */
1697 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1698 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1699 (error = sodisconnectlocked(so)))) {
1700 error = EISCONN;
1701 } else {
1702 /*
1703 * Run connect filter before calling protocol:
1704 * - non-blocking connect returns before completion;
1705 */
1706 error = sflt_connectout(so, nam);
1707 if (error != 0) {
1708 if (error == EJUSTRETURN) {
1709 error = 0;
1710 }
1711 } else {
1712 error = (*so->so_proto->pr_usrreqs->pru_connect)
1713 (so, nam, p);
1714 if (error != 0) {
1715 so->so_state &= ~SS_ISCONNECTING;
1716 }
1717 }
1718 }
1719 if (dolock) {
1720 socket_unlock(so, 1);
1721 }
1722 return error;
1723 }
1724
1725 int
1726 soconnect(struct socket *so, struct sockaddr *nam)
1727 {
1728 return soconnectlock(so, nam, 1);
1729 }
1730
1731 /*
1732 * Returns: 0 Success
1733 * <pru_connect2>:EINVAL[AF_UNIX]
1734 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1735 * <pru_connect2>:??? [other protocol families]
1736 *
1737 * Notes: <pru_connect2> is not supported by [TCP].
1738 */
1739 int
1740 soconnect2(struct socket *so1, struct socket *so2)
1741 {
1742 int error;
1743
1744 socket_lock(so1, 1);
1745 if (so2->so_proto->pr_lock) {
1746 socket_lock(so2, 1);
1747 }
1748
1749 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1750
1751 socket_unlock(so1, 1);
1752 if (so2->so_proto->pr_lock) {
1753 socket_unlock(so2, 1);
1754 }
1755 return error;
1756 }
1757
1758 int
1759 soconnectxlocked(struct socket *so, struct sockaddr *src,
1760 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1761 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1762 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1763 {
1764 int error;
1765
1766 so_update_last_owner_locked(so, p);
1767 so_update_policy(so);
1768
1769 /*
1770 * If this is a listening socket or if this is a previously-accepted
1771 * socket that has been marked as inactive, reject the connect request.
1772 */
1773 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1774 error = EOPNOTSUPP;
1775 if (so->so_flags & SOF_DEFUNCT) {
1776 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1777 "(%d)\n", __func__, proc_pid(p),
1778 proc_best_name(p),
1779 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1780 SOCK_DOM(so), SOCK_TYPE(so), error);
1781 }
1782 return error;
1783 }
1784
1785 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1786 return EPERM;
1787 }
1788
1789 /*
1790 * If protocol is connection-based, can only connect once
1791 * unless PR_MULTICONN is set. Otherwise, if connected,
1792 * try to disconnect first. This allows user to disconnect
1793 * by connecting to, e.g., a null address.
1794 */
1795 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1796 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1797 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1798 (error = sodisconnectlocked(so)) != 0)) {
1799 error = EISCONN;
1800 } else {
1801 if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1802 (flags & CONNECT_DATA_IDEMPOTENT)) {
1803 so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1804
1805 if (flags & CONNECT_DATA_AUTHENTICATED) {
1806 so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1807 }
1808 }
1809
1810 /*
1811 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1812 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1813 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1814 * Case 3 allows the user to combine write with connect even if they have
1815 * no use for TFO (such as regular TCP and UDP).
1816 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1817 */
1818 if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1819 ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1820 so->so_flags1 |= SOF1_PRECONNECT_DATA;
1821 }
1822
1823 /*
1824 * If a user sets data idempotent and does not pass an uio, or
1825 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error; reset
1826 * SOF1_DATA_IDEMPOTENT.
1827 */
1828 if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1829 (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1830 /* We should return EINVAL instead perhaps. */
1831 so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1832 }
1833
1834 /*
1835 * Run connect filter before calling protocol:
1836 * - non-blocking connect returns before completion;
1837 */
1838 error = sflt_connectout(so, dst);
1839 if (error != 0) {
1840 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1841 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1842 if (error == EJUSTRETURN) {
1843 error = 0;
1844 }
1845 } else {
1846 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1847 (so, src, dst, p, ifscope, aid, pcid,
1848 flags, arg, arglen, auio, bytes_written);
1849 if (error != 0) {
1850 so->so_state &= ~SS_ISCONNECTING;
1851 if (error != EINPROGRESS) {
1852 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1853 }
1854 }
1855 }
1856 }
1857
1858 return error;
1859 }
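
/*
 * Illustrative sketch (not part of the original source): driving
 * soconnectxlocked() for a TCP Fast Open style connect.  The caller is
 * assumed to already hold the socket lock; "dst", "p" and the uio
 * carrying the idempotent preconnect data are placeholders.
 *
 *	sae_connid_t cid = SAE_CONNID_ANY;
 *	user_ssize_t written = 0;
 *
 *	error = soconnectxlocked(so, NULL, dst, p, IFSCOPE_NONE,
 *	    SAE_ASSOCID_ANY, &cid, CONNECT_DATA_IDEMPOTENT,
 *	    NULL, 0, auio, &written);
 *
 * For a protocol advertising PR_DATA_IDEMPOTENT and PR_PRECONN_WRITE
 * (case 3 above), SOF1_DATA_IDEMPOTENT and SOF1_PRECONNECT_DATA stay
 * set and the preconnect data may ride on the initial SYN.
 */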
1860
1861 int
1862 sodisconnectlocked(struct socket *so)
1863 {
1864 int error;
1865
1866 if ((so->so_state & SS_ISCONNECTED) == 0) {
1867 error = ENOTCONN;
1868 goto bad;
1869 }
1870 if (so->so_state & SS_ISDISCONNECTING) {
1871 error = EALREADY;
1872 goto bad;
1873 }
1874
1875 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1876 if (error == 0) {
1877 sflt_notify(so, sock_evt_disconnected, NULL);
1878 }
1879
1880 bad:
1881 return error;
1882 }
1883
1884 /* Locking version */
1885 int
1886 sodisconnect(struct socket *so)
1887 {
1888 int error;
1889
1890 socket_lock(so, 1);
1891 error = sodisconnectlocked(so);
1892 socket_unlock(so, 1);
1893 return error;
1894 }
1895
1896 int
1897 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1898 {
1899 int error;
1900
1901 /*
1902 * Call the protocol disconnectx handler; let it handle all
1903 * matters related to the connection state of this session.
1904 */
1905 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1906 if (error == 0) {
1907 /*
1908 * The event applies only to the session, not to
1909 * the disconnection of individual subflows.
1910 */
1911 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1912 sflt_notify(so, sock_evt_disconnected, NULL);
1913 }
1914 }
1915 return error;
1916 }
1917
1918 int
1919 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1920 {
1921 int error;
1922
1923 socket_lock(so, 1);
1924 error = sodisconnectxlocked(so, aid, cid);
1925 socket_unlock(so, 1);
1926 return error;
1927 }
1928
1929 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1930
1931 /*
1932 * sosendcheck will lock the socket buffer if it isn't locked and
1933 * verify that there is space for the data being inserted.
1934 *
1935 * Returns: 0 Success
1936 * EPIPE
1937 * sblock:EWOULDBLOCK
1938 * sblock:EINTR
1939 * sbwait:EBADF
1940 * sbwait:EINTR
1941 * [so_error]:???
1942 */
1943 int
1944 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1945 int32_t clen, int32_t atomic, int flags, int *sblocked)
1946 {
1947 int error = 0;
1948 int32_t space;
1949 int assumelock = 0;
1950
1951 restart:
1952 if (*sblocked == 0) {
1953 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1954 so->so_send_filt_thread != 0 &&
1955 so->so_send_filt_thread == current_thread()) {
1956 /*
1957 * We're being called recursively from a filter;
1958 * allow this to continue. Radar 4150520.
1959 * Don't set sblocked because we don't want
1960 * to perform an unlock later.
1961 */
1962 assumelock = 1;
1963 } else {
1964 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1965 if (error) {
1966 if (so->so_flags & SOF_DEFUNCT) {
1967 goto defunct;
1968 }
1969 return error;
1970 }
1971 *sblocked = 1;
1972 }
1973 }
1974
1975 /*
1976 * If a send attempt is made on a socket that has been marked
1977 * as inactive (disconnected), reject the request.
1978 */
1979 if (so->so_flags & SOF_DEFUNCT) {
1980 defunct:
1981 error = EPIPE;
1982 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1983 __func__, proc_selfpid(), proc_best_name(current_proc()),
1984 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1985 SOCK_DOM(so), SOCK_TYPE(so), error);
1986 return error;
1987 }
1988
1989 if (so->so_state & SS_CANTSENDMORE) {
1990 #if CONTENT_FILTER
1991 /*
1992 * Can re-inject data of half closed connections
1993 */
1994 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1995 so->so_snd.sb_cfil_thread == current_thread() &&
1996 cfil_sock_data_pending(&so->so_snd) != 0) {
1997 CFIL_LOG(LOG_INFO,
1998 "so %llx ignore SS_CANTSENDMORE",
1999 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2000 } else
2001 #endif /* CONTENT_FILTER */
2002 return EPIPE;
2003 }
2004 if (so->so_error) {
2005 error = so->so_error;
2006 so->so_error = 0;
2007 return error;
2008 }
2009
2010 if ((so->so_state & SS_ISCONNECTED) == 0) {
2011 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2012 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2013 (resid != 0 || clen == 0) &&
2014 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2015 return ENOTCONN;
2016 }
2017 } else if (addr == 0) {
2018 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2019 ENOTCONN : EDESTADDRREQ;
2020 }
2021 }
2022
2023 space = sbspace(&so->so_snd);
2024
2025 if (flags & MSG_OOB) {
2026 space += 1024;
2027 }
2028 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2029 clen > so->so_snd.sb_hiwat) {
2030 return EMSGSIZE;
2031 }
2032
2033 if ((space < resid + clen &&
2034 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2035 space < clen)) ||
2036 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2037 /*
2038 * don't block the connectx call when there's more data
2039 * than can be copied.
2040 */
2041 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2042 if (space == 0) {
2043 return EWOULDBLOCK;
2044 }
2045 if (space < (int32_t)so->so_snd.sb_lowat) {
2046 return 0;
2047 }
2048 }
2049 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2050 assumelock) {
2051 return EWOULDBLOCK;
2052 }
2053 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
2054 *sblocked = 0;
2055 error = sbwait(&so->so_snd);
2056 if (error) {
2057 if (so->so_flags & SOF_DEFUNCT) {
2058 goto defunct;
2059 }
2060 return error;
2061 }
2062 goto restart;
2063 }
2064 return 0;
2065 }
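
/*
 * Illustrative sketch (not part of the original source) of the
 * *sblocked contract between sosendcheck() and its caller: the caller
 * passes a flag initialized to 0, sosendcheck() sets it to 1 once it
 * has taken SB_LOCK on so_snd, and the caller must then release that
 * lock on exit, exactly as sosend() does at its out_locked label.
 *
 *	int sblocked = 0;
 *
 *	error = sosendcheck(so, addr, resid, clen, atomic, flags, &sblocked);
 *	...
 *	if (sblocked)
 *		sbunlock(&so->so_snd, FALSE);	(this also unlocks the socket)
 *	else
 *		socket_unlock(so, 1);
 */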
2066
2067 /*
2068 * Send on a socket.
2069 * If send must go all at once and message is larger than
2070 * send buffering, then hard error.
2071 * Lock against other senders.
2072 * If must go all at once and not enough room now, then
2073 * inform user that this would block and do nothing.
2074 * Otherwise, if nonblocking, send as much as possible.
2075 * The data to be sent is described by "uio" if nonzero,
2076 * otherwise by the mbuf chain "top" (which must be null
2077 * if uio is not). Data provided in mbuf chain must be small
2078 * enough to send all at once.
2079 *
2080 * Returns nonzero on error, timeout or signal; callers
2081 * must check for short counts if EINTR/ERESTART are returned.
2082 * Data and control buffers are freed on return.
2083 *
2084 * Returns: 0 Success
2085 * EOPNOTSUPP
2086 * EINVAL
2087 * ENOBUFS
2088 * uiomove:EFAULT
2089 * sosendcheck:EPIPE
2090 * sosendcheck:EWOULDBLOCK
2091 * sosendcheck:EINTR
2092 * sosendcheck:EBADF
2093 * sosendcheck:EINTR
2094 * sosendcheck:??? [value from so_error]
2095 * <pru_send>:ECONNRESET[TCP]
2096 * <pru_send>:EINVAL[TCP]
2097 * <pru_send>:ENOBUFS[TCP]
2098 * <pru_send>:EADDRINUSE[TCP]
2099 * <pru_send>:EADDRNOTAVAIL[TCP]
2100 * <pru_send>:EAFNOSUPPORT[TCP]
2101 * <pru_send>:EACCES[TCP]
2102 * <pru_send>:EAGAIN[TCP]
2103 * <pru_send>:EPERM[TCP]
2104 * <pru_send>:EMSGSIZE[TCP]
2105 * <pru_send>:EHOSTUNREACH[TCP]
2106 * <pru_send>:ENETUNREACH[TCP]
2107 * <pru_send>:ENETDOWN[TCP]
2108 * <pru_send>:ENOMEM[TCP]
2109 * <pru_send>:ENOBUFS[TCP]
2110 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2111 * <pru_send>:EINVAL[AF_UNIX]
2112 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2113 * <pru_send>:EPIPE[AF_UNIX]
2114 * <pru_send>:ENOTCONN[AF_UNIX]
2115 * <pru_send>:EISCONN[AF_UNIX]
2116 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2117 * <sf_data_out>:??? [whatever a filter author chooses]
2118 *
2119 * Notes: Other <pru_send> returns depend on the protocol family; all
2120 * <sf_data_out> returns depend on what the filter author causes
2121 * their filter to return.
2122 */
2123 int
2124 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2125 struct mbuf *top, struct mbuf *control, int flags)
2126 {
2127 struct mbuf **mp;
2128 struct mbuf *m, *freelist = NULL;
2129 user_ssize_t space, len, resid, orig_resid;
2130 int clen = 0, error, dontroute, mlen, sendflags;
2131 int atomic = sosendallatonce(so) || top;
2132 int sblocked = 0;
2133 struct proc *p = current_proc();
2134 uint16_t headroom = 0;
2135 boolean_t en_tracing = FALSE;
2136
2137 if (uio != NULL) {
2138 resid = uio_resid(uio);
2139 } else {
2140 resid = top->m_pkthdr.len;
2141 }
2142
2143 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2144 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2145
2146 socket_lock(so, 1);
2147
2148 /*
2149 * Trace only network (vs. unix) sockets that go over
2150 * a non-loopback interface
2151 */
2152 if (ENTR_SHOULDTRACE &&
2153 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2154 struct inpcb *inp = sotoinpcb(so);
2155 if (inp->inp_last_outifp != NULL &&
2156 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2157 en_tracing = TRUE;
2158 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2159 VM_KERNEL_ADDRPERM(so),
2160 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2161 (int64_t)resid);
2162 orig_resid = resid;
2163 }
2164 }
2165
2166 /*
2167 * Re-injection should not affect process accounting
2168 */
2169 if ((flags & MSG_SKIPCFIL) == 0) {
2170 so_update_last_owner_locked(so, p);
2171 so_update_policy(so);
2172
2173 #if NECP
2174 so_update_necp_policy(so, NULL, addr);
2175 #endif /* NECP */
2176 }
2177
2178 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2179 error = EOPNOTSUPP;
2180 goto out_locked;
2181 }
2182
2183 /*
2184 * In theory resid should be unsigned.
2185 * However, space must be signed, as it might be less than 0
2186 * if we over-committed, and we must use a signed comparison
2187 * of space and resid. On the other hand, a negative resid
2188 * causes us to loop sending 0-length segments to the protocol.
2189 *
2190 * MSG_EOR is not supported on SOCK_STREAM type sockets and is rejected below.
2191 *
2192 * Note: We limit resid to be a positive int value as we use
2193 * imin() to set bytes_to_copy -- radr://14558484
2194 */
2195 if (resid < 0 || resid > INT_MAX ||
2196 (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2197 error = EINVAL;
2198 goto out_locked;
2199 }
2200
2201 dontroute = (flags & MSG_DONTROUTE) &&
2202 (so->so_options & SO_DONTROUTE) == 0 &&
2203 (so->so_proto->pr_flags & PR_ATOMIC);
2204 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2205
2206 if (control != NULL) {
2207 clen = control->m_len;
2208 }
2209
2210 if (soreserveheadroom != 0) {
2211 headroom = so->so_pktheadroom;
2212 }
2213
2214 do {
2215 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2216 &sblocked);
2217 if (error) {
2218 goto out_locked;
2219 }
2220
2221 mp = &top;
2222 space = sbspace(&so->so_snd) - clen;
2223 space += ((flags & MSG_OOB) ? 1024 : 0);
2224
2225 do {
2226 if (uio == NULL) {
2227 /*
2228 * Data is prepackaged in "top".
2229 */
2230 resid = 0;
2231 if (flags & MSG_EOR) {
2232 top->m_flags |= M_EOR;
2233 }
2234 } else {
2235 int chainlength;
2236 int bytes_to_copy;
2237 boolean_t jumbocl;
2238 boolean_t bigcl;
2239 int bytes_to_alloc;
2240
2241 bytes_to_copy = imin(resid, space);
2242
2243 bytes_to_alloc = bytes_to_copy;
2244 if (top == NULL) {
2245 bytes_to_alloc += headroom;
2246 }
2247
2248 if (sosendminchain > 0) {
2249 chainlength = 0;
2250 } else {
2251 chainlength = sosendmaxchain;
2252 }
2253
2254 /*
2255 * Use big 4 KB clusters when the outgoing interface
2256 * does not prefer 2 KB clusters
2257 */
2258 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2259 sosendbigcl_ignore_capab;
2260
2261 /*
2262 * Attempt to use larger than system page-size
2263 * clusters for large writes only if there is
2264 * a jumbo cluster pool and if the socket is
2265 * marked accordingly.
2266 */
2267 jumbocl = sosendjcl && njcl > 0 &&
2268 ((so->so_flags & SOF_MULTIPAGES) ||
2269 sosendjcl_ignore_capab) &&
2270 bigcl;
2271
2272 socket_unlock(so, 0);
2273
2274 do {
2275 int num_needed;
2276 int hdrs_needed = (top == NULL) ? 1 : 0;
2277
2278 /*
2279 * Try to maintain a local cache of the
2280 * mbuf clusters needed to complete this
2281 * write.  The list is further limited to
2282 * the number that are currently needed
2283 * to fill the socket.  This mechanism
2284 * allows a large number of mbufs/clusters
2285 * to be grabbed under a single mbuf
2286 * lock.  If we can't get any clusters,
2287 * then fall back to trying for plain
2288 * mbufs.  If we fail early (or
2289 * miscalculate the number needed), make
2290 * sure to release any clusters we
2291 * haven't yet consumed.
2292 */
2293 if (freelist == NULL &&
2294 bytes_to_alloc > MBIGCLBYTES &&
2295 jumbocl) {
2296 num_needed =
2297 bytes_to_alloc / M16KCLBYTES;
2298
2299 if ((bytes_to_alloc -
2300 (num_needed * M16KCLBYTES))
2301 >= MINCLSIZE) {
2302 num_needed++;
2303 }
2304
2305 freelist =
2306 m_getpackets_internal(
2307 (unsigned int *)&num_needed,
2308 hdrs_needed, M_WAIT, 0,
2309 M16KCLBYTES);
2310 /*
2311 * Fall back to 4K cluster size
2312 * if allocation failed
2313 */
2314 }
2315
2316 if (freelist == NULL &&
2317 bytes_to_alloc > MCLBYTES &&
2318 bigcl) {
2319 num_needed =
2320 bytes_to_alloc / MBIGCLBYTES;
2321
2322 if ((bytes_to_alloc -
2323 (num_needed * MBIGCLBYTES)) >=
2324 MINCLSIZE) {
2325 num_needed++;
2326 }
2327
2328 freelist =
2329 m_getpackets_internal(
2330 (unsigned int *)&num_needed,
2331 hdrs_needed, M_WAIT, 0,
2332 MBIGCLBYTES);
2333 /*
2334 * Fall back to cluster size
2335 * if allocation failed
2336 */
2337 }
2338
2339 /*
2340 * Allocate a cluster, as we want to avoid
2341 * splitting the data into more than one
2342 * segment; using MINCLSIZE would lead us
2343 * to allocate two mbufs
2344 */
2345 if (soreserveheadroom != 0 &&
2346 freelist == NULL &&
2347 ((top == NULL &&
2348 bytes_to_alloc > _MHLEN) ||
2349 bytes_to_alloc > _MLEN)) {
2350 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2351 MCLBYTES;
2352 freelist =
2353 m_getpackets_internal(
2354 (unsigned int *)&num_needed,
2355 hdrs_needed, M_WAIT, 0,
2356 MCLBYTES);
2357 /*
2358 * Fall back to a single mbuf
2359 * if allocation failed
2360 */
2361 } else if (freelist == NULL &&
2362 bytes_to_alloc > MINCLSIZE) {
2363 num_needed =
2364 bytes_to_alloc / MCLBYTES;
2365
2366 if ((bytes_to_alloc -
2367 (num_needed * MCLBYTES)) >=
2368 MINCLSIZE) {
2369 num_needed++;
2370 }
2371
2372 freelist =
2373 m_getpackets_internal(
2374 (unsigned int *)&num_needed,
2375 hdrs_needed, M_WAIT, 0,
2376 MCLBYTES);
2377 /*
2378 * Fall back to a single mbuf
2379 * if allocation failed
2380 */
2381 }
2382 /*
2383 * For datagram protocols, leave
2384 * headroom for protocol headers
2385 * in the first cluster of the chain
2386 */
2387 if (freelist != NULL && atomic &&
2388 top == NULL && headroom > 0) {
2389 freelist->m_data += headroom;
2390 }
2391
2392 /*
2393 * Fall back to regular mbufs without
2394 * reserving the socket headroom
2395 */
2396 if (freelist == NULL) {
2397 if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2398 if (top == NULL) {
2399 MGETHDR(freelist,
2400 M_WAIT, MT_DATA);
2401 } else {
2402 MGET(freelist,
2403 M_WAIT, MT_DATA);
2404 }
2405 }
2406
2407 if (freelist == NULL) {
2408 error = ENOBUFS;
2409 socket_lock(so, 0);
2410 goto out_locked;
2411 }
2412 /*
2413 * For datagram protocols,
2414 * leave room for protocol
2415 * headers in first mbuf.
2416 */
2417 if (atomic && top == NULL &&
2418 bytes_to_copy < MHLEN) {
2419 MH_ALIGN(freelist,
2420 bytes_to_copy);
2421 }
2422 }
2423 m = freelist;
2424 freelist = m->m_next;
2425 m->m_next = NULL;
2426
2427 if ((m->m_flags & M_EXT)) {
2428 mlen = m->m_ext.ext_size -
2429 M_LEADINGSPACE(m);
2430 } else if ((m->m_flags & M_PKTHDR)) {
2431 mlen =
2432 MHLEN - M_LEADINGSPACE(m);
2433 } else {
2434 mlen = MLEN - M_LEADINGSPACE(m);
2435 }
2436 len = imin(mlen, bytes_to_copy);
2437
2438 chainlength += len;
2439
2440 space -= len;
2441
2442 error = uiomove(mtod(m, caddr_t),
2443 len, uio);
2444
2445 resid = uio_resid(uio);
2446
2447 m->m_len = len;
2448 *mp = m;
2449 top->m_pkthdr.len += len;
2450 if (error) {
2451 break;
2452 }
2453 mp = &m->m_next;
2454 if (resid <= 0) {
2455 if (flags & MSG_EOR) {
2456 top->m_flags |= M_EOR;
2457 }
2458 break;
2459 }
2460 bytes_to_copy = min(resid, space);
2461 } while (space > 0 &&
2462 (chainlength < sosendmaxchain || atomic ||
2463 resid < MINCLSIZE));
2464
2465 socket_lock(so, 0);
2466
2467 if (error) {
2468 goto out_locked;
2469 }
2470 }
2471
2472 if (dontroute) {
2473 so->so_options |= SO_DONTROUTE;
2474 }
2475
2476 /*
2477 * Compute flags here, for pru_send and NKEs
2478 *
2479 * If the user set MSG_EOF, the protocol
2480 * understands this flag, and there is nothing left to
2481 * send, then use PRU_SEND_EOF instead of PRU_SEND.
2482 */
2483 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2484 ((flags & MSG_EOF) &&
2485 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2486 (resid <= 0)) ? PRUS_EOF :
2487 /* If there is more to send set PRUS_MORETOCOME */
2488 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2489
2490 if ((flags & MSG_SKIPCFIL) == 0) {
2491 /*
2492 * Socket filter processing
2493 */
2494 error = sflt_data_out(so, addr, &top,
2495 &control, (sendflags & MSG_OOB) ?
2496 sock_data_filt_flag_oob : 0);
2497 if (error) {
2498 if (error == EJUSTRETURN) {
2499 error = 0;
2500 goto packet_consumed;
2501 }
2502 goto out_locked;
2503 }
2504 #if CONTENT_FILTER
2505 /*
2506 * Content filter processing
2507 */
2508 error = cfil_sock_data_out(so, addr, top,
2509 control, sendflags);
2510 if (error) {
2511 if (error == EJUSTRETURN) {
2512 error = 0;
2513 clen = 0;
2514 control = NULL;
2515 top = NULL;
2516 }
2517 goto out_locked;
2518 }
2519 #endif /* CONTENT_FILTER */
2520 }
2521 error = (*so->so_proto->pr_usrreqs->pru_send)
2522 (so, sendflags, top, addr, control, p);
2523
2524 packet_consumed:
2525 if (dontroute) {
2526 so->so_options &= ~SO_DONTROUTE;
2527 }
2528
2529 clen = 0;
2530 control = NULL;
2531 top = NULL;
2532 mp = &top;
2533 if (error) {
2534 goto out_locked;
2535 }
2536 } while (resid && space > 0);
2537 } while (resid);
2538
2539 out_locked:
2540 if (sblocked) {
2541 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2542 } else {
2543 socket_unlock(so, 1);
2544 }
2545 if (top != NULL) {
2546 m_freem(top);
2547 }
2548 if (control != NULL) {
2549 m_freem(control);
2550 }
2551 if (freelist != NULL) {
2552 m_freem_list(freelist);
2553 }
2554
2555 soclearfastopen(so);
2556
2557 if (en_tracing) {
2558 /* resid passed here is the bytes left in uio */
2559 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2560 VM_KERNEL_ADDRPERM(so),
2561 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2562 (int64_t)(orig_resid - resid));
2563 }
2564 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2565 so->so_snd.sb_cc, space, error);
2566
2567 return error;
2568 }
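
/*
 * Illustrative sketch (not part of the original source): sending a
 * kernel buffer on a socket through sosend().  The uio KPI calls and
 * the "buf"/"buflen" names are assumptions of this sketch; error
 * handling is abbreviated.
 *
 *	uio_t auio;
 *
 *	auio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
 *	uio_addiov(auio, CAST_USER_ADDR_T(buf), buflen);
 *	error = sosend(so, NULL, auio, NULL, NULL, 0);
 *	uio_free(auio);
 *
 * sosend() takes and drops the socket lock itself, so the caller must
 * not hold it; any mbuf chain or control mbuf passed in is consumed
 * (freed) on return, even on error.
 */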
2569
2570 int
2571 sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2572 {
2573 struct mbuf *m0 = NULL, *control_end = NULL;
2574
2575 socket_lock_assert_owned(so);
2576
2577 /*
2578 * top must point to the mbuf chain to be sent.
2579 * If control is not NULL, top must be a packet header
2580 */
2581 VERIFY(top != NULL &&
2582 (control == NULL || top->m_flags & M_PKTHDR));
2583
2584 /*
2585 * If control is not passed in, see if we can get it
2586 * from top.
2587 */
2588 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2589 // Locate start of control if present and start of data
2590 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2591 if (m0->m_flags & M_PKTHDR) {
2592 top = m0;
2593 break;
2594 } else if (m0->m_type == MT_CONTROL) {
2595 if (control == NULL) {
2596 // Found start of control
2597 control = m0;
2598 }
2599 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2600 // Found end of control
2601 control_end = m0;
2602 }
2603 }
2604 }
2605 if (control_end != NULL) {
2606 control_end->m_next = NULL;
2607 }
2608 }
2609
2610 int error = (*so->so_proto->pr_usrreqs->pru_send)
2611 (so, sendflags, top, addr, control, current_proc());
2612
2613 return error;
2614 }
2615
2616 /*
2617 * Supported only on connected sockets (no address), without ancillary
2618 * data (control mbuf), and only for atomic protocols
2619 */
2620 int
2621 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2622 {
2623 struct mbuf *m, *freelist = NULL;
2624 user_ssize_t len, resid;
2625 int error, dontroute, mlen;
2626 int atomic = sosendallatonce(so);
2627 int sblocked = 0;
2628 struct proc *p = current_proc();
2629 u_int uiofirst = 0;
2630 u_int uiolast = 0;
2631 struct mbuf *top = NULL;
2632 uint16_t headroom = 0;
2633 boolean_t bigcl;
2634
2635 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2636 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2637
2638 if (so->so_type != SOCK_DGRAM) {
2639 error = EINVAL;
2640 goto out;
2641 }
2642 if (atomic == 0) {
2643 error = EINVAL;
2644 goto out;
2645 }
2646 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2647 error = EPROTONOSUPPORT;
2648 goto out;
2649 }
2650 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2651 error = EINVAL;
2652 goto out;
2653 }
2654 resid = uio_array_resid(uioarray, uiocnt);
2655
2656 /*
2657 * In theory resid should be unsigned.
2658 * However, space must be signed, as it might be less than 0
2659 * if we over-committed, and we must use a signed comparison
2660 * of space and resid. On the other hand, a negative resid
2661 * causes us to loop sending 0-length segments to the protocol.
2662 *
2663 * Note: We limit resid to be a positive int value as we use
2664 * imin() to set bytes_to_copy -- radr://14558484
2665 */
2666 if (resid < 0 || resid > INT_MAX) {
2667 error = EINVAL;
2668 goto out;
2669 }
2670
2671 socket_lock(so, 1);
2672 so_update_last_owner_locked(so, p);
2673 so_update_policy(so);
2674
2675 #if NECP
2676 so_update_necp_policy(so, NULL, NULL);
2677 #endif /* NECP */
2678
2679 dontroute = (flags & MSG_DONTROUTE) &&
2680 (so->so_options & SO_DONTROUTE) == 0 &&
2681 (so->so_proto->pr_flags & PR_ATOMIC);
2682 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2683
2684 error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
2685 if (error) {
2686 goto release;
2687 }
2688
2689 /*
2690 * Use big 4 KB clusters when the outgoing interface does not prefer
2691 * 2 KB clusters
2692 */
2693 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2694
2695 if (soreserveheadroom != 0) {
2696 headroom = so->so_pktheadroom;
2697 }
2698
2699 do {
2700 int i;
2701 int num_needed = 0;
2702 int chainlength;
2703 size_t maxpktlen = 0;
2704 int bytes_to_alloc;
2705
2706 if (sosendminchain > 0) {
2707 chainlength = 0;
2708 } else {
2709 chainlength = sosendmaxchain;
2710 }
2711
2712 socket_unlock(so, 0);
2713
2714 /*
2715 * Find a set of uio that fit in a reasonable number
2716 * of mbuf packets
2717 */
2718 for (i = uiofirst; i < uiocnt; i++) {
2719 struct uio *auio = uioarray[i];
2720
2721 len = uio_resid(auio);
2722
2723 /* Do nothing for empty messages */
2724 if (len == 0) {
2725 continue;
2726 }
2727
2728 num_needed += 1;
2729 uiolast += 1;
2730
2731 if (len > maxpktlen) {
2732 maxpktlen = len;
2733 }
2734
2735 chainlength += len;
2736 if (chainlength > sosendmaxchain) {
2737 break;
2738 }
2739 }
2740 /*
2741 * Nothing left to send
2742 */
2743 if (num_needed == 0) {
2744 socket_lock(so, 0);
2745 break;
2746 }
2747 /*
2748 * Allocate buffer large enough to include headroom space for
2749 * network and link header
2750 *
2751 */
2752 bytes_to_alloc = maxpktlen + headroom;
2753
2754 /*
2755 * Allocate a single contiguous buffer of the smallest available
2756 * size when possible
2757 */
2758 if (bytes_to_alloc > MCLBYTES &&
2759 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2760 freelist = m_getpackets_internal(
2761 (unsigned int *)&num_needed,
2762 num_needed, M_WAIT, 1,
2763 MBIGCLBYTES);
2764 } else if (bytes_to_alloc > _MHLEN &&
2765 bytes_to_alloc <= MCLBYTES) {
2766 freelist = m_getpackets_internal(
2767 (unsigned int *)&num_needed,
2768 num_needed, M_WAIT, 1,
2769 MCLBYTES);
2770 } else {
2771 freelist = m_allocpacket_internal(
2772 (unsigned int *)&num_needed,
2773 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2774 }
2775
2776 if (freelist == NULL) {
2777 socket_lock(so, 0);
2778 error = ENOMEM;
2779 goto release;
2780 }
2781 /*
2782 * Copy each uio of the set into its own mbuf packet
2783 */
2784 for (i = uiofirst, m = freelist;
2785 i < uiolast && m != NULL;
2786 i++) {
2787 int bytes_to_copy;
2788 struct mbuf *n;
2789 struct uio *auio = uioarray[i];
2790
2791 bytes_to_copy = uio_resid(auio);
2792
2793 /* Do nothing for empty messages */
2794 if (bytes_to_copy == 0) {
2795 continue;
2796 }
2797 /*
2798 * Leave headroom for protocol headers
2799 * in the first mbuf of the chain
2800 */
2801 m->m_data += headroom;
2802
2803 for (n = m; n != NULL; n = n->m_next) {
2804 if ((m->m_flags & M_EXT)) {
2805 mlen = m->m_ext.ext_size -
2806 M_LEADINGSPACE(m);
2807 } else if ((m->m_flags & M_PKTHDR)) {
2808 mlen =
2809 MHLEN - M_LEADINGSPACE(m);
2810 } else {
2811 mlen = MLEN - M_LEADINGSPACE(m);
2812 }
2813 len = imin(mlen, bytes_to_copy);
2814
2815 /*
2816 * Note: uiomove() decrements the iovec
2817 * length
2818 */
2819 error = uiomove(mtod(n, caddr_t),
2820 len, auio);
2821 if (error != 0) {
2822 break;
2823 }
2824 n->m_len = len;
2825 m->m_pkthdr.len += len;
2826
2827 VERIFY(m->m_pkthdr.len <= maxpktlen);
2828
2829 bytes_to_copy -= len;
2830 resid -= len;
2831 }
2832 if (m->m_pkthdr.len == 0) {
2833 printf(
2834 "%s:%d so %llx pkt %llx type %u len null\n",
2835 __func__, __LINE__,
2836 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2837 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2838 m->m_type);
2839 }
2840 if (error != 0) {
2841 break;
2842 }
2843 m = m->m_nextpkt;
2844 }
2845
2846 socket_lock(so, 0);
2847
2848 if (error) {
2849 goto release;
2850 }
2851 top = freelist;
2852 freelist = NULL;
2853
2854 if (dontroute) {
2855 so->so_options |= SO_DONTROUTE;
2856 }
2857
2858 if ((flags & MSG_SKIPCFIL) == 0) {
2859 struct mbuf **prevnextp = NULL;
2860
2861 for (i = uiofirst, m = top;
2862 i < uiolast && m != NULL;
2863 i++) {
2864 struct mbuf *nextpkt = m->m_nextpkt;
2865
2866 /*
2867 * Socket filter processing
2868 */
2869 error = sflt_data_out(so, NULL, &m,
2870 NULL, 0);
2871 if (error != 0 && error != EJUSTRETURN) {
2872 goto release;
2873 }
2874
2875 #if CONTENT_FILTER
2876 if (error == 0) {
2877 /*
2878 * Content filter processing
2879 */
2880 error = cfil_sock_data_out(so, NULL, m,
2881 NULL, 0);
2882 if (error != 0 && error != EJUSTRETURN) {
2883 goto release;
2884 }
2885 }
2886 #endif /* CONTENT_FILTER */
2887 /*
2888 * Remove packet from the list when
2889 * swallowed by a filter
2890 */
2891 if (error == EJUSTRETURN) {
2892 error = 0;
2893 if (prevnextp != NULL) {
2894 *prevnextp = nextpkt;
2895 } else {
2896 top = nextpkt;
2897 }
2898 }
2899
2900 m = nextpkt;
2901 if (m != NULL) {
2902 prevnextp = &m->m_nextpkt;
2903 }
2904 }
2905 }
2906 if (top != NULL) {
2907 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2908 (so, 0, top, NULL, NULL, p);
2909 }
2910
2911 if (dontroute) {
2912 so->so_options &= ~SO_DONTROUTE;
2913 }
2914
2915 top = NULL;
2916 uiofirst = uiolast;
2917 } while (resid > 0 && error == 0);
2918 release:
2919 if (sblocked) {
2920 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2921 } else {
2922 socket_unlock(so, 1);
2923 }
2924 out:
2925 if (top != NULL) {
2926 m_freem(top);
2927 }
2928 if (freelist != NULL) {
2929 m_freem_list(freelist);
2930 }
2931
2932 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2933 so->so_snd.sb_cc, 0, error);
2934
2935 return error;
2936 }
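
/*
 * Illustrative sketch (not part of the original source): the contract
 * sosend_list() enforces before doing any work.  It accepts only
 * SOCK_DGRAM sockets whose protocol provides pru_send_list, with no
 * destination address or control data, and only MSG_DONTWAIT/MSG_NBIO
 * in flags; anything else fails with EINVAL or EPROTONOSUPPORT as
 * coded above.  A minimal call, with one uio per datagram ("uio0" and
 * "uio1" being placeholders):
 *
 *	struct uio *uios[2] = { uio0, uio1 };
 *
 *	error = sosend_list(so, uios, 2, MSG_DONTWAIT);
 */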
2937
2938 /*
2939 * May return ERESTART when packet is dropped by MAC policy check
2940 */
2941 static int
2942 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2943 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2944 {
2945 int error = 0;
2946 struct mbuf *m = *mp;
2947 struct mbuf *nextrecord = *nextrecordp;
2948
2949 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2950 #if CONFIG_MACF_SOCKET_SUBSET
2951 /*
2952 * Call the MAC framework for policy checking if we're in
2953 * the user process context and the socket isn't connected.
2954 */
2955 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2956 struct mbuf *m0 = m;
2957 /*
2958 * Dequeue this record (temporarily) from the receive
2959 * list since we're about to drop the socket's lock
2960 * where a new record may arrive and be appended to
2961 * the list. Upon MAC policy failure, the record
2962 * will be freed. Otherwise, we'll add it back to
2963 * the head of the list. We cannot rely on SB_LOCK
2964 * because append operation uses the socket's lock.
2965 */
2966 do {
2967 m->m_nextpkt = NULL;
2968 sbfree(&so->so_rcv, m);
2969 m = m->m_next;
2970 } while (m != NULL);
2971 m = m0;
2972 so->so_rcv.sb_mb = nextrecord;
2973 SB_EMPTY_FIXUP(&so->so_rcv);
2974 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2975 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2976 socket_unlock(so, 0);
2977
2978 if (mac_socket_check_received(proc_ucred(p), so,
2979 mtod(m, struct sockaddr *)) != 0) {
2980 /*
2981 * MAC policy failure; free this record and
2982 * process the next record (or block until
2983 * one is available). We have adjusted sb_cc
2984 * and sb_mbcnt above so there is no need to
2985 * call sbfree() again.
2986 */
2987 m_freem(m);
2988 /*
2989 * Clear SB_LOCK but don't unlock the socket.
2990 * Process the next record or wait for one.
2991 */
2992 socket_lock(so, 0);
2993 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2994 error = ERESTART;
2995 goto done;
2996 }
2997 socket_lock(so, 0);
2998 /*
2999 * If the socket has been defunct'd, drop it.
3000 */
3001 if (so->so_flags & SOF_DEFUNCT) {
3002 m_freem(m);
3003 error = ENOTCONN;
3004 goto done;
3005 }
3006 /*
3007 * Re-adjust the socket receive list and re-enqueue
3008 * the record in front of any packets which may have
3009 * been appended while we dropped the lock.
3010 */
3011 for (m = m0; m->m_next != NULL; m = m->m_next) {
3012 sballoc(&so->so_rcv, m);
3013 }
3014 sballoc(&so->so_rcv, m);
3015 if (so->so_rcv.sb_mb == NULL) {
3016 so->so_rcv.sb_lastrecord = m0;
3017 so->so_rcv.sb_mbtail = m;
3018 }
3019 m = m0;
3020 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3021 so->so_rcv.sb_mb = m;
3022 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3023 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3024 }
3025 #endif /* CONFIG_MACF_SOCKET_SUBSET */
3026 if (psa != NULL) {
3027 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3028 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3029 error = EWOULDBLOCK;
3030 goto done;
3031 }
3032 }
3033 if (flags & MSG_PEEK) {
3034 m = m->m_next;
3035 } else {
3036 sbfree(&so->so_rcv, m);
3037 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3038 panic("%s: about to create invalid socketbuf",
3039 __func__);
3040 /* NOTREACHED */
3041 }
3042 MFREE(m, so->so_rcv.sb_mb);
3043 m = so->so_rcv.sb_mb;
3044 if (m != NULL) {
3045 m->m_nextpkt = nextrecord;
3046 } else {
3047 so->so_rcv.sb_mb = nextrecord;
3048 SB_EMPTY_FIXUP(&so->so_rcv);
3049 }
3050 }
3051 done:
3052 *mp = m;
3053 *nextrecordp = nextrecord;
3054
3055 return error;
3056 }
3057
3058 /*
3059 * Process one or more MT_CONTROL mbufs present before any data mbufs
3060 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3061 * just copy the data; if !MSG_PEEK, we call into the protocol to
3062 * perform externalization.
3063 */
3064 static int
3065 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3066 struct mbuf **mp, struct mbuf **nextrecordp)
3067 {
3068 int error = 0;
3069 struct mbuf *cm = NULL, *cmn;
3070 struct mbuf **cme = &cm;
3071 struct sockbuf *sb_rcv = &so->so_rcv;
3072 struct mbuf **msgpcm = NULL;
3073 struct mbuf *m = *mp;
3074 struct mbuf *nextrecord = *nextrecordp;
3075 struct protosw *pr = so->so_proto;
3076
3077 /*
3078 * Externalizing the control messages would require us to
3079 * drop the socket's lock below. Once we re-acquire the
3080 * lock, the mbuf chain might change. In order to preserve
3081 * consistency, we unlink all control messages from the
3082 * first mbuf chain in one shot and link them separately
3083 * onto a different chain.
3084 */
3085 do {
3086 if (flags & MSG_PEEK) {
3087 if (controlp != NULL) {
3088 if (*controlp == NULL) {
3089 msgpcm = controlp;
3090 }
3091 *controlp = m_copy(m, 0, m->m_len);
3092
3093 /*
3094 * If we failed to allocate an mbuf,
3095 * release any previously allocated
3096 * mbufs for control data. Return
3097 * an error. Keep the mbufs in the
3098 * socket as this is using
3099 * MSG_PEEK flag.
3100 */
3101 if (*controlp == NULL) {
3102 m_freem(*msgpcm);
3103 error = ENOBUFS;
3104 goto done;
3105 }
3106 controlp = &(*controlp)->m_next;
3107 }
3108 m = m->m_next;
3109 } else {
3110 m->m_nextpkt = NULL;
3111 sbfree(sb_rcv, m);
3112 sb_rcv->sb_mb = m->m_next;
3113 m->m_next = NULL;
3114 *cme = m;
3115 cme = &(*cme)->m_next;
3116 m = sb_rcv->sb_mb;
3117 }
3118 } while (m != NULL && m->m_type == MT_CONTROL);
3119
3120 if (!(flags & MSG_PEEK)) {
3121 if (sb_rcv->sb_mb != NULL) {
3122 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3123 } else {
3124 sb_rcv->sb_mb = nextrecord;
3125 SB_EMPTY_FIXUP(sb_rcv);
3126 }
3127 if (nextrecord == NULL) {
3128 sb_rcv->sb_lastrecord = m;
3129 }
3130 }
3131
3132 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3133 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3134
3135 while (cm != NULL) {
3136 int cmsg_type;
3137
3138 cmn = cm->m_next;
3139 cm->m_next = NULL;
3140 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3141
3142 /*
3143 * Call the protocol to externalize SCM_RIGHTS message
3144 * and return the modified message to the caller upon
3145 * success. Otherwise, all other control messages are
3146 * returned unmodified to the caller. Note that we
3147 * only get into this loop if MSG_PEEK is not set.
3148 */
3149 if (pr->pr_domain->dom_externalize != NULL &&
3150 cmsg_type == SCM_RIGHTS) {
3151 /*
3152 * Release socket lock: see 3903171. This
3153 * would also allow more records to be appended
3154 * to the socket buffer. We still have SB_LOCK
3155 * set on it, so we can be sure that the head
3156 * of the mbuf chain won't change.
3157 */
3158 socket_unlock(so, 0);
3159 error = (*pr->pr_domain->dom_externalize)(cm);
3160 socket_lock(so, 0);
3161 } else {
3162 error = 0;
3163 }
3164
3165 if (controlp != NULL && error == 0) {
3166 *controlp = cm;
3167 controlp = &(*controlp)->m_next;
3168 } else {
3169 (void) m_free(cm);
3170 }
3171 cm = cmn;
3172 }
3173 /*
3174 * Update the value of nextrecord in case we received new
3175 * records when the socket was unlocked above for
3176 * externalizing SCM_RIGHTS.
3177 */
3178 if (m != NULL) {
3179 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3180 } else {
3181 nextrecord = sb_rcv->sb_mb;
3182 }
3183
3184 done:
3185 *mp = m;
3186 *nextrecordp = nextrecord;
3187
3188 return error;
3189 }
3190
3191 /*
3192 * If we have less data than requested, block awaiting more
3193 * (subject to any timeout) if:
3194 * 1. the current count is less than the low water mark, or
3195 * 2. MSG_WAITALL is set, and it is possible to do the entire
3196 * receive operation at once if we block (resid <= hiwat); and
3197 * 3. MSG_DONTWAIT is not set.
3198 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3199 * we have to do the receive in sections, and thus risk returning
3200 * a short count if a timeout or signal occurs after we start.
3201 */
3202 static boolean_t
3203 so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3204 {
3205 struct protosw *pr = so->so_proto;
3206
3207 /* No mbufs in the receive-queue? Wait! */
3208 if (m == NULL) {
3209 return true;
3210 }
3211
3212 /* Not enough data in the receive socket-buffer - we may have to wait */
3213 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3214 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3215 /*
3216 * The application did set the low-water mark, so we should wait for
3217 * this data to be present.
3218 */
3219 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3220 return true;
3221 }
3222
3223 /*
3224 * Application wants all the data - so let's try to do the
3225 * receive-operation at once by waiting for everything to
3226 * be there.
3227 */
3228 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3229 return true;
3230 }
3231 }
3232
3233 return false;
3234 }
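
/*
 * Worked example (not part of the original source), with hypothetical
 * numbers: a stream socket has sb_cc = 2000 bytes queued in a single
 * record, sb_lowat = 1, sb_hiwat = 131072, and the caller asks for
 * uio_resid = 8192 without MSG_DONTWAIT.  Without MSG_WAITALL,
 * so_should_wait() returns false (sb_cc >= sb_lowat), so the caller
 * gets a short read of 2000 bytes.  With MSG_WAITALL, resid <= sb_hiwat,
 * so it returns true and soreceive() keeps waiting for the full 8192
 * bytes (or an error/EOF).
 */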
3235
3236 /*
3237 * Implement receive operations on a socket.
3238 * We depend on the way that records are added to the sockbuf
3239 * by sbappend*. In particular, each record (mbufs linked through m_next)
3240 * must begin with an address if the protocol so specifies,
3241 * followed by an optional mbuf or mbufs containing ancillary data,
3242 * and then zero or more mbufs of data.
3243 * In order to avoid holding the socket lock for the entire time here,
3244 * we drop it while doing the actual copy to user space.
3245 * Although the sockbuf is locked, new data may still be appended,
3246 * and thus we must maintain consistency of the sockbuf during that time.
3247 *
3248 * The caller may receive the data as a single mbuf chain by supplying
3249 * an mbuf **mp0 for use in returning the chain. The uio is then used
3250 * only for the count in uio_resid.
3251 *
3252 * Returns: 0 Success
3253 * ENOBUFS
3254 * ENOTCONN
3255 * EWOULDBLOCK
3256 * uiomove:EFAULT
3257 * sblock:EWOULDBLOCK
3258 * sblock:EINTR
3259 * sbwait:EBADF
3260 * sbwait:EINTR
3261 * sodelayed_copy:EFAULT
3262 * <pru_rcvoob>:EINVAL[TCP]
3263 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3264 * <pru_rcvoob>:???
3265 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3266 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3267 * <pr_domain->dom_externalize>:???
3268 *
3269 * Notes: Additional return values from calls through <pru_rcvoob> and
3270 * <pr_domain->dom_externalize> depend on protocols other than
3271 * TCP or AF_UNIX, which are documented above.
3272 */
3273 int
3274 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3275 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3276 {
3277 struct mbuf *m, **mp, *ml = NULL;
3278 struct mbuf *nextrecord, *free_list;
3279 int flags, error, offset;
3280 user_ssize_t len;
3281 struct protosw *pr = so->so_proto;
3282 int moff, type = 0;
3283 user_ssize_t orig_resid = uio_resid(uio);
3284 user_ssize_t delayed_copy_len;
3285 int can_delay;
3286 struct proc *p = current_proc();
3287 boolean_t en_tracing = FALSE;
3288
3289 /*
3290 * Sanity check on the length passed by caller as we are making 'int'
3291 * comparisons
3292 */
3293 if (orig_resid < 0 || orig_resid > INT_MAX) {
3294 return EINVAL;
3295 }
3296
3297 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3298 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3299 so->so_rcv.sb_hiwat);
3300
3301 socket_lock(so, 1);
3302 so_update_last_owner_locked(so, p);
3303 so_update_policy(so);
3304
3305 #ifdef MORE_LOCKING_DEBUG
3306 if (so->so_usecount == 1) {
3307 panic("%s: so=%x no other reference on socket\n", __func__, so);
3308 /* NOTREACHED */
3309 }
3310 #endif
3311 mp = mp0;
3312 if (psa != NULL) {
3313 *psa = NULL;
3314 }
3315 if (controlp != NULL) {
3316 *controlp = NULL;
3317 }
3318 if (flagsp != NULL) {
3319 flags = *flagsp & ~MSG_EOR;
3320 } else {
3321 flags = 0;
3322 }
3323
3324 /*
3325 * If a recv attempt is made on a previously-accepted socket
3326 * that has been marked as inactive (disconnected), reject
3327 * the request.
3328 */
3329 if (so->so_flags & SOF_DEFUNCT) {
3330 struct sockbuf *sb = &so->so_rcv;
3331
3332 error = ENOTCONN;
3333 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3334 __func__, proc_pid(p), proc_best_name(p),
3335 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3336 SOCK_DOM(so), SOCK_TYPE(so), error);
3337 /*
3338 * This socket should have been disconnected and flushed
3339 * prior to being returned from sodefunct(); there should
3340 * be no data on its receive list, so panic otherwise.
3341 */
3342 if (so->so_state & SS_DEFUNCT) {
3343 sb_empty_assert(sb, __func__);
3344 }
3345 socket_unlock(so, 1);
3346 return error;
3347 }
3348
3349 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3350 pr->pr_usrreqs->pru_preconnect) {
3351 /*
3352 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag without
3353 * calling write() right after this.  If the app then calls read(),
3354 * we do not want to block that read indefinitely.  Thus,
3355 * we trigger a connect so that the session gets initiated.
3356 */
3357 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3358
3359 if (error) {
3360 socket_unlock(so, 1);
3361 return error;
3362 }
3363 }
3364
3365 if (ENTR_SHOULDTRACE &&
3366 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3367 /*
3368 * enable energy tracing for inet sockets that go over
3369 * non-loopback interfaces only.
3370 */
3371 struct inpcb *inp = sotoinpcb(so);
3372 if (inp->inp_last_outifp != NULL &&
3373 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3374 en_tracing = TRUE;
3375 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3376 VM_KERNEL_ADDRPERM(so),
3377 ((so->so_state & SS_NBIO) ?
3378 kEnTrFlagNonBlocking : 0),
3379 (int64_t)orig_resid);
3380 }
3381 }
3382
3383 /*
3384 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3385 * regardless of the flags argument. Here is the case where
3386 * out-of-band data is not inline.
3387 */
3388 if ((flags & MSG_OOB) ||
3389 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3390 (so->so_options & SO_OOBINLINE) == 0 &&
3391 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3392 m = m_get(M_WAIT, MT_DATA);
3393 if (m == NULL) {
3394 socket_unlock(so, 1);
3395 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3396 ENOBUFS, 0, 0, 0, 0);
3397 return ENOBUFS;
3398 }
3399 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3400 if (error) {
3401 goto bad;
3402 }
3403 socket_unlock(so, 0);
3404 do {
3405 error = uiomove(mtod(m, caddr_t),
3406 imin(uio_resid(uio), m->m_len), uio);
3407 m = m_free(m);
3408 } while (uio_resid(uio) && error == 0 && m != NULL);
3409 socket_lock(so, 0);
3410 bad:
3411 if (m != NULL) {
3412 m_freem(m);
3413 }
3414
3415 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3416 if (error == EWOULDBLOCK || error == EINVAL) {
3417 /*
3418 * Let's try to get normal data:
3419 * EWOULDBLOCK: out-of-band data not
3420 * received yet. EINVAL: out-of-band data
3421 * already read.
3422 */
3423 error = 0;
3424 goto nooob;
3425 } else if (error == 0 && flagsp != NULL) {
3426 *flagsp |= MSG_OOB;
3427 }
3428 }
3429 socket_unlock(so, 1);
3430 if (en_tracing) {
3431 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3432 VM_KERNEL_ADDRPERM(so), 0,
3433 (int64_t)(orig_resid - uio_resid(uio)));
3434 }
3435 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3436 0, 0, 0, 0);
3437
3438 return error;
3439 }
3440 nooob:
3441 if (mp != NULL) {
3442 *mp = NULL;
3443 }
3444
3445 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3446 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3447 }
3448
3449 free_list = NULL;
3450 delayed_copy_len = 0;
3451 restart:
3452 #ifdef MORE_LOCKING_DEBUG
3453 if (so->so_usecount <= 1) {
3454 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3455 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3456 }
3457 #endif
3458 /*
3459 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3460 * and if so just return to the caller. This could happen when
3461 * soreceive() is called by a socket upcall function during the
3462 * time the socket is freed. The socket buffer would have been
3463 * locked across the upcall, therefore we cannot put this thread
3464 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3465 * we may livelock), because the lock on the socket buffer will
3466 * only be released when the upcall routine returns to its caller.
3467 * Because the socket has been officially closed, there can be
3468 * no further read on it.
3469 *
3470 * A multipath subflow socket would have its SS_NOFDREF set by
3471 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3472 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3473 */
3474 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3475 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3476 socket_unlock(so, 1);
3477 return 0;
3478 }
3479
3480 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3481 if (error) {
3482 socket_unlock(so, 1);
3483 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3484 0, 0, 0, 0);
3485 if (en_tracing) {
3486 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3487 VM_KERNEL_ADDRPERM(so), 0,
3488 (int64_t)(orig_resid - uio_resid(uio)));
3489 }
3490 return error;
3491 }
3492
3493 m = so->so_rcv.sb_mb;
3494 if (so_should_wait(so, uio, m, flags)) {
3495 /*
3496 * Panic if we notice inconsistencies in the socket's
3497 * receive list; both sb_mb and sb_cc should correctly
3498 * reflect the contents of the list, otherwise we may
3499 * end up with false positives during select() or poll()
3500 * which could put the application in a bad state.
3501 */
3502 SB_MB_CHECK(&so->so_rcv);
3503
3504 if (so->so_error) {
3505 if (m != NULL) {
3506 goto dontblock;
3507 }
3508 error = so->so_error;
3509 if ((flags & MSG_PEEK) == 0) {
3510 so->so_error = 0;
3511 }
3512 goto release;
3513 }
3514 if (so->so_state & SS_CANTRCVMORE) {
3515 #if CONTENT_FILTER
3516 /*
3517 * Deal with half closed connections
3518 */
3519 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3520 cfil_sock_data_pending(&so->so_rcv) != 0) {
3521 CFIL_LOG(LOG_INFO,
3522 "so %llx ignore SS_CANTRCVMORE",
3523 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3524 } else
3525 #endif /* CONTENT_FILTER */
3526 if (m != NULL) {
3527 goto dontblock;
3528 } else {
3529 goto release;
3530 }
3531 }
3532 for (; m != NULL; m = m->m_next) {
3533 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3534 m = so->so_rcv.sb_mb;
3535 goto dontblock;
3536 }
3537 }
3538 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3539 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3540 error = ENOTCONN;
3541 goto release;
3542 }
3543 if (uio_resid(uio) == 0) {
3544 goto release;
3545 }
3546
3547 if ((so->so_state & SS_NBIO) ||
3548 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3549 error = EWOULDBLOCK;
3550 goto release;
3551 }
3552 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3553 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3554 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3555 #if EVEN_MORE_LOCKING_DEBUG
3556 if (socket_debug) {
3557 printf("Waiting for socket data\n");
3558 }
3559 #endif
3560
3561 /*
3562 * Depending on the protocol (e.g. TCP), the following
3563 * might cause the socket lock to be dropped and later
3564 * be reacquired, and more data could have arrived and
3565 * have been appended to the receive socket buffer by
3566 * the time it returns.  Therefore, we sleep in
3567 * sbwait() below only if the wait-condition is still
3568 * true.
3569 */
3570 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3571 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3572 }
3573
3574 error = 0;
3575 if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
3576 error = sbwait(&so->so_rcv);
3577 }
3578
3579 #if EVEN_MORE_LOCKING_DEBUG
3580 if (socket_debug) {
3581 printf("SORECEIVE - sbwait returned %d\n", error);
3582 }
3583 #endif
3584 if (so->so_usecount < 1) {
3585 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3586 __func__, so, so->so_usecount);
3587 /* NOTREACHED */
3588 }
3589 if (error) {
3590 socket_unlock(so, 1);
3591 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3592 0, 0, 0, 0);
3593 if (en_tracing) {
3594 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3595 VM_KERNEL_ADDRPERM(so), 0,
3596 (int64_t)(orig_resid - uio_resid(uio)));
3597 }
3598 return error;
3599 }
3600 goto restart;
3601 }
3602 dontblock:
3603 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3604 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3605 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3606 nextrecord = m->m_nextpkt;
3607
3608 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3609 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3610 mp0 == NULL);
3611 if (error == ERESTART) {
3612 goto restart;
3613 } else if (error != 0) {
3614 goto release;
3615 }
3616 orig_resid = 0;
3617 }
3618
3619 /*
3620 * Process one or more MT_CONTROL mbufs present before any data mbufs
3621 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3622 * just copy the data; if !MSG_PEEK, we call into the protocol to
3623 * perform externalization.
3624 */
3625 if (m != NULL && m->m_type == MT_CONTROL) {
3626 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3627 if (error != 0) {
3628 goto release;
3629 }
3630 orig_resid = 0;
3631 }
3632
3633 if (m != NULL) {
3634 if (!(flags & MSG_PEEK)) {
3635 /*
3636 * We get here because m points to an mbuf following
3637 * any MT_SONAME or MT_CONTROL mbufs which have been
3638 * processed above. In any case, m should be pointing
3639 * to the head of the mbuf chain, and the nextrecord
3640 * should be either NULL or equal to m->m_nextpkt.
3641 * See comments above about SB_LOCK.
3642 */
3643 if (m != so->so_rcv.sb_mb ||
3644 m->m_nextpkt != nextrecord) {
3645 panic("%s: post-control !sync so=%p m=%p "
3646 "nextrecord=%p\n", __func__, so, m,
3647 nextrecord);
3648 /* NOTREACHED */
3649 }
3650 if (nextrecord == NULL) {
3651 so->so_rcv.sb_lastrecord = m;
3652 }
3653 }
3654 type = m->m_type;
3655 if (type == MT_OOBDATA) {
3656 flags |= MSG_OOB;
3657 }
3658 } else {
3659 if (!(flags & MSG_PEEK)) {
3660 SB_EMPTY_FIXUP(&so->so_rcv);
3661 }
3662 }
3663 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3664 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3665
3666 moff = 0;
3667 offset = 0;
3668
3669 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3670 can_delay = 1;
3671 } else {
3672 can_delay = 0;
3673 }
3674
3675 while (m != NULL &&
3676 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3677 if (m->m_type == MT_OOBDATA) {
3678 if (type != MT_OOBDATA) {
3679 break;
3680 }
3681 } else if (type == MT_OOBDATA) {
3682 break;
3683 }
3684 /*
3685 * Make sure to always set the MSG_OOB flag when getting
3686 * out of band data inline.
3687 */
3688 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3689 (so->so_options & SO_OOBINLINE) != 0 &&
3690 (so->so_state & SS_RCVATMARK) != 0) {
3691 flags |= MSG_OOB;
3692 }
3693 so->so_state &= ~SS_RCVATMARK;
3694 len = uio_resid(uio) - delayed_copy_len;
3695 if (so->so_oobmark && len > so->so_oobmark - offset) {
3696 len = so->so_oobmark - offset;
3697 }
3698 if (len > m->m_len - moff) {
3699 len = m->m_len - moff;
3700 }
3701 /*
3702 * If mp is set, just pass back the mbufs.
3703 * Otherwise copy them out via the uio, then free.
3704 * The sockbuf must be consistent here (sb_mb points to the current
3705 * mbuf, which points to the next record) when we drop the socket lock;
3706 * we must note any additions to the sockbuf when we
3707 * reacquire it.
3708 */
3709 if (mp == NULL) {
3710 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3711 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3712 if (can_delay && len == m->m_len) {
3713 /*
3714 * Only delay the copy if we're consuming the
3715 * mbuf, we're NOT in MSG_PEEK mode,
3716 * and we have enough data to make it worthwhile
3717 * to drop and retake the lock.  can_delay
3718 * reflects the two latter constraints;
3719 * moff should always be zero
3720 * in these cases.
3721 */
3722 delayed_copy_len += len;
3723 } else {
3724 if (delayed_copy_len) {
3725 error = sodelayed_copy(so, uio,
3726 &free_list, &delayed_copy_len);
3727
3728 if (error) {
3729 goto release;
3730 }
3731 /*
3732 * We can only get here if MSG_PEEK is not
3733 * set; therefore, m should point at the
3734 * head of the rcv queue.  If it doesn't,
3735 * it means something drastically
3736 * changed while we were out from behind
3737 * the lock in sodelayed_copy, perhaps
3738 * a RST on the stream.  In any event,
3739 * the stream has been interrupted.  It's
3740 * probably best just to return whatever
3741 * data we've moved and let the caller
3742 * sort it out...
3743 */
3744 if (m != so->so_rcv.sb_mb) {
3745 break;
3746 }
3747 }
3748 socket_unlock(so, 0);
3749 error = uiomove(mtod(m, caddr_t) + moff,
3750 (int)len, uio);
3751 socket_lock(so, 0);
3752
3753 if (error) {
3754 goto release;
3755 }
3756 }
3757 } else {
3758 uio_setresid(uio, (uio_resid(uio) - len));
3759 }
3760 if (len == m->m_len - moff) {
3761 if (m->m_flags & M_EOR) {
3762 flags |= MSG_EOR;
3763 }
3764 if (flags & MSG_PEEK) {
3765 m = m->m_next;
3766 moff = 0;
3767 } else {
3768 nextrecord = m->m_nextpkt;
3769 sbfree(&so->so_rcv, m);
3770 m->m_nextpkt = NULL;
3771
3772 if (mp != NULL) {
3773 *mp = m;
3774 mp = &m->m_next;
3775 so->so_rcv.sb_mb = m = m->m_next;
3776 *mp = NULL;
3777 } else {
3778 if (free_list == NULL) {
3779 free_list = m;
3780 } else {
3781 ml->m_next = m;
3782 }
3783 ml = m;
3784 so->so_rcv.sb_mb = m = m->m_next;
3785 ml->m_next = NULL;
3786 }
3787 if (m != NULL) {
3788 m->m_nextpkt = nextrecord;
3789 if (nextrecord == NULL) {
3790 so->so_rcv.sb_lastrecord = m;
3791 }
3792 } else {
3793 so->so_rcv.sb_mb = nextrecord;
3794 SB_EMPTY_FIXUP(&so->so_rcv);
3795 }
3796 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3797 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3798 }
3799 } else {
3800 if (flags & MSG_PEEK) {
3801 moff += len;
3802 } else {
3803 if (mp != NULL) {
3804 int copy_flag;
3805
3806 if (flags & MSG_DONTWAIT) {
3807 copy_flag = M_DONTWAIT;
3808 } else {
3809 copy_flag = M_WAIT;
3810 }
3811 *mp = m_copym(m, 0, len, copy_flag);
3812 /*
3813 * Failed to allocate an mbuf?
3814 * Adjust uio_resid back, it was
3815 * adjusted down by len bytes which
3816 * we didn't copy over.
3817 */
3818 if (*mp == NULL) {
3819 uio_setresid(uio,
3820 (uio_resid(uio) + len));
3821 break;
3822 }
3823 }
3824 m->m_data += len;
3825 m->m_len -= len;
3826 so->so_rcv.sb_cc -= len;
3827 }
3828 }
3829 if (so->so_oobmark) {
3830 if ((flags & MSG_PEEK) == 0) {
3831 so->so_oobmark -= len;
3832 if (so->so_oobmark == 0) {
3833 so->so_state |= SS_RCVATMARK;
3834 break;
3835 }
3836 } else {
3837 offset += len;
3838 if (offset == so->so_oobmark) {
3839 break;
3840 }
3841 }
3842 }
3843 if (flags & MSG_EOR) {
3844 break;
3845 }
3846 /*
3847 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3848 * (for non-atomic socket), we must not quit until
3849 * "uio->uio_resid == 0" or an error termination.
3850 * If a signal/timeout occurs, return with a short
3851 * count but without error. Keep sockbuf locked
3852 * against other readers.
3853 */
3854 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3855 (uio_resid(uio) - delayed_copy_len) > 0 &&
3856 !sosendallatonce(so) && !nextrecord) {
3857 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3858 #if CONTENT_FILTER
3859 && cfil_sock_data_pending(&so->so_rcv) == 0
3860 #endif /* CONTENT_FILTER */
3861 )) {
3862 goto release;
3863 }
3864
3865 /*
3866 * Depending on the protocol (e.g. TCP), the following
3867 * might cause the socket lock to be dropped and later
3868 * be reacquired, and more data could have arrived and
3869 * have been appended to the receive socket buffer by
3870 * the time it returns.  Therefore, we sleep in
3871 * sbwait() below only if the socket buffer is
3872 * empty, in order to avoid a false sleep.
3873 */
3874 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3875 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3876 }
3877
3878 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3879 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3880
3881 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3882 error = 0;
3883 goto release;
3884 }
3885 /*
3886 * We have to wait until after we get back from the sbwait
3887 * to do the copy, because we will drop the lock if we
3888 * have enough data that has been delayed.  By dropping
3889 * the lock we open up a window allowing the netisr
3890 * thread to process the incoming packets and to change
3891 * the state of this socket.  We're issuing the sbwait
3892 * because the socket is empty and we're expecting the
3893 * netisr thread to wake us up when more packets arrive;
3894 * if we allow that processing to happen and then sbwait,
3895 * we could stall forever with packets sitting in the
3896 * socket if no further packets arrive from the remote
3897 * side.
3898 *
3899 * We want to copy before we've collected all the data
3900 * to satisfy this request, to allow the copy to overlap
3901 * the incoming packet processing on an MP system.
3902 */
3903 if (delayed_copy_len > sorecvmincopy &&
3904 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3905 error = sodelayed_copy(so, uio,
3906 &free_list, &delayed_copy_len);
3907
3908 if (error) {
3909 goto release;
3910 }
3911 }
3912 m = so->so_rcv.sb_mb;
3913 if (m != NULL) {
3914 nextrecord = m->m_nextpkt;
3915 }
3916 SB_MB_CHECK(&so->so_rcv);
3917 }
3918 }
3919 #ifdef MORE_LOCKING_DEBUG
3920 if (so->so_usecount <= 1) {
3921 panic("%s: after big while so=%p ref=%d on socket\n",
3922 __func__, so, so->so_usecount);
3923 /* NOTREACHED */
3924 }
3925 #endif
3926
3927 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3928 if (so->so_options & SO_DONTTRUNC) {
3929 flags |= MSG_RCVMORE;
3930 } else {
3931 flags |= MSG_TRUNC;
3932 if ((flags & MSG_PEEK) == 0) {
3933 (void) sbdroprecord(&so->so_rcv);
3934 }
3935 }
3936 }
3937
3938 /*
3939 * pru_rcvd below (for TCP) may cause more data to be received
3940 * if the socket lock is dropped prior to sending the ACK; some
3941 * legacy OpenTransport applications don't handle this well
3942 * (if they receive less data than requested while MSG_HAVEMORE
3943 * is set), and so we set the flag now based on what we know
3944 * prior to calling pru_rcvd.
3945 */
3946 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3947 flags |= MSG_HAVEMORE;
3948 }
3949
3950 if ((flags & MSG_PEEK) == 0) {
3951 if (m == NULL) {
3952 so->so_rcv.sb_mb = nextrecord;
3953 /*
3954 * First part is an inline SB_EMPTY_FIXUP(). Second
3955 * part makes sure sb_lastrecord is up-to-date if
3956 * there is still data in the socket buffer.
3957 */
3958 if (so->so_rcv.sb_mb == NULL) {
3959 so->so_rcv.sb_mbtail = NULL;
3960 so->so_rcv.sb_lastrecord = NULL;
3961 } else if (nextrecord->m_nextpkt == NULL) {
3962 so->so_rcv.sb_lastrecord = nextrecord;
3963 }
3964 SB_MB_CHECK(&so->so_rcv);
3965 }
3966 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3967 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3968 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
3969 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3970 }
3971 }
3972
3973 if (delayed_copy_len) {
3974 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3975 if (error) {
3976 goto release;
3977 }
3978 }
3979 if (free_list != NULL) {
3980 m_freem_list(free_list);
3981 free_list = NULL;
3982 }
3983
3984 if (orig_resid == uio_resid(uio) && orig_resid &&
3985 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3986 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3987 goto restart;
3988 }
3989
3990 if (flagsp != NULL) {
3991 *flagsp |= flags;
3992 }
3993 release:
3994 #ifdef MORE_LOCKING_DEBUG
3995 if (so->so_usecount <= 1) {
3996 panic("%s: release so=%p ref=%d on socket\n", __func__,
3997 so, so->so_usecount);
3998 /* NOTREACHED */
3999 }
4000 #endif
4001 if (delayed_copy_len) {
4002 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4003 }
4004
4005 if (free_list != NULL) {
4006 m_freem_list(free_list);
4007 }
4008
4009 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4010
4011 if (en_tracing) {
4012 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4013 VM_KERNEL_ADDRPERM(so),
4014 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4015 (int64_t)(orig_resid - uio_resid(uio)));
4016 }
4017 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4018 so->so_rcv.sb_cc, 0, error);
4019
4020 return error;
4021 }
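
/*
 * Illustrative only: a minimal userspace sketch of the MSG_WAITALL
 * behavior handled by the wait loop above, assuming an already
 * connected stream socket descriptor "fd".  The helper name is
 * hypothetical, and the block is kept under "#if 0" so it is never
 * compiled into the kernel.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>

/*
 * Request exactly "len" bytes; with MSG_WAITALL the kernel keeps
 * waiting until the full amount is available, while a signal or
 * timeout can yield a short count without an error.
 */
static ssize_t
read_exactly(int fd, void *buf, size_t len)
{
	ssize_t n = recv(fd, buf, len, MSG_WAITALL);

	if (n < 0 && errno == EINTR) {
		/* interrupted before any data arrived; caller may retry */
	}
	return n;
}
#endif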
4022
4023 /*
4024 * Returns: 0 Success
4025 * uiomove:EFAULT
4026 */
4027 static int
4028 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4029 user_ssize_t *resid)
4030 {
4031 int error = 0;
4032 struct mbuf *m;
4033
4034 m = *free_list;
4035
4036 socket_unlock(so, 0);
4037
4038 while (m != NULL && error == 0) {
4039 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4040 m = m->m_next;
4041 }
4042 m_freem_list(*free_list);
4043
4044 *free_list = NULL;
4045 *resid = 0;
4046
4047 socket_lock(so, 0);
4048
4049 return error;
4050 }
4051
4052 static int
4053 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4054 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4055 {
4056 #pragma unused(so)
4057 int error = 0;
4058 struct mbuf *ml, *m;
4059 int i = 0;
4060 struct uio *auio;
4061
4062 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4063 ml = ml->m_nextpkt, i++) {
4064 auio = msgarray[i].uio;
4065 for (m = ml; m != NULL; m = m->m_next) {
4066 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
4067 if (error != 0) {
4068 goto out;
4069 }
4070 }
4071 }
4072 out:
4073 m_freem_list(*free_list);
4074
4075 *free_list = NULL;
4076 *resid = 0;
4077
4078 return error;
4079 }
4080
4081 int
4082 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4083 int *flagsp)
4084 {
4085 struct mbuf *m;
4086 struct mbuf *nextrecord;
4087 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4088 int error;
4089 user_ssize_t len, pktlen, delayed_copy_len = 0;
4090 struct protosw *pr = so->so_proto;
4091 user_ssize_t resid;
4092 struct proc *p = current_proc();
4093 struct uio *auio = NULL;
4094 int npkts = 0;
4095 int sblocked = 0;
4096 struct sockaddr **psa = NULL;
4097 struct mbuf **controlp = NULL;
4098 int can_delay;
4099 int flags;
4100 struct mbuf *free_others = NULL;
4101
4102 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4103 so, uiocnt,
4104 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4105
4106 /*
4107 * Sanity checks:
4108 * - Only non-blocking ("don't wait") flags are supported
4109 * - Only datagram sockets are supported (could be extended to raw)
4110 * - The socket must be atomic (sosendallatonce)
4111 * - The protocol must support packet chains
4112 * - The uio array must not be NULL (should we panic instead?)
4113 */
4114 if (flagsp != NULL) {
4115 flags = *flagsp;
4116 } else {
4117 flags = 0;
4118 }
4119 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4120 MSG_NBIO)) {
4121 printf("%s invalid flags 0x%x\n", __func__, flags);
4122 error = EINVAL;
4123 goto out;
4124 }
4125 if (so->so_type != SOCK_DGRAM) {
4126 error = EINVAL;
4127 goto out;
4128 }
4129 if (sosendallatonce(so) == 0) {
4130 error = EINVAL;
4131 goto out;
4132 }
4133 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4134 error = EPROTONOSUPPORT;
4135 goto out;
4136 }
4137 if (msgarray == NULL) {
4138 printf("%s msgarray is NULL\n", __func__);
4139 error = EINVAL;
4140 goto out;
4141 }
4142 if (uiocnt == 0) {
4143 printf("%s uiocnt is 0\n", __func__);
4144 error = EINVAL;
4145 goto out;
4146 }
4147 /*
4148 * Sanity check on the length passed by caller as we are making 'int'
4149 * comparisons
4150 */
4151 resid = recv_msg_array_resid(msgarray, uiocnt);
4152 if (resid < 0 || resid > INT_MAX) {
4153 error = EINVAL;
4154 goto out;
4155 }
4156
4157 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
4158 can_delay = 1;
4159 } else {
4160 can_delay = 0;
4161 }
4162
4163 socket_lock(so, 1);
4164 so_update_last_owner_locked(so, p);
4165 so_update_policy(so);
4166
4167 #if NECP
4168 so_update_necp_policy(so, NULL, NULL);
4169 #endif /* NECP */
4170
4171 /*
4172 * If a recv attempt is made on a previously-accepted socket
4173 * that has been marked as inactive (disconnected), reject
4174 * the request.
4175 */
4176 if (so->so_flags & SOF_DEFUNCT) {
4177 struct sockbuf *sb = &so->so_rcv;
4178
4179 error = ENOTCONN;
4180 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4181 __func__, proc_pid(p), proc_best_name(p),
4182 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4183 SOCK_DOM(so), SOCK_TYPE(so), error);
4184 /*
4185 * This socket should have been disconnected and flushed
4186 * prior to being returned from sodefunct(); there should
4187 * be no data on its receive list, so panic otherwise.
4188 */
4189 if (so->so_state & SS_DEFUNCT) {
4190 sb_empty_assert(sb, __func__);
4191 }
4192 goto release;
4193 }
4194
4195 next:
4196 /*
4197 * The uio may be empty
4198 */
4199 if (npkts >= uiocnt) {
4200 error = 0;
4201 goto release;
4202 }
4203 restart:
4204 /*
4205 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4206 * and if so just return to the caller. This could happen when
4207 * soreceive() is called by a socket upcall function during the
4208 * time the socket is freed. The socket buffer would have been
4209 * locked across the upcall, therefore we cannot put this thread
4210 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4211 * we may livelock), because the lock on the socket buffer will
4212 * only be released when the upcall routine returns to its caller.
4213 * Because the socket has been officially closed, there can be
4214 * no further read on it.
4215 */
4216 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4217 (SS_NOFDREF | SS_CANTRCVMORE)) {
4218 error = 0;
4219 goto release;
4220 }
4221
4222 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4223 if (error) {
4224 goto release;
4225 }
4226 sblocked = 1;
4227
4228 m = so->so_rcv.sb_mb;
4229 /*
4230 * Block awaiting more datagram if needed
4231 */
4232 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4233 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4234 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4235 /*
4236 * Panic if we notice inconsistencies in the socket's
4237 * receive list; both sb_mb and sb_cc should correctly
4238 * reflect the contents of the list, otherwise we may
4239 * end up with false positives during select() or poll()
4240 * which could put the application in a bad state.
4241 */
4242 SB_MB_CHECK(&so->so_rcv);
4243
4244 if (so->so_error) {
4245 error = so->so_error;
4246 if ((flags & MSG_PEEK) == 0) {
4247 so->so_error = 0;
4248 }
4249 goto release;
4250 }
4251 if (so->so_state & SS_CANTRCVMORE) {
4252 goto release;
4253 }
4254 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4255 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4256 error = ENOTCONN;
4257 goto release;
4258 }
4259 if ((so->so_state & SS_NBIO) ||
4260 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4261 error = EWOULDBLOCK;
4262 goto release;
4263 }
4264 /*
4265 * Do not block if we got some data
4266 */
4267 if (free_list != NULL) {
4268 error = 0;
4269 goto release;
4270 }
4271
4272 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4273 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4274
4275 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4276 sblocked = 0;
4277
4278 error = sbwait(&so->so_rcv);
4279 if (error) {
4280 goto release;
4281 }
4282 goto restart;
4283 }
4284
4285 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4286 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4287 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4288
4289 /*
4290 * Consume the current uio index as we have a datagram
4291 */
4292 auio = msgarray[npkts].uio;
4293 resid = uio_resid(auio);
4294 msgarray[npkts].which |= SOCK_MSG_DATA;
4295 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4296 &msgarray[npkts].psa : NULL;
4297 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4298 &msgarray[npkts].controlp : NULL;
4299 npkts += 1;
4300 nextrecord = m->m_nextpkt;
4301
4302 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4303 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4304 if (error == ERESTART) {
4305 goto restart;
4306 } else if (error != 0) {
4307 goto release;
4308 }
4309 }
4310
4311 if (m != NULL && m->m_type == MT_CONTROL) {
4312 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4313 if (error != 0) {
4314 goto release;
4315 }
4316 }
4317
4318 if (m->m_pkthdr.len == 0) {
4319 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4320 __func__, __LINE__,
4321 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4322 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4323 m->m_type);
4324 }
4325
4326 /*
4327 * Loop to copy the mbufs of the current record
4328 * Support zero length packets
4329 */
4330 ml = NULL;
4331 pktlen = 0;
4332 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4333 if (m->m_len == 0) {
4334 panic("%p m_len zero", m);
4335 }
4336 if (m->m_type == 0) {
4337 panic("%p m_type zero", m);
4338 }
4339 /*
4340 * Clip to the residual length
4341 */
4342 if (len > m->m_len) {
4343 len = m->m_len;
4344 }
4345 pktlen += len;
4346 /*
4347 * Copy the mbufs via the uio, or delay the copy.
4348 * The sockbuf must be consistent here (sb_mb points to
4349 * the current mbuf and to the next record) when we drop
4350 * the socket lock; we must note any additions to the
4351 * sockbuf when we reacquire it.
4352 */
4353 if (len > 0 && can_delay == 0) {
4354 socket_unlock(so, 0);
4355 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4356 socket_lock(so, 0);
4357 if (error) {
4358 goto release;
4359 }
4360 } else {
4361 delayed_copy_len += len;
4362 }
4363
4364 if (len == m->m_len) {
4365 /*
4366 * m was entirely copied
4367 */
4368 sbfree(&so->so_rcv, m);
4369 nextrecord = m->m_nextpkt;
4370 m->m_nextpkt = NULL;
4371
4372 /*
4373 * Set the first packet to the head of the free list
4374 */
4375 if (free_list == NULL) {
4376 free_list = m;
4377 }
4378 /*
4379 * Link current packet to tail of free list
4380 */
4381 if (ml == NULL) {
4382 if (free_tail != NULL) {
4383 free_tail->m_nextpkt = m;
4384 }
4385 free_tail = m;
4386 }
4387 /*
4388 * Link current mbuf to last mbuf of current packet
4389 */
4390 if (ml != NULL) {
4391 ml->m_next = m;
4392 }
4393 ml = m;
4394
4395 /*
4396 * Move next buf to head of socket buffer
4397 */
4398 so->so_rcv.sb_mb = m = ml->m_next;
4399 ml->m_next = NULL;
4400
4401 if (m != NULL) {
4402 m->m_nextpkt = nextrecord;
4403 if (nextrecord == NULL) {
4404 so->so_rcv.sb_lastrecord = m;
4405 }
4406 } else {
4407 so->so_rcv.sb_mb = nextrecord;
4408 SB_EMPTY_FIXUP(&so->so_rcv);
4409 }
4410 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4411 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4412 } else {
4413 /*
4414 * Stop the loop on partial copy
4415 */
4416 break;
4417 }
4418 }
4419 #ifdef MORE_LOCKING_DEBUG
4420 if (so->so_usecount <= 1) {
4421 panic("%s: after big while so=%llx ref=%d on socket\n",
4422 __func__,
4423 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4424 /* NOTREACHED */
4425 }
4426 #endif
4427 /*
4428 * Tell the caller we made a partial copy
4429 */
4430 if (m != NULL) {
4431 if (so->so_options & SO_DONTTRUNC) {
4432 /*
4433 * Copyout first the freelist then the partial mbuf
4434 */
4435 socket_unlock(so, 0);
4436 if (delayed_copy_len) {
4437 error = sodelayed_copy_list(so, msgarray,
4438 uiocnt, &free_list, &delayed_copy_len);
4439 }
4440
4441 if (error == 0) {
4442 error = uiomove(mtod(m, caddr_t), (int)len,
4443 auio);
4444 }
4445 socket_lock(so, 0);
4446 if (error) {
4447 goto release;
4448 }
4449
4450 m->m_data += len;
4451 m->m_len -= len;
4452 so->so_rcv.sb_cc -= len;
4453 flags |= MSG_RCVMORE;
4454 } else {
4455 (void) sbdroprecord(&so->so_rcv);
4456 nextrecord = so->so_rcv.sb_mb;
4457 m = NULL;
4458 flags |= MSG_TRUNC;
4459 }
4460 }
4461
4462 if (m == NULL) {
4463 so->so_rcv.sb_mb = nextrecord;
4464 /*
4465 * First part is an inline SB_EMPTY_FIXUP(). Second
4466 * part makes sure sb_lastrecord is up-to-date if
4467 * there is still data in the socket buffer.
4468 */
4469 if (so->so_rcv.sb_mb == NULL) {
4470 so->so_rcv.sb_mbtail = NULL;
4471 so->so_rcv.sb_lastrecord = NULL;
4472 } else if (nextrecord->m_nextpkt == NULL) {
4473 so->so_rcv.sb_lastrecord = nextrecord;
4474 }
4475 SB_MB_CHECK(&so->so_rcv);
4476 }
4477 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4478 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4479
4480 /*
4481 * We can continue to the next packet as long as:
4482 * - We haven't exhausted the uio array
4483 * - There was no error
4484 * - A packet was not truncated
4485 * - We can still receive more data
4486 */
4487 if (npkts < uiocnt && error == 0 &&
4488 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4489 (so->so_state & SS_CANTRCVMORE) == 0) {
4490 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4491 sblocked = 0;
4492
4493 goto next;
4494 }
4495 if (flagsp != NULL) {
4496 *flagsp |= flags;
4497 }
4498
4499 release:
4500 /*
4501 * pru_rcvd may cause more data to be received if the socket lock
4502 * is dropped so we set MSG_HAVEMORE now based on what we know.
4503 * That way the caller won't be surprised if it receives less data
4504 * than requested.
4505 */
4506 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4507 flags |= MSG_HAVEMORE;
4508 }
4509
4510 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
4511 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4512 }
4513
4514 if (sblocked) {
4515 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4516 } else {
4517 socket_unlock(so, 1);
4518 }
4519
4520 if (delayed_copy_len) {
4521 error = sodelayed_copy_list(so, msgarray, uiocnt,
4522 &free_list, &delayed_copy_len);
4523 }
4524 out:
4525 /*
4526 * Amortize the cost of freeing the mbufs
4527 */
4528 if (free_list != NULL) {
4529 m_freem_list(free_list);
4530 }
4531 if (free_others != NULL) {
4532 m_freem_list(free_others);
4533 }
4534
4535 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4536 0, 0, 0, 0);
4537 return error;
4538 }
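
/*
 * Illustrative only: soreceive_list() services batched datagram
 * receives.  The hedged userspace sketch below shows the conceptual
 * per-datagram equivalent using recvmsg() with MSG_DONTWAIT,
 * consistent with the SOCK_DGRAM and flag restrictions enforced
 * above.  The helper names are hypothetical.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <errno.h>

static void
drain_datagrams(int fd, void (*on_datagram)(const void *buf, ssize_t len))
{
	char buf[2048];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t n;

	/* Pull queued datagrams until the receive buffer is empty. */
	while ((n = recvmsg(fd, &msg, MSG_DONTWAIT)) >= 0) {
		on_datagram(buf, n);
	}
	/* errno == EWOULDBLOCK simply means nothing more is queued. */
}
#endif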
4539
4540 static int
4541 so_statistics_event_to_nstat_event(int64_t *input_options,
4542 uint64_t *nstat_event)
4543 {
4544 int error = 0;
4545 switch (*input_options) {
4546 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4547 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4548 break;
4549 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4550 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4551 break;
4552 #if (DEBUG || DEVELOPMENT)
4553 case SO_STATISTICS_EVENT_RESERVED_1:
4554 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4555 break;
4556 case SO_STATISTICS_EVENT_RESERVED_2:
4557 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4558 break;
4559 #endif /* (DEBUG || DEVELOPMENT) */
4560 default:
4561 error = EINVAL;
4562 break;
4563 }
4564 return error;
4565 }
4566
4567 /*
4568 * Returns: 0 Success
4569 * EINVAL
4570 * ENOTCONN
4571 * <pru_shutdown>:EINVAL
4572 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4573 * <pru_shutdown>:ENOBUFS[TCP]
4574 * <pru_shutdown>:EMSGSIZE[TCP]
4575 * <pru_shutdown>:EHOSTUNREACH[TCP]
4576 * <pru_shutdown>:ENETUNREACH[TCP]
4577 * <pru_shutdown>:ENETDOWN[TCP]
4578 * <pru_shutdown>:ENOMEM[TCP]
4579 * <pru_shutdown>:EACCES[TCP]
4580 * <pru_shutdown>:EMSGSIZE[TCP]
4581 * <pru_shutdown>:ENOBUFS[TCP]
4582 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4583 * <pru_shutdown>:??? [other protocol families]
4584 */
4585 int
4586 soshutdown(struct socket *so, int how)
4587 {
4588 int error;
4589
4590 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4591
4592 switch (how) {
4593 case SHUT_RD:
4594 case SHUT_WR:
4595 case SHUT_RDWR:
4596 socket_lock(so, 1);
4597 if ((so->so_state &
4598 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4599 error = ENOTCONN;
4600 } else {
4601 error = soshutdownlock(so, how);
4602 }
4603 socket_unlock(so, 1);
4604 break;
4605 default:
4606 error = EINVAL;
4607 break;
4608 }
4609
4610 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4611
4612 return error;
4613 }
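
/*
 * Illustrative only: a hedged userspace example of the SHUT_* cases
 * validated above.  The helper name is hypothetical.
 */
#if 0
#include <sys/socket.h>

static int
half_close_write(int fd)
{
	/*
	 * Stop sending; the peer sees EOF, but we can still read.
	 * Fails with errno == ENOTCONN if the socket is not connected,
	 * connecting, or disconnecting.
	 */
	return shutdown(fd, SHUT_WR);
}
#endif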
4614
4615 int
4616 soshutdownlock_final(struct socket *so, int how)
4617 {
4618 struct protosw *pr = so->so_proto;
4619 int error = 0;
4620
4621 sflt_notify(so, sock_evt_shutdown, &how);
4622
4623 if (how != SHUT_WR) {
4624 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4625 /* read already shut down */
4626 error = ENOTCONN;
4627 goto done;
4628 }
4629 sorflush(so);
4630 }
4631 if (how != SHUT_RD) {
4632 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4633 /* write already shut down */
4634 error = ENOTCONN;
4635 goto done;
4636 }
4637 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4638 }
4639 done:
4640 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4641 return error;
4642 }
4643
4644 int
4645 soshutdownlock(struct socket *so, int how)
4646 {
4647 int error = 0;
4648
4649 #if CONTENT_FILTER
4650 /*
4651 * A content filter may delay the actual shutdown until it
4652 * has processed the pending data
4653 */
4654 if (so->so_flags & SOF_CONTENT_FILTER) {
4655 error = cfil_sock_shutdown(so, &how);
4656 if (error == EJUSTRETURN) {
4657 error = 0;
4658 goto done;
4659 } else if (error != 0) {
4660 goto done;
4661 }
4662 }
4663 #endif /* CONTENT_FILTER */
4664
4665 error = soshutdownlock_final(so, how);
4666
4667 done:
4668 return error;
4669 }
4670
4671 void
4672 sowflush(struct socket *so)
4673 {
4674 struct sockbuf *sb = &so->so_snd;
4675
4676 /*
4677 * Obtain lock on the socket buffer (SB_LOCK). This is required
4678 * to prevent the socket buffer from being unexpectedly altered
4679 * while it is used by another thread in socket send/receive.
4680 *
4681 * sblock() must not fail here, hence the assertion.
4682 */
4683 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4684 VERIFY(sb->sb_flags & SB_LOCK);
4685
4686 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4687 sb->sb_flags |= SB_DROP;
4688 sb->sb_upcall = NULL;
4689 sb->sb_upcallarg = NULL;
4690
4691 sbunlock(sb, TRUE); /* keep socket locked */
4692
4693 selthreadclear(&sb->sb_sel);
4694 sbrelease(sb);
4695 }
4696
4697 void
4698 sorflush(struct socket *so)
4699 {
4700 struct sockbuf *sb = &so->so_rcv;
4701 struct protosw *pr = so->so_proto;
4702 struct sockbuf asb;
4703 #ifdef notyet
4704 lck_mtx_t *mutex_held;
4705 /*
4706 * XXX: This code is currently commented out, because we may get here
4707 * as part of sofreelastref(), and at that time, pr_getlock() may no
4708 * longer be able to return us the lock; this will be fixed in future.
4709 */
4710 if (so->so_proto->pr_getlock != NULL) {
4711 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4712 } else {
4713 mutex_held = so->so_proto->pr_domain->dom_mtx;
4714 }
4715
4716 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4717 #endif /* notyet */
4718
4719 sflt_notify(so, sock_evt_flush_read, NULL);
4720
4721 socantrcvmore(so);
4722
4723 /*
4724 * Obtain lock on the socket buffer (SB_LOCK). This is required
4725 * to prevent the socket buffer from being unexpectedly altered
4726 * while it is used by another thread in socket send/receive.
4727 *
4728 * sblock() must not fail here, hence the assertion.
4729 */
4730 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4731 VERIFY(sb->sb_flags & SB_LOCK);
4732
4733 /*
4734 * Copy only the relevant fields from "sb" to "asb" which we
4735 * need for sbrelease() to function. In particular, skip
4736 * sb_sel as it contains the wait queue linkage, which would
4737 * wreak havoc if we were to issue selthreadclear() on "asb".
4738 * Make sure to not carry over SB_LOCK in "asb", as we need
4739 * to acquire it later as part of sbrelease().
4740 */
4741 bzero(&asb, sizeof(asb));
4742 asb.sb_cc = sb->sb_cc;
4743 asb.sb_hiwat = sb->sb_hiwat;
4744 asb.sb_mbcnt = sb->sb_mbcnt;
4745 asb.sb_mbmax = sb->sb_mbmax;
4746 asb.sb_ctl = sb->sb_ctl;
4747 asb.sb_lowat = sb->sb_lowat;
4748 asb.sb_mb = sb->sb_mb;
4749 asb.sb_mbtail = sb->sb_mbtail;
4750 asb.sb_lastrecord = sb->sb_lastrecord;
4751 asb.sb_so = sb->sb_so;
4752 asb.sb_flags = sb->sb_flags;
4753 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4754 asb.sb_flags |= SB_DROP;
4755
4756 /*
4757 * Ideally we'd bzero() these and preserve the ones we need;
4758 * but to do that we'd need to shuffle things around in the
4759 * sockbuf, and we can't do it now because there are KEXTS
4760 * that are directly referring to the socket structure.
4761 *
4762 * Setting SB_DROP acts as a barrier to prevent further appends.
4763 * Clearing SB_SEL is done for selthreadclear() below.
4764 */
4765 sb->sb_cc = 0;
4766 sb->sb_hiwat = 0;
4767 sb->sb_mbcnt = 0;
4768 sb->sb_mbmax = 0;
4769 sb->sb_ctl = 0;
4770 sb->sb_lowat = 0;
4771 sb->sb_mb = NULL;
4772 sb->sb_mbtail = NULL;
4773 sb->sb_lastrecord = NULL;
4774 sb->sb_timeo.tv_sec = 0;
4775 sb->sb_timeo.tv_usec = 0;
4776 sb->sb_upcall = NULL;
4777 sb->sb_upcallarg = NULL;
4778 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4779 sb->sb_flags |= SB_DROP;
4780
4781 sbunlock(sb, TRUE); /* keep socket locked */
4782
4783 /*
4784 * Note that selthreadclear() is called on the original "sb" and
4785 * not the local "asb" because of the way wait queue linkage is
4786 * implemented. Given that selwakeup() may be triggered, SB_SEL
4787 * should no longer be set (cleared above.)
4788 */
4789 selthreadclear(&sb->sb_sel);
4790
4791 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4792 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4793 }
4794
4795 sbrelease(&asb);
4796 }
4797
4798 /*
4799 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4800 * an additional variant to handle the case where the option value needs
4801 * to be some kind of integer, but not a specific size.
4802 * In addition to their use here, these functions are also called by the
4803 * protocol-level pr_ctloutput() routines.
4804 *
4805 * Returns: 0 Success
4806 * EINVAL
4807 * copyin:EFAULT
4808 */
4809 int
4810 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4811 {
4812 size_t valsize;
4813
4814 /*
4815 * If the user gives us more than we wanted, we ignore it,
4816 * but if we don't get the minimum length the caller
4817 * wants, we return EINVAL. On success, sopt->sopt_valsize
4818 * is set to however much we actually retrieved.
4819 */
4820 if ((valsize = sopt->sopt_valsize) < minlen) {
4821 return EINVAL;
4822 }
4823 if (valsize > len) {
4824 sopt->sopt_valsize = valsize = len;
4825 }
4826
4827 if (sopt->sopt_p != kernproc) {
4828 return copyin(sopt->sopt_val, buf, valsize);
4829 }
4830
4831 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4832 return 0;
4833 }
4834
4835 /*
4836 * sooptcopyin_timeval
4837 * Copy in a timeval value into tv_p, and take into account whether
4838 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4839 * code here so that we can verify the 64-bit tv_sec value before we lose
4840 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4841 */
4842 static int
4843 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4844 {
4845 int error;
4846
4847 if (proc_is64bit(sopt->sopt_p)) {
4848 struct user64_timeval tv64;
4849
4850 if (sopt->sopt_valsize < sizeof(tv64)) {
4851 return EINVAL;
4852 }
4853
4854 sopt->sopt_valsize = sizeof(tv64);
4855 if (sopt->sopt_p != kernproc) {
4856 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4857 if (error != 0) {
4858 return error;
4859 }
4860 } else {
4861 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4862 sizeof(tv64));
4863 }
4864 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4865 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4866 return EDOM;
4867 }
4868
4869 tv_p->tv_sec = tv64.tv_sec;
4870 tv_p->tv_usec = tv64.tv_usec;
4871 } else {
4872 struct user32_timeval tv32;
4873
4874 if (sopt->sopt_valsize < sizeof(tv32)) {
4875 return EINVAL;
4876 }
4877
4878 sopt->sopt_valsize = sizeof(tv32);
4879 if (sopt->sopt_p != kernproc) {
4880 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4881 if (error != 0) {
4882 return error;
4883 }
4884 } else {
4885 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4886 sizeof(tv32));
4887 }
4888 #ifndef __LP64__
4889 /*
4890 * K64todo "comparison is always false due to
4891 * limited range of data type"
4892 */
4893 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4894 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4895 return EDOM;
4896 }
4897 #endif
4898 tv_p->tv_sec = tv32.tv_sec;
4899 tv_p->tv_usec = tv32.tv_usec;
4900 }
4901 return 0;
4902 }
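
/*
 * Illustrative only: a hedged userspace example of the timeval copyin
 * handled above.  The same struct timeval option value is accepted
 * from 32-bit and 64-bit callers; values outside the checked ranges
 * are rejected with EDOM.  The helper name is hypothetical.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int
set_recv_timeout(int fd, time_t seconds)
{
	struct timeval tv = { .tv_sec = seconds, .tv_usec = 0 };

	/* tv_usec must be in [0, 1000000) and tv_sec must be non-negative. */
	return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}
#endif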
4903
4904 int
4905 soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4906 boolean_t ignore_delegate)
4907 {
4908 kauth_cred_t cred = NULL;
4909 proc_t ep = PROC_NULL;
4910 uid_t uid;
4911 int error = 0;
4912
4913 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4914 ep = proc_find(so->e_pid);
4915 if (ep) {
4916 cred = kauth_cred_proc_ref(ep);
4917 }
4918 }
4919
4920 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4921
4922 /* uid is 0 for root */
4923 if (uid != 0 || !allow_root) {
4924 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4925 }
4926 if (cred) {
4927 kauth_cred_unref(&cred);
4928 }
4929 if (ep != PROC_NULL) {
4930 proc_rele(ep);
4931 }
4932
4933 return error;
4934 }
4935
4936 /*
4937 * Returns: 0 Success
4938 * EINVAL
4939 * ENOPROTOOPT
4940 * ENOBUFS
4941 * EDOM
4942 * sooptcopyin:EINVAL
4943 * sooptcopyin:EFAULT
4944 * sooptcopyin_timeval:EINVAL
4945 * sooptcopyin_timeval:EFAULT
4946 * sooptcopyin_timeval:EDOM
4947 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4948 * <pr_ctloutput>:???
4949 * sflt_attach_private:??? [whatever a filter author chooses]
4950 * <sf_setoption>:??? [whatever a filter author chooses]
4951 *
4952 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4953 * <sf_setoption> returns depend on what the filter author causes
4954 * their filter to return.
4955 */
4956 int
4957 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4958 {
4959 int error, optval;
4960 int64_t long_optval;
4961 struct linger l;
4962 struct timeval tv;
4963
4964 if (sopt->sopt_dir != SOPT_SET) {
4965 sopt->sopt_dir = SOPT_SET;
4966 }
4967
4968 if (dolock) {
4969 socket_lock(so, 1);
4970 }
4971
4972 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4973 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4974 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4975 /* the socket has been shutdown, no more sockopt's */
4976 error = EINVAL;
4977 goto out;
4978 }
4979
4980 error = sflt_setsockopt(so, sopt);
4981 if (error != 0) {
4982 if (error == EJUSTRETURN) {
4983 error = 0;
4984 }
4985 goto out;
4986 }
4987
4988 if (sopt->sopt_level != SOL_SOCKET) {
4989 if (so->so_proto != NULL &&
4990 so->so_proto->pr_ctloutput != NULL) {
4991 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4992 goto out;
4993 }
4994 error = ENOPROTOOPT;
4995 } else {
4996 /*
4997 * Allow socket-level (SOL_SOCKET) options to be filtered by
4998 * the protocol layer, if needed. A zero value returned from
4999 * the handler means use default socket-level processing as
5000 * done by the rest of this routine. Otherwise, any other
5001 * return value indicates that the option is unsupported.
5002 */
5003 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5004 pru_socheckopt(so, sopt)) != 0) {
5005 goto out;
5006 }
5007
5008 error = 0;
5009 switch (sopt->sopt_name) {
5010 case SO_LINGER:
5011 case SO_LINGER_SEC:
5012 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5013 if (error != 0) {
5014 goto out;
5015 }
5016
5017 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5018 l.l_linger : l.l_linger * hz;
5019 if (l.l_onoff != 0) {
5020 so->so_options |= SO_LINGER;
5021 } else {
5022 so->so_options &= ~SO_LINGER;
5023 }
5024 break;
5025
5026 case SO_DEBUG:
5027 case SO_KEEPALIVE:
5028 case SO_DONTROUTE:
5029 case SO_USELOOPBACK:
5030 case SO_BROADCAST:
5031 case SO_REUSEADDR:
5032 case SO_REUSEPORT:
5033 case SO_OOBINLINE:
5034 case SO_TIMESTAMP:
5035 case SO_TIMESTAMP_MONOTONIC:
5036 case SO_TIMESTAMP_CONTINUOUS:
5037 case SO_DONTTRUNC:
5038 case SO_WANTMORE:
5039 case SO_WANTOOBFLAG:
5040 case SO_NOWAKEFROMSLEEP:
5041 case SO_NOAPNFALLBK:
5042 error = sooptcopyin(sopt, &optval, sizeof(optval),
5043 sizeof(optval));
5044 if (error != 0) {
5045 goto out;
5046 }
5047 if (optval) {
5048 so->so_options |= sopt->sopt_name;
5049 } else {
5050 so->so_options &= ~sopt->sopt_name;
5051 }
5052 break;
5053
5054 case SO_SNDBUF:
5055 case SO_RCVBUF:
5056 case SO_SNDLOWAT:
5057 case SO_RCVLOWAT:
5058 error = sooptcopyin(sopt, &optval, sizeof(optval),
5059 sizeof(optval));
5060 if (error != 0) {
5061 goto out;
5062 }
5063
5064 /*
5065 * Values < 1 make no sense for any of these
5066 * options, so disallow them.
5067 */
5068 if (optval < 1) {
5069 error = EINVAL;
5070 goto out;
5071 }
5072
5073 switch (sopt->sopt_name) {
5074 case SO_SNDBUF:
5075 case SO_RCVBUF: {
5076 struct sockbuf *sb =
5077 (sopt->sopt_name == SO_SNDBUF) ?
5078 &so->so_snd : &so->so_rcv;
5079 if (sbreserve(sb, (u_int32_t)optval) == 0) {
5080 error = ENOBUFS;
5081 goto out;
5082 }
5083 sb->sb_flags |= SB_USRSIZE;
5084 sb->sb_flags &= ~SB_AUTOSIZE;
5085 sb->sb_idealsize = (u_int32_t)optval;
5086 break;
5087 }
5088 /*
5089 * Make sure the low-water is never greater than
5090 * the high-water.
5091 */
5092 case SO_SNDLOWAT: {
5093 int space = sbspace(&so->so_snd);
5094 u_int32_t hiwat = so->so_snd.sb_hiwat;
5095
5096 if (so->so_snd.sb_flags & SB_UNIX) {
5097 struct unpcb *unp =
5098 (struct unpcb *)(so->so_pcb);
5099 if (unp != NULL &&
5100 unp->unp_conn != NULL) {
5101 hiwat += unp->unp_conn->unp_cc;
5102 }
5103 }
5104
5105 so->so_snd.sb_lowat =
5106 (optval > hiwat) ?
5107 hiwat : optval;
5108
5109 if (space >= so->so_snd.sb_lowat) {
5110 sowwakeup(so);
5111 }
5112 break;
5113 }
5114 case SO_RCVLOWAT: {
5115 int64_t data_len;
5116 so->so_rcv.sb_lowat =
5117 (optval > so->so_rcv.sb_hiwat) ?
5118 so->so_rcv.sb_hiwat : optval;
5119 data_len = so->so_rcv.sb_cc
5120 - so->so_rcv.sb_ctl;
5121 if (data_len >= so->so_rcv.sb_lowat) {
5122 sorwakeup(so);
5123 }
5124 break;
5125 }
5126 }
5127 break;
5128
5129 case SO_SNDTIMEO:
5130 case SO_RCVTIMEO:
5131 error = sooptcopyin_timeval(sopt, &tv);
5132 if (error != 0) {
5133 goto out;
5134 }
5135
5136 switch (sopt->sopt_name) {
5137 case SO_SNDTIMEO:
5138 so->so_snd.sb_timeo = tv;
5139 break;
5140 case SO_RCVTIMEO:
5141 so->so_rcv.sb_timeo = tv;
5142 break;
5143 }
5144 break;
5145
5146 case SO_NKE: {
5147 struct so_nke nke;
5148
5149 error = sooptcopyin(sopt, &nke, sizeof(nke),
5150 sizeof(nke));
5151 if (error != 0) {
5152 goto out;
5153 }
5154
5155 error = sflt_attach_internal(so, nke.nke_handle);
5156 break;
5157 }
5158
5159 case SO_NOSIGPIPE:
5160 error = sooptcopyin(sopt, &optval, sizeof(optval),
5161 sizeof(optval));
5162 if (error != 0) {
5163 goto out;
5164 }
5165 if (optval != 0) {
5166 so->so_flags |= SOF_NOSIGPIPE;
5167 } else {
5168 so->so_flags &= ~SOF_NOSIGPIPE;
5169 }
5170 break;
5171
5172 case SO_NOADDRERR:
5173 error = sooptcopyin(sopt, &optval, sizeof(optval),
5174 sizeof(optval));
5175 if (error != 0) {
5176 goto out;
5177 }
5178 if (optval != 0) {
5179 so->so_flags |= SOF_NOADDRAVAIL;
5180 } else {
5181 so->so_flags &= ~SOF_NOADDRAVAIL;
5182 }
5183 break;
5184
5185 case SO_REUSESHAREUID:
5186 error = sooptcopyin(sopt, &optval, sizeof(optval),
5187 sizeof(optval));
5188 if (error != 0) {
5189 goto out;
5190 }
5191 if (optval != 0) {
5192 so->so_flags |= SOF_REUSESHAREUID;
5193 } else {
5194 so->so_flags &= ~SOF_REUSESHAREUID;
5195 }
5196 break;
5197
5198 case SO_NOTIFYCONFLICT:
5199 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5200 error = EPERM;
5201 goto out;
5202 }
5203 error = sooptcopyin(sopt, &optval, sizeof(optval),
5204 sizeof(optval));
5205 if (error != 0) {
5206 goto out;
5207 }
5208 if (optval != 0) {
5209 so->so_flags |= SOF_NOTIFYCONFLICT;
5210 } else {
5211 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5212 }
5213 break;
5214
5215 case SO_RESTRICTIONS:
5216 error = sooptcopyin(sopt, &optval, sizeof(optval),
5217 sizeof(optval));
5218 if (error != 0) {
5219 goto out;
5220 }
5221
5222 error = so_set_restrictions(so, optval);
5223 break;
5224
5225 case SO_AWDL_UNRESTRICTED:
5226 if (SOCK_DOM(so) != PF_INET &&
5227 SOCK_DOM(so) != PF_INET6) {
5228 error = EOPNOTSUPP;
5229 goto out;
5230 }
5231 error = sooptcopyin(sopt, &optval, sizeof(optval),
5232 sizeof(optval));
5233 if (error != 0) {
5234 goto out;
5235 }
5236 if (optval != 0) {
5237 error = soopt_cred_check(so,
5238 PRIV_NET_RESTRICTED_AWDL, false, false);
5239 if (error == 0) {
5240 inp_set_awdl_unrestricted(
5241 sotoinpcb(so));
5242 }
5243 } else {
5244 inp_clear_awdl_unrestricted(sotoinpcb(so));
5245 }
5246 break;
5247 case SO_INTCOPROC_ALLOW:
5248 if (SOCK_DOM(so) != PF_INET6) {
5249 error = EOPNOTSUPP;
5250 goto out;
5251 }
5252 error = sooptcopyin(sopt, &optval, sizeof(optval),
5253 sizeof(optval));
5254 if (error != 0) {
5255 goto out;
5256 }
5257 if (optval != 0 &&
5258 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5259 error = soopt_cred_check(so,
5260 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5261 if (error == 0) {
5262 inp_set_intcoproc_allowed(
5263 sotoinpcb(so));
5264 }
5265 } else if (optval == 0) {
5266 inp_clear_intcoproc_allowed(sotoinpcb(so));
5267 }
5268 break;
5269
5270 case SO_LABEL:
5271 error = EOPNOTSUPP;
5272 break;
5273
5274 case SO_UPCALLCLOSEWAIT:
5275 error = sooptcopyin(sopt, &optval, sizeof(optval),
5276 sizeof(optval));
5277 if (error != 0) {
5278 goto out;
5279 }
5280 if (optval != 0) {
5281 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5282 } else {
5283 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5284 }
5285 break;
5286
5287 case SO_RANDOMPORT:
5288 error = sooptcopyin(sopt, &optval, sizeof(optval),
5289 sizeof(optval));
5290 if (error != 0) {
5291 goto out;
5292 }
5293 if (optval != 0) {
5294 so->so_flags |= SOF_BINDRANDOMPORT;
5295 } else {
5296 so->so_flags &= ~SOF_BINDRANDOMPORT;
5297 }
5298 break;
5299
5300 case SO_NP_EXTENSIONS: {
5301 struct so_np_extensions sonpx;
5302
5303 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5304 sizeof(sonpx));
5305 if (error != 0) {
5306 goto out;
5307 }
5308 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5309 error = EINVAL;
5310 goto out;
5311 }
5312 /*
5313 * Only one bit defined for now
5314 */
5315 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5316 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5317 so->so_flags |= SOF_NPX_SETOPTSHUT;
5318 } else {
5319 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5320 }
5321 }
5322 break;
5323 }
5324
5325 case SO_TRAFFIC_CLASS: {
5326 error = sooptcopyin(sopt, &optval, sizeof(optval),
5327 sizeof(optval));
5328 if (error != 0) {
5329 goto out;
5330 }
5331 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5332 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5333 error = so_set_net_service_type(so, netsvc);
5334 goto out;
5335 }
5336 error = so_set_traffic_class(so, optval);
5337 if (error != 0) {
5338 goto out;
5339 }
5340 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5341 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5342 break;
5343 }
5344
5345 case SO_RECV_TRAFFIC_CLASS: {
5346 error = sooptcopyin(sopt, &optval, sizeof(optval),
5347 sizeof(optval));
5348 if (error != 0) {
5349 goto out;
5350 }
5351 if (optval == 0) {
5352 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5353 } else {
5354 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5355 }
5356 break;
5357 }
5358
5359 #if (DEVELOPMENT || DEBUG)
5360 case SO_TRAFFIC_CLASS_DBG: {
5361 struct so_tcdbg so_tcdbg;
5362
5363 error = sooptcopyin(sopt, &so_tcdbg,
5364 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5365 if (error != 0) {
5366 goto out;
5367 }
5368 error = so_set_tcdbg(so, &so_tcdbg);
5369 if (error != 0) {
5370 goto out;
5371 }
5372 break;
5373 }
5374 #endif /* (DEVELOPMENT || DEBUG) */
5375
5376 case SO_PRIVILEGED_TRAFFIC_CLASS:
5377 error = priv_check_cred(kauth_cred_get(),
5378 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
5379 if (error != 0) {
5380 goto out;
5381 }
5382 error = sooptcopyin(sopt, &optval, sizeof(optval),
5383 sizeof(optval));
5384 if (error != 0) {
5385 goto out;
5386 }
5387 if (optval == 0) {
5388 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5389 } else {
5390 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5391 }
5392 break;
5393
5394 #if (DEVELOPMENT || DEBUG)
5395 case SO_DEFUNCTIT:
5396 error = sosetdefunct(current_proc(), so, 0, FALSE);
5397 if (error == 0) {
5398 error = sodefunct(current_proc(), so, 0);
5399 }
5400
5401 break;
5402 #endif /* (DEVELOPMENT || DEBUG) */
5403
5404 case SO_DEFUNCTOK:
5405 error = sooptcopyin(sopt, &optval, sizeof(optval),
5406 sizeof(optval));
5407 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5408 if (error == 0) {
5409 error = EBADF;
5410 }
5411 goto out;
5412 }
5413 /*
5414 * Any process can set SO_DEFUNCTOK (clear
5415 * SOF_NODEFUNCT), but only root can clear
5416 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5417 */
5418 if (optval == 0 &&
5419 kauth_cred_issuser(kauth_cred_get()) == 0) {
5420 error = EPERM;
5421 goto out;
5422 }
5423 if (optval) {
5424 so->so_flags &= ~SOF_NODEFUNCT;
5425 } else {
5426 so->so_flags |= SOF_NODEFUNCT;
5427 }
5428
5429 if (SOCK_DOM(so) == PF_INET ||
5430 SOCK_DOM(so) == PF_INET6) {
5431 char s[MAX_IPv6_STR_LEN];
5432 char d[MAX_IPv6_STR_LEN];
5433 struct inpcb *inp = sotoinpcb(so);
5434
5435 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5436 "[%s %s:%d -> %s:%d] is now marked "
5437 "as %seligible for "
5438 "defunct\n", __func__, proc_selfpid(),
5439 proc_best_name(current_proc()),
5440 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5441 (SOCK_TYPE(so) == SOCK_STREAM) ?
5442 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5443 ((SOCK_DOM(so) == PF_INET) ?
5444 (void *)&inp->inp_laddr.s_addr :
5445 (void *)&inp->in6p_laddr), s, sizeof(s)),
5446 ntohs(inp->in6p_lport),
5447 inet_ntop(SOCK_DOM(so),
5448 (SOCK_DOM(so) == PF_INET) ?
5449 (void *)&inp->inp_faddr.s_addr :
5450 (void *)&inp->in6p_faddr, d, sizeof(d)),
5451 ntohs(inp->in6p_fport),
5452 (so->so_flags & SOF_NODEFUNCT) ?
5453 "not " : "");
5454 } else {
5455 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5456 "is now marked as %seligible for "
5457 "defunct\n",
5458 __func__, proc_selfpid(),
5459 proc_best_name(current_proc()),
5460 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5461 SOCK_DOM(so), SOCK_TYPE(so),
5462 (so->so_flags & SOF_NODEFUNCT) ?
5463 "not " : "");
5464 }
5465 break;
5466
5467 case SO_ISDEFUNCT:
5468 /* This option is not settable */
5469 error = EINVAL;
5470 break;
5471
5472 case SO_OPPORTUNISTIC:
5473 error = sooptcopyin(sopt, &optval, sizeof(optval),
5474 sizeof(optval));
5475 if (error == 0) {
5476 error = so_set_opportunistic(so, optval);
5477 }
5478 break;
5479
5480 case SO_FLUSH:
5481 /* This option is handled by lower layer(s) */
5482 error = 0;
5483 break;
5484
5485 case SO_RECV_ANYIF:
5486 error = sooptcopyin(sopt, &optval, sizeof(optval),
5487 sizeof(optval));
5488 if (error == 0) {
5489 error = so_set_recv_anyif(so, optval);
5490 }
5491 break;
5492
5493 case SO_TRAFFIC_MGT_BACKGROUND: {
5494 /* This option is handled by lower layer(s) */
5495 error = 0;
5496 break;
5497 }
5498
5499 #if FLOW_DIVERT
5500 case SO_FLOW_DIVERT_TOKEN:
5501 error = flow_divert_token_set(so, sopt);
5502 break;
5503 #endif /* FLOW_DIVERT */
5504
5505
5506 case SO_DELEGATED:
5507 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5508 sizeof(optval))) != 0) {
5509 break;
5510 }
5511
5512 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
5513 break;
5514
5515 case SO_DELEGATED_UUID: {
5516 uuid_t euuid;
5517
5518 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5519 sizeof(euuid))) != 0) {
5520 break;
5521 }
5522
5523 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
5524 break;
5525 }
5526
5527 #if NECP
5528 case SO_NECP_ATTRIBUTES:
5529 error = necp_set_socket_attributes(so, sopt);
5530 break;
5531
5532 case SO_NECP_CLIENTUUID: {
5533 if (SOCK_DOM(so) == PF_MULTIPATH) {
5534 /* Handled by MPTCP itself */
5535 break;
5536 }
5537
5538 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5539 error = EINVAL;
5540 goto out;
5541 }
5542
5543 struct inpcb *inp = sotoinpcb(so);
5544 if (!uuid_is_null(inp->necp_client_uuid)) {
5545 // Clear out the old client UUID if present
5546 necp_inpcb_remove_cb(inp);
5547 }
5548
5549 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5550 sizeof(uuid_t), sizeof(uuid_t));
5551 if (error != 0) {
5552 goto out;
5553 }
5554
5555 if (uuid_is_null(inp->necp_client_uuid)) {
5556 error = EINVAL;
5557 goto out;
5558 }
5559
5560 pid_t current_pid = proc_pid(current_proc());
5561 error = necp_client_register_socket_flow(current_pid,
5562 inp->necp_client_uuid, inp);
5563 if (error != 0) {
5564 uuid_clear(inp->necp_client_uuid);
5565 goto out;
5566 }
5567
5568 if (inp->inp_lport != 0) {
5569 // There is a bound local port, so this is not
5570 // a fresh socket. Assign to the client.
5571 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5572 }
5573
5574 break;
5575 }
5576 case SO_NECP_LISTENUUID: {
5577 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5578 error = EINVAL;
5579 goto out;
5580 }
5581
5582 struct inpcb *inp = sotoinpcb(so);
5583 if (!uuid_is_null(inp->necp_client_uuid)) {
5584 error = EINVAL;
5585 goto out;
5586 }
5587
5588 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5589 sizeof(uuid_t), sizeof(uuid_t));
5590 if (error != 0) {
5591 goto out;
5592 }
5593
5594 if (uuid_is_null(inp->necp_client_uuid)) {
5595 error = EINVAL;
5596 goto out;
5597 }
5598
5599 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5600 inp->necp_client_uuid, inp);
5601 if (error != 0) {
5602 uuid_clear(inp->necp_client_uuid);
5603 goto out;
5604 }
5605
5606 // Mark that the port registration is held by NECP
5607 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5608
5609 break;
5610 }
5611 #endif /* NECP */
5612
5613 case SO_EXTENDED_BK_IDLE:
5614 error = sooptcopyin(sopt, &optval, sizeof(optval),
5615 sizeof(optval));
5616 if (error == 0) {
5617 error = so_set_extended_bk_idle(so, optval);
5618 }
5619 break;
5620
5621 case SO_MARK_CELLFALLBACK:
5622 error = sooptcopyin(sopt, &optval, sizeof(optval),
5623 sizeof(optval));
5624 if (error != 0) {
5625 goto out;
5626 }
5627 if (optval < 0) {
5628 error = EINVAL;
5629 goto out;
5630 }
5631 if (optval == 0) {
5632 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5633 } else {
5634 so->so_flags1 |= SOF1_CELLFALLBACK;
5635 }
5636 break;
5637
5638 case SO_STATISTICS_EVENT:
5639 error = sooptcopyin(sopt, &long_optval,
5640 sizeof(long_optval), sizeof(long_optval));
5641 if (error != 0) {
5642 goto out;
5643 }
5644 u_int64_t nstat_event = 0;
5645 error = so_statistics_event_to_nstat_event(
5646 &long_optval, &nstat_event);
5647 if (error != 0) {
5648 goto out;
5649 }
5650 nstat_pcb_event(sotoinpcb(so), nstat_event);
5651 break;
5652
5653 case SO_NET_SERVICE_TYPE: {
5654 error = sooptcopyin(sopt, &optval, sizeof(optval),
5655 sizeof(optval));
5656 if (error != 0) {
5657 goto out;
5658 }
5659 error = so_set_net_service_type(so, optval);
5660 break;
5661 }
5662
5663 case SO_QOSMARKING_POLICY_OVERRIDE:
5664 error = priv_check_cred(kauth_cred_get(),
5665 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5666 if (error != 0) {
5667 goto out;
5668 }
5669 error = sooptcopyin(sopt, &optval, sizeof(optval),
5670 sizeof(optval));
5671 if (error != 0) {
5672 goto out;
5673 }
5674 if (optval == 0) {
5675 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5676 } else {
5677 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5678 }
5679 break;
5680
5681 case SO_MPKL_SEND_INFO: {
5682 struct so_mpkl_send_info so_mpkl_send_info;
5683
5684 error = sooptcopyin(sopt, &so_mpkl_send_info,
5685 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5686 if (error != 0) {
5687 goto out;
5688 }
5689 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5690 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5691
5692 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5693 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5694 } else {
5695 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5696 }
5697 break;
5698 }
5699 case SO_WANT_KEV_SOCKET_CLOSED: {
5700 error = sooptcopyin(sopt, &optval, sizeof(optval),
5701 sizeof(optval));
5702 if (error != 0) {
5703 goto out;
5704 }
5705 if (optval == 0) {
5706 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5707 } else {
5708 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5709 }
5710 break;
5711 }
5712 default:
5713 error = ENOPROTOOPT;
5714 break;
5715 }
5716 if (error == 0 && so->so_proto != NULL &&
5717 so->so_proto->pr_ctloutput != NULL) {
5718 (void) so->so_proto->pr_ctloutput(so, sopt);
5719 }
5720 }
5721 out:
5722 if (dolock) {
5723 socket_unlock(so, 1);
5724 }
5725 return error;
5726 }
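
/*
 * Illustrative only: a hedged userspace example for the SO_LINGER /
 * SO_LINGER_SEC case at the top of the switch above.  SO_LINGER_SEC
 * takes the linger interval in seconds (scaled by hz internally),
 * while SO_LINGER stores the caller's value as given.  The helper
 * name is hypothetical.
 */
#if 0
#include <sys/socket.h>

static int
enable_linger_seconds(int fd, int seconds)
{
	struct linger l = { .l_onoff = 1, .l_linger = seconds };

	return setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof(l));
}
#endif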
5727
5728 /* Helper routines for getsockopt */
5729 int
5730 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5731 {
5732 int error;
5733 size_t valsize;
5734
5735 error = 0;
5736
5737 /*
5738 * Documented get behavior is that we always return a value,
5739 * possibly truncated to fit in the user's buffer.
5740 * Traditional behavior is that we always tell the user
5741 * precisely how much we copied, rather than something useful
5742 * like the total amount we had available for her.
5743 * Note that this interface is not idempotent; the entire answer must
5744 * be generated ahead of time.
5745 */
5746 valsize = min(len, sopt->sopt_valsize);
5747 sopt->sopt_valsize = valsize;
5748 if (sopt->sopt_val != USER_ADDR_NULL) {
5749 if (sopt->sopt_p != kernproc) {
5750 error = copyout(buf, sopt->sopt_val, valsize);
5751 } else {
5752 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5753 }
5754 }
5755 return error;
5756 }
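
/*
 * Illustrative only: a hedged userspace demonstration of the
 * truncation behavior described in the comment above sooptcopyout().
 * When the caller's buffer is smaller than the option value, the
 * value is truncated and the returned length reports how much was
 * actually copied.  The helper name is hypothetical.
 */
#if 0
#include <sys/socket.h>

static int
get_type_truncated(int fd)
{
	char small = 0;                 /* deliberately smaller than an int */
	socklen_t len = sizeof(small);

	/* Succeeds; only the first byte of SO_TYPE is copied out. */
	(void) getsockopt(fd, SOL_SOCKET, SO_TYPE, &small, &len);
	return (int)small;
}
#endif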
5757
5758 static int
5759 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5760 {
5761 int error;
5762 size_t len;
5763 struct user64_timeval tv64 = {};
5764 struct user32_timeval tv32 = {};
5765 const void * val;
5766 size_t valsize;
5767
5768 error = 0;
5769 if (proc_is64bit(sopt->sopt_p)) {
5770 len = sizeof(tv64);
5771 tv64.tv_sec = tv_p->tv_sec;
5772 tv64.tv_usec = tv_p->tv_usec;
5773 val = &tv64;
5774 } else {
5775 len = sizeof(tv32);
5776 tv32.tv_sec = tv_p->tv_sec;
5777 tv32.tv_usec = tv_p->tv_usec;
5778 val = &tv32;
5779 }
5780 valsize = min(len, sopt->sopt_valsize);
5781 sopt->sopt_valsize = valsize;
5782 if (sopt->sopt_val != USER_ADDR_NULL) {
5783 if (sopt->sopt_p != kernproc) {
5784 error = copyout(val, sopt->sopt_val, valsize);
5785 } else {
5786 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5787 }
5788 }
5789 return error;
5790 }
5791
5792 /*
5793 * Return: 0 Success
5794 * ENOPROTOOPT
5795 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5796 * <pr_ctloutput>:???
5797 * <sf_getoption>:???
5798 */
5799 int
5800 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5801 {
5802 int error, optval;
5803 struct linger l;
5804 struct timeval tv;
5805
5806 if (sopt->sopt_dir != SOPT_GET) {
5807 sopt->sopt_dir = SOPT_GET;
5808 }
5809
5810 if (dolock) {
5811 socket_lock(so, 1);
5812 }
5813
5814 error = sflt_getsockopt(so, sopt);
5815 if (error != 0) {
5816 if (error == EJUSTRETURN) {
5817 error = 0;
5818 }
5819 goto out;
5820 }
5821
5822 if (sopt->sopt_level != SOL_SOCKET) {
5823 if (so->so_proto != NULL &&
5824 so->so_proto->pr_ctloutput != NULL) {
5825 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5826 goto out;
5827 }
5828 error = ENOPROTOOPT;
5829 } else {
5830 /*
5831 * Allow socket-level (SOL_SOCKET) options to be filtered by
5832 * the protocol layer, if needed. A zero value returned from
5833 * the handler means use default socket-level processing as
5834 * done by the rest of this routine. Otherwise, any other
5835 * return value indicates that the option is unsupported.
5836 */
5837 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5838 pru_socheckopt(so, sopt)) != 0) {
5839 goto out;
5840 }
5841
5842 error = 0;
5843 switch (sopt->sopt_name) {
5844 case SO_LINGER:
5845 case SO_LINGER_SEC:
5846 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5847 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5848 so->so_linger : so->so_linger / hz;
5849 error = sooptcopyout(sopt, &l, sizeof(l));
5850 break;
5851
5852 case SO_USELOOPBACK:
5853 case SO_DONTROUTE:
5854 case SO_DEBUG:
5855 case SO_KEEPALIVE:
5856 case SO_REUSEADDR:
5857 case SO_REUSEPORT:
5858 case SO_BROADCAST:
5859 case SO_OOBINLINE:
5860 case SO_TIMESTAMP:
5861 case SO_TIMESTAMP_MONOTONIC:
5862 case SO_TIMESTAMP_CONTINUOUS:
5863 case SO_DONTTRUNC:
5864 case SO_WANTMORE:
5865 case SO_WANTOOBFLAG:
5866 case SO_NOWAKEFROMSLEEP:
5867 case SO_NOAPNFALLBK:
5868 optval = so->so_options & sopt->sopt_name;
5869 integer:
5870 error = sooptcopyout(sopt, &optval, sizeof(optval));
5871 break;
5872
5873 case SO_TYPE:
5874 optval = so->so_type;
5875 goto integer;
5876
5877 case SO_NREAD:
5878 if (so->so_proto->pr_flags & PR_ATOMIC) {
5879 int pkt_total;
5880 struct mbuf *m1;
5881
5882 pkt_total = 0;
5883 m1 = so->so_rcv.sb_mb;
5884 while (m1 != NULL) {
5885 if (m1->m_type == MT_DATA ||
5886 m1->m_type == MT_HEADER ||
5887 m1->m_type == MT_OOBDATA) {
5888 pkt_total += m1->m_len;
5889 }
5890 m1 = m1->m_next;
5891 }
5892 optval = pkt_total;
5893 } else {
5894 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5895 }
5896 goto integer;
5897
5898 case SO_NUMRCVPKT:
5899 if (so->so_proto->pr_flags & PR_ATOMIC) {
5900 int cnt = 0;
5901 struct mbuf *m1;
5902
5903 m1 = so->so_rcv.sb_mb;
5904 while (m1 != NULL) {
5905 cnt += 1;
5906 m1 = m1->m_nextpkt;
5907 }
5908 optval = cnt;
5909 goto integer;
5910 } else {
5911 error = ENOPROTOOPT;
5912 break;
5913 }
5914
5915 case SO_NWRITE:
5916 optval = so->so_snd.sb_cc;
5917 goto integer;
5918
5919 case SO_ERROR:
5920 optval = so->so_error;
5921 so->so_error = 0;
5922 goto integer;
5923
5924 case SO_SNDBUF: {
5925 u_int32_t hiwat = so->so_snd.sb_hiwat;
5926
5927 if (so->so_snd.sb_flags & SB_UNIX) {
5928 struct unpcb *unp =
5929 (struct unpcb *)(so->so_pcb);
5930 if (unp != NULL && unp->unp_conn != NULL) {
5931 hiwat += unp->unp_conn->unp_cc;
5932 }
5933 }
5934
5935 optval = hiwat;
5936 goto integer;
5937 }
5938 case SO_RCVBUF:
5939 optval = so->so_rcv.sb_hiwat;
5940 goto integer;
5941
5942 case SO_SNDLOWAT:
5943 optval = so->so_snd.sb_lowat;
5944 goto integer;
5945
5946 case SO_RCVLOWAT:
5947 optval = so->so_rcv.sb_lowat;
5948 goto integer;
5949
5950 case SO_SNDTIMEO:
5951 case SO_RCVTIMEO:
5952 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5953 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5954
5955 error = sooptcopyout_timeval(sopt, &tv);
5956 break;
5957
5958 case SO_NOSIGPIPE:
5959 optval = (so->so_flags & SOF_NOSIGPIPE);
5960 goto integer;
5961
5962 case SO_NOADDRERR:
5963 optval = (so->so_flags & SOF_NOADDRAVAIL);
5964 goto integer;
5965
5966 case SO_REUSESHAREUID:
5967 optval = (so->so_flags & SOF_REUSESHAREUID);
5968 goto integer;
5969
5970
5971 case SO_NOTIFYCONFLICT:
5972 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5973 goto integer;
5974
5975 case SO_RESTRICTIONS:
5976 optval = so_get_restrictions(so);
5977 goto integer;
5978
5979 case SO_AWDL_UNRESTRICTED:
5980 if (SOCK_DOM(so) == PF_INET ||
5981 SOCK_DOM(so) == PF_INET6) {
5982 optval = inp_get_awdl_unrestricted(
5983 sotoinpcb(so));
5984 goto integer;
5985 } else {
5986 error = EOPNOTSUPP;
5987 }
5988 break;
5989
5990 case SO_INTCOPROC_ALLOW:
5991 if (SOCK_DOM(so) == PF_INET6) {
5992 optval = inp_get_intcoproc_allowed(
5993 sotoinpcb(so));
5994 goto integer;
5995 } else {
5996 error = EOPNOTSUPP;
5997 }
5998 break;
5999
6000 case SO_LABEL:
6001 error = EOPNOTSUPP;
6002 break;
6003
6004 case SO_PEERLABEL:
6005 error = EOPNOTSUPP;
6006 break;
6007
6008 #ifdef __APPLE_API_PRIVATE
6009 case SO_UPCALLCLOSEWAIT:
6010 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6011 goto integer;
6012 #endif
6013 case SO_RANDOMPORT:
6014 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6015 goto integer;
6016
6017 case SO_NP_EXTENSIONS: {
6018 struct so_np_extensions sonpx = {};
6019
6020 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6021 SONPX_SETOPTSHUT : 0;
6022 sonpx.npx_mask = SONPX_MASK_VALID;
6023
6024 error = sooptcopyout(sopt, &sonpx,
6025 sizeof(struct so_np_extensions));
6026 break;
6027 }
6028
6029 case SO_TRAFFIC_CLASS:
6030 optval = so->so_traffic_class;
6031 goto integer;
6032
6033 case SO_RECV_TRAFFIC_CLASS:
6034 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6035 goto integer;
6036
6037 #if (DEVELOPMENT || DEBUG)
6038 case SO_TRAFFIC_CLASS_DBG:
6039 error = sogetopt_tcdbg(so, sopt);
6040 break;
6041 #endif /* (DEVELOPMENT || DEBUG) */
6042
6043 case SO_PRIVILEGED_TRAFFIC_CLASS:
6044 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6045 goto integer;
6046
6047 case SO_DEFUNCTOK:
6048 optval = !(so->so_flags & SOF_NODEFUNCT);
6049 goto integer;
6050
6051 case SO_ISDEFUNCT:
6052 optval = (so->so_flags & SOF_DEFUNCT);
6053 goto integer;
6054
6055 case SO_OPPORTUNISTIC:
6056 optval = so_get_opportunistic(so);
6057 goto integer;
6058
6059 case SO_FLUSH:
6060 /* This option is not gettable */
6061 error = EINVAL;
6062 break;
6063
6064 case SO_RECV_ANYIF:
6065 optval = so_get_recv_anyif(so);
6066 goto integer;
6067
6068 case SO_TRAFFIC_MGT_BACKGROUND:
6069 /* This option is handled by lower layer(s) */
6070 if (so->so_proto != NULL &&
6071 so->so_proto->pr_ctloutput != NULL) {
6072 (void) so->so_proto->pr_ctloutput(so, sopt);
6073 }
6074 break;
6075
6076 #if FLOW_DIVERT
6077 case SO_FLOW_DIVERT_TOKEN:
6078 error = flow_divert_token_get(so, sopt);
6079 break;
6080 #endif /* FLOW_DIVERT */
6081
6082 #if NECP
6083 case SO_NECP_ATTRIBUTES:
6084 error = necp_get_socket_attributes(so, sopt);
6085 break;
6086
6087 case SO_NECP_CLIENTUUID: {
6088 uuid_t *ncu;
6089
6090 if (SOCK_DOM(so) == PF_MULTIPATH) {
6091 ncu = &mpsotomppcb(so)->necp_client_uuid;
6092 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6093 ncu = &sotoinpcb(so)->necp_client_uuid;
6094 } else {
6095 error = EINVAL;
6096 goto out;
6097 }
6098
6099 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6100 break;
6101 }
6102
6103 case SO_NECP_LISTENUUID: {
6104 uuid_t *nlu;
6105
6106 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6107 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6108 nlu = &sotoinpcb(so)->necp_client_uuid;
6109 } else {
6110 error = ENOENT;
6111 goto out;
6112 }
6113 } else {
6114 error = EINVAL;
6115 goto out;
6116 }
6117
6118 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6119 break;
6120 }
6121 #endif /* NECP */
6122
6123 #if CONTENT_FILTER
6124 case SO_CFIL_SOCK_ID: {
6125 cfil_sock_id_t sock_id;
6126
6127 sock_id = cfil_sock_id_from_socket(so);
6128
6129 error = sooptcopyout(sopt, &sock_id,
6130 sizeof(cfil_sock_id_t));
6131 break;
6132 }
6133 #endif /* CONTENT_FILTER */
6134
6135 case SO_EXTENDED_BK_IDLE:
6136 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6137 goto integer;
6138 case SO_MARK_CELLFALLBACK:
6139 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6140 ? 1 : 0;
6141 goto integer;
6142 case SO_NET_SERVICE_TYPE: {
6143 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6144 optval = so->so_netsvctype;
6145 } else {
6146 optval = NET_SERVICE_TYPE_BE;
6147 }
6148 goto integer;
6149 }
6150 case SO_NETSVC_MARKING_LEVEL:
6151 optval = so_get_netsvc_marking_level(so);
6152 goto integer;
6153
6154 case SO_MPKL_SEND_INFO: {
6155 struct so_mpkl_send_info so_mpkl_send_info;
6156
6157 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6158 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6159 error = sooptcopyout(sopt, &so_mpkl_send_info,
6160 sizeof(struct so_mpkl_send_info));
6161 break;
6162 }
6163 default:
6164 error = ENOPROTOOPT;
6165 break;
6166 }
6167 }
6168 out:
6169 if (dolock) {
6170 socket_unlock(so, 1);
6171 }
6172 return error;
6173 }
6174
6175 /*
6176 * The size limit on our soopt_getm() differs from that on FreeBSD.
6177 * We limit the size of options to MCLBYTES. This will have to change
6178 * if we need to define options that need more space than MCLBYTES.
6179 */
6180 int
6181 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6182 {
6183 struct mbuf *m, *m_prev;
6184 int sopt_size = sopt->sopt_valsize;
6185 int how;
6186
6187 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6188 return EMSGSIZE;
6189 }
6190
6191 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6192 MGET(m, how, MT_DATA);
6193 if (m == NULL) {
6194 return ENOBUFS;
6195 }
6196 if (sopt_size > MLEN) {
6197 MCLGET(m, how);
6198 if ((m->m_flags & M_EXT) == 0) {
6199 m_free(m);
6200 return ENOBUFS;
6201 }
6202 m->m_len = min(MCLBYTES, sopt_size);
6203 } else {
6204 m->m_len = min(MLEN, sopt_size);
6205 }
6206 sopt_size -= m->m_len;
6207 *mp = m;
6208 m_prev = m;
6209
6210 while (sopt_size > 0) {
6211 MGET(m, how, MT_DATA);
6212 if (m == NULL) {
6213 m_freem(*mp);
6214 return ENOBUFS;
6215 }
6216 if (sopt_size > MLEN) {
6217 MCLGET(m, how);
6218 if ((m->m_flags & M_EXT) == 0) {
6219 m_freem(*mp);
6220 m_freem(m);
6221 return ENOBUFS;
6222 }
6223 m->m_len = min(MCLBYTES, sopt_size);
6224 } else {
6225 m->m_len = min(MLEN, sopt_size);
6226 }
6227 sopt_size -= m->m_len;
6228 m_prev->m_next = m;
6229 m_prev = m;
6230 }
6231 return 0;
6232 }
6233
6234 /* copyin sopt data into mbuf chain */
6235 int
6236 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6237 {
6238 struct mbuf *m0 = m;
6239
6240 if (sopt->sopt_val == USER_ADDR_NULL) {
6241 return 0;
6242 }
6243 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6244 if (sopt->sopt_p != kernproc) {
6245 int error;
6246
6247 error = copyin(sopt->sopt_val, mtod(m, char *),
6248 m->m_len);
6249 if (error != 0) {
6250 m_freem(m0);
6251 return error;
6252 }
6253 } else {
6254 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6255 mtod(m, char *), m->m_len);
6256 }
6257 sopt->sopt_valsize -= m->m_len;
6258 sopt->sopt_val += m->m_len;
6259 m = m->m_next;
6260 }
6261 /* the chain should have been allocated with enough space by ip6_sooptmcopyin() */
6262 if (m != NULL) {
6263 panic("soopt_mcopyin");
6264 /* NOTREACHED */
6265 }
6266 return 0;
6267 }
6268
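/*
 * Illustrative sketch (not part of this file, never compiled): how a
 * protocol-level option handler might use soopt_getm() and
 * soopt_mcopyin() above to pull a user-supplied option buffer into an
 * mbuf chain.  The function name is hypothetical and error handling is
 * abbreviated.
 */
#if 0   /* example only */
static int
example_sopt_to_mbuf(struct socket *so, struct sockopt *sopt)
{
#pragma unused(so)
	struct mbuf *m = NULL;
	int error;

	error = soopt_getm(sopt, &m);           /* allocate chain, <= MCLBYTES */
	if (error != 0) {
		return error;
	}
	error = soopt_mcopyin(sopt, m);         /* frees the chain on error */
	if (error != 0) {
		return error;
	}
	/* normally 'm' would be handed to protocol code here */
	m_freem(m);                             /* placeholder in this sketch */
	return 0;
}
#endif
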
6269 /* copyout mbuf chain data into soopt */
6270 int
6271 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6272 {
6273 struct mbuf *m0 = m;
6274 size_t valsize = 0;
6275
6276 if (sopt->sopt_val == USER_ADDR_NULL) {
6277 return 0;
6278 }
6279 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6280 if (sopt->sopt_p != kernproc) {
6281 int error;
6282
6283 error = copyout(mtod(m, char *), sopt->sopt_val,
6284 m->m_len);
6285 if (error != 0) {
6286 m_freem(m0);
6287 return error;
6288 }
6289 } else {
6290 bcopy(mtod(m, char *),
6291 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6292 }
6293 sopt->sopt_valsize -= m->m_len;
6294 sopt->sopt_val += m->m_len;
6295 valsize += m->m_len;
6296 m = m->m_next;
6297 }
6298 if (m != NULL) {
6299 /* user-land should have supplied a large enough sopt buffer */
6300 m_freem(m0);
6301 return EINVAL;
6302 }
6303 sopt->sopt_valsize = valsize;
6304 return 0;
6305 }
6306
6307 void
6308 sohasoutofband(struct socket *so)
6309 {
6310 if (so->so_pgid < 0) {
6311 gsignal(-so->so_pgid, SIGURG);
6312 } else if (so->so_pgid > 0) {
6313 proc_signal(so->so_pgid, SIGURG);
6314 }
6315 selwakeup(&so->so_rcv.sb_sel);
6316 if (so->so_rcv.sb_flags & SB_KNOTE) {
6317 KNOTE(&so->so_rcv.sb_sel.si_note,
6318 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6319 }
6320 }
6321
6322 int
6323 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6324 {
6325 #pragma unused(cred)
6326 struct proc *p = current_proc();
6327 int revents = 0;
6328
6329 socket_lock(so, 1);
6330 so_update_last_owner_locked(so, PROC_NULL);
6331 so_update_policy(so);
6332
6333 if (events & (POLLIN | POLLRDNORM)) {
6334 if (soreadable(so)) {
6335 revents |= events & (POLLIN | POLLRDNORM);
6336 }
6337 }
6338
6339 if (events & (POLLOUT | POLLWRNORM)) {
6340 if (sowriteable(so)) {
6341 revents |= events & (POLLOUT | POLLWRNORM);
6342 }
6343 }
6344
6345 if (events & (POLLPRI | POLLRDBAND)) {
6346 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6347 revents |= events & (POLLPRI | POLLRDBAND);
6348 }
6349 }
6350
6351 if (revents == 0) {
6352 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6353 /*
6354 * Darwin sets the flag first,
6355 * BSD calls selrecord first
6356 */
6357 so->so_rcv.sb_flags |= SB_SEL;
6358 selrecord(p, &so->so_rcv.sb_sel, wql);
6359 }
6360
6361 if (events & (POLLOUT | POLLWRNORM)) {
6362 /*
6363 * Darwin sets the flag first,
6364 * BSD calls selrecord first
6365 */
6366 so->so_snd.sb_flags |= SB_SEL;
6367 selrecord(p, &so->so_snd.sb_sel, wql);
6368 }
6369 }
6370
6371 socket_unlock(so, 1);
6372 return revents;
6373 }
6374
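/*
 * Illustrative userspace sketch (not part of this file, never
 * compiled): sopoll() above reports POLLPRI/POLLRDBAND while the
 * socket has an out-of-band mark pending.  'sock' is assumed to be a
 * connected stream socket; errors are ignored.
 */
#if 0   /* example only */
#include <poll.h>

static int
example_oob_pending(int sock)
{
	struct pollfd pfd = { .fd = sock, .events = POLLPRI };

	/* non-blocking check: POLLPRI set once urgent data is signalled */
	return poll(&pfd, 1, 0) > 0 && (pfd.revents & POLLPRI);
}
#endif
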
6375 int
6376 soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6377 {
6378 struct socket *so = (struct socket *)fp->fp_glob->fg_data;
6379 int result;
6380
6381 socket_lock(so, 1);
6382 so_update_last_owner_locked(so, PROC_NULL);
6383 so_update_policy(so);
6384
6385 switch (kn->kn_filter) {
6386 case EVFILT_READ:
6387 kn->kn_filtid = EVFILTID_SOREAD;
6388 break;
6389 case EVFILT_WRITE:
6390 kn->kn_filtid = EVFILTID_SOWRITE;
6391 break;
6392 case EVFILT_SOCK:
6393 kn->kn_filtid = EVFILTID_SCK;
6394 break;
6395 case EVFILT_EXCEPT:
6396 kn->kn_filtid = EVFILTID_SOEXCEPT;
6397 break;
6398 default:
6399 socket_unlock(so, 1);
6400 knote_set_error(kn, EINVAL);
6401 return 0;
6402 }
6403
6404 /*
6405 * call the appropriate sub-filter attach
6406 * with the socket still locked
6407 */
6408 result = knote_fops(kn)->f_attach(kn, kev);
6409
6410 socket_unlock(so, 1);
6411
6412 return result;
6413 }
6414
6415 static int
6416 filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6417 {
6418 int retval = 0;
6419 int64_t data = 0;
6420
6421 if (so->so_options & SO_ACCEPTCONN) {
6422 /*
6423 * Radar 6615193: handle the listen case dynamically
6424 * for the kqueue read filter. This allows listen() to be
6425 * called after registering the kqueue EVFILT_READ filter.
6426 */
6427
6428 retval = !TAILQ_EMPTY(&so->so_comp);
6429 data = so->so_qlen;
6430 goto out;
6431 }
6432
6433 /* socket isn't a listener */
6434 /*
6435 * NOTE_LOWAT specifies new low water mark in data, i.e.
6436 * the bytes of protocol data. We therefore exclude any
6437 * control bytes.
6438 */
6439 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6440
6441 if (kn->kn_sfflags & NOTE_OOB) {
6442 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6443 kn->kn_fflags |= NOTE_OOB;
6444 data -= so->so_oobmark;
6445 retval = 1;
6446 goto out;
6447 }
6448 }
6449
6450 if ((so->so_state & SS_CANTRCVMORE)
6451 #if CONTENT_FILTER
6452 && cfil_sock_data_pending(&so->so_rcv) == 0
6453 #endif /* CONTENT_FILTER */
6454 ) {
6455 kn->kn_flags |= EV_EOF;
6456 kn->kn_fflags = so->so_error;
6457 retval = 1;
6458 goto out;
6459 }
6460
6461 if (so->so_error) { /* temporary udp error */
6462 retval = 1;
6463 goto out;
6464 }
6465
6466 int64_t lowwat = so->so_rcv.sb_lowat;
6467 /*
6468 * Ensure that when NOTE_LOWAT is used, the derived
6469 * low water mark is bounded by the receive buffer's
6470 * high and low water mark values.
6471 */
6472 if (kn->kn_sfflags & NOTE_LOWAT) {
6473 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6474 lowwat = so->so_rcv.sb_hiwat;
6475 } else if (kn->kn_sdata > lowwat) {
6476 lowwat = kn->kn_sdata;
6477 }
6478 }
6479
6480 /*
6481 * While the `data` field is the amount of data to read,
6482 * 0-sized packets need to wake up the kqueue, see 58140856,
6483 * so we need to take control bytes into account too.
6484 */
6485 retval = (so->so_rcv.sb_cc >= lowwat);
6486
6487 out:
6488 if (retval && kev) {
6489 knote_fill_kevent(kn, kev, data);
6490 }
6491 return retval;
6492 }
6493
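/*
 * Illustrative userspace sketch (not part of this file, never
 * compiled): registering an EVFILT_READ knote with NOTE_LOWAT on a
 * socket.  filt_soread_common() above clamps the requested low water
 * mark to the receive buffer's high water mark.  'kq' and 'sock' are
 * assumed to be a kqueue and a connected socket; errors are ignored.
 */
#if 0   /* example only */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static void
example_wait_for_64k(int kq, int sock)
{
	struct kevent kev;

	/* report readable only once at least 64KB is available (or EOF) */
	EV_SET(&kev, sock, EVFILT_READ, EV_ADD, NOTE_LOWAT, 64 * 1024, NULL);
	(void) kevent(kq, &kev, 1, NULL, 0, NULL);

	/* block until the read filter fires */
	(void) kevent(kq, NULL, 0, &kev, 1, NULL);
}
#endif
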
6494 static int
6495 filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6496 {
6497 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6498
6499 /* socket locked */
6500
6501 /*
6502 * If the caller explicitly asked for OOB results (e.g. poll())
6503 * from EVFILT_READ, then save that off in the hookid field
6504 * and reserve the kn_flags EV_OOBAND bit for output only.
6505 */
6506 if (kn->kn_filter == EVFILT_READ &&
6507 kn->kn_flags & EV_OOBAND) {
6508 kn->kn_flags &= ~EV_OOBAND;
6509 kn->kn_hook32 = EV_OOBAND;
6510 } else {
6511 kn->kn_hook32 = 0;
6512 }
6513 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6514 so->so_rcv.sb_flags |= SB_KNOTE;
6515 }
6516
6517 /* indicate whether the event has already fired */
6518 return filt_soread_common(kn, NULL, so);
6519 }
6520
6521 static void
6522 filt_sordetach(struct knote *kn)
6523 {
6524 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6525
6526 socket_lock(so, 1);
6527 if (so->so_rcv.sb_flags & SB_KNOTE) {
6528 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6529 so->so_rcv.sb_flags &= ~SB_KNOTE;
6530 }
6531 }
6532 socket_unlock(so, 1);
6533 }
6534
6535 /*ARGSUSED*/
6536 static int
6537 filt_soread(struct knote *kn, long hint)
6538 {
6539 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6540 int retval;
6541
6542 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6543 socket_lock(so, 1);
6544 }
6545
6546 retval = filt_soread_common(kn, NULL, so);
6547
6548 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6549 socket_unlock(so, 1);
6550 }
6551
6552 return retval;
6553 }
6554
6555 static int
6556 filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6557 {
6558 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6559 int retval;
6560
6561 socket_lock(so, 1);
6562
6563 /* save off the new input fflags and data */
6564 kn->kn_sfflags = kev->fflags;
6565 kn->kn_sdata = kev->data;
6566
6567 /* determine if changes result in fired events */
6568 retval = filt_soread_common(kn, NULL, so);
6569
6570 socket_unlock(so, 1);
6571
6572 return retval;
6573 }
6574
6575 static int
6576 filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6577 {
6578 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6579 int retval;
6580
6581 socket_lock(so, 1);
6582 retval = filt_soread_common(kn, kev, so);
6583 socket_unlock(so, 1);
6584
6585 return retval;
6586 }
6587
6588 int
6589 so_wait_for_if_feedback(struct socket *so)
6590 {
6591 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6592 (so->so_state & SS_ISCONNECTED)) {
6593 struct inpcb *inp = sotoinpcb(so);
6594 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6595 return 1;
6596 }
6597 }
6598 return 0;
6599 }
6600
6601 static int
6602 filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6603 {
6604 int ret = 0;
6605 int64_t data = sbspace(&so->so_snd);
6606
6607 if (so->so_state & SS_CANTSENDMORE) {
6608 kn->kn_flags |= EV_EOF;
6609 kn->kn_fflags = so->so_error;
6610 ret = 1;
6611 goto out;
6612 }
6613
6614 if (so->so_error) { /* temporary udp error */
6615 ret = 1;
6616 goto out;
6617 }
6618
6619 if (!socanwrite(so)) {
6620 ret = 0;
6621 goto out;
6622 }
6623
6624 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6625 ret = 1;
6626 goto out;
6627 }
6628
6629 int64_t lowwat = so->so_snd.sb_lowat;
6630
6631 if (kn->kn_sfflags & NOTE_LOWAT) {
6632 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6633 lowwat = so->so_snd.sb_hiwat;
6634 } else if (kn->kn_sdata > lowwat) {
6635 lowwat = kn->kn_sdata;
6636 }
6637 }
6638
6639 if (data >= lowwat) {
6640 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6641 #if (DEBUG || DEVELOPMENT)
6642 && so_notsent_lowat_check == 1
6643 #endif /* DEBUG || DEVELOPMENT */
6644 ) {
6645 if ((SOCK_DOM(so) == PF_INET ||
6646 SOCK_DOM(so) == PF_INET6) &&
6647 so->so_type == SOCK_STREAM) {
6648 ret = tcp_notsent_lowat_check(so);
6649 }
6650 #if MPTCP
6651 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6652 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6653 ret = mptcp_notsent_lowat_check(so);
6654 }
6655 #endif
6656 else {
6657 ret = 1;
6658 goto out;
6659 }
6660 } else {
6661 ret = 1;
6662 }
6663 }
6664 if (so_wait_for_if_feedback(so)) {
6665 ret = 0;
6666 }
6667
6668 out:
6669 if (ret && kev) {
6670 knote_fill_kevent(kn, kev, data);
6671 }
6672 return ret;
6673 }
6674
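/*
 * Illustrative userspace sketch (not part of this file, never
 * compiled): with TCP_NOTSENT_LOWAT set (assumed to be available from
 * <netinet/tcp.h>), filt_sowrite_common() above defers write readiness
 * until tcp_notsent_lowat_check() passes, i.e. until the unsent data
 * drops below the threshold.  Errors are ignored.
 */
#if 0   /* example only */
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static void
example_set_notsent_lowat(int sock)
{
	int lowat = 16 * 1024;  /* wake the writer when < 16KB is unsent */

	(void) setsockopt(sock, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
	    &lowat, sizeof(lowat));
}
#endif
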
6675 static int
6676 filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6677 {
6678 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6679
6680 /* socket locked */
6681 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6682 so->so_snd.sb_flags |= SB_KNOTE;
6683 }
6684
6685 /* determine if it's already fired */
6686 return filt_sowrite_common(kn, NULL, so);
6687 }
6688
6689 static void
6690 filt_sowdetach(struct knote *kn)
6691 {
6692 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6693 socket_lock(so, 1);
6694
6695 if (so->so_snd.sb_flags & SB_KNOTE) {
6696 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6697 so->so_snd.sb_flags &= ~SB_KNOTE;
6698 }
6699 }
6700 socket_unlock(so, 1);
6701 }
6702
6703 /*ARGSUSED*/
6704 static int
6705 filt_sowrite(struct knote *kn, long hint)
6706 {
6707 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6708 int ret;
6709
6710 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6711 socket_lock(so, 1);
6712 }
6713
6714 ret = filt_sowrite_common(kn, NULL, so);
6715
6716 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6717 socket_unlock(so, 1);
6718 }
6719
6720 return ret;
6721 }
6722
6723 static int
6724 filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6725 {
6726 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6727 int ret;
6728
6729 socket_lock(so, 1);
6730
6731 /* save off the new input fflags and data */
6732 kn->kn_sfflags = kev->fflags;
6733 kn->kn_sdata = kev->data;
6734
6735 /* determine if these changes result in a triggered event */
6736 ret = filt_sowrite_common(kn, NULL, so);
6737
6738 socket_unlock(so, 1);
6739
6740 return ret;
6741 }
6742
6743 static int
6744 filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6745 {
6746 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6747 int ret;
6748
6749 socket_lock(so, 1);
6750 ret = filt_sowrite_common(kn, kev, so);
6751 socket_unlock(so, 1);
6752
6753 return ret;
6754 }
6755
6756 static int
6757 filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6758 struct socket *so, long ev_hint)
6759 {
6760 int ret = 0;
6761 int64_t data = 0;
6762 uint32_t level_trigger = 0;
6763
6764 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6765 kn->kn_fflags |= NOTE_CONNRESET;
6766 }
6767 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6768 kn->kn_fflags |= NOTE_TIMEOUT;
6769 }
6770 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6771 kn->kn_fflags |= NOTE_NOSRCADDR;
6772 }
6773 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6774 kn->kn_fflags |= NOTE_IFDENIED;
6775 }
6776 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6777 kn->kn_fflags |= NOTE_KEEPALIVE;
6778 }
6779 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6780 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6781 }
6782 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6783 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6784 }
6785 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6786 (so->so_state & SS_ISCONNECTED)) {
6787 kn->kn_fflags |= NOTE_CONNECTED;
6788 level_trigger |= NOTE_CONNECTED;
6789 }
6790 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6791 (so->so_state & SS_ISDISCONNECTED)) {
6792 kn->kn_fflags |= NOTE_DISCONNECTED;
6793 level_trigger |= NOTE_DISCONNECTED;
6794 }
6795 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6796 if (so->so_proto != NULL &&
6797 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6798 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6799 }
6800 }
6801
6802 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6803 tcp_notify_ack_active(so)) {
6804 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6805 }
6806
6807 if ((so->so_state & SS_CANTRCVMORE)
6808 #if CONTENT_FILTER
6809 && cfil_sock_data_pending(&so->so_rcv) == 0
6810 #endif /* CONTENT_FILTER */
6811 ) {
6812 kn->kn_fflags |= NOTE_READCLOSED;
6813 level_trigger |= NOTE_READCLOSED;
6814 }
6815
6816 if (so->so_state & SS_CANTSENDMORE) {
6817 kn->kn_fflags |= NOTE_WRITECLOSED;
6818 level_trigger |= NOTE_WRITECLOSED;
6819 }
6820
6821 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6822 (so->so_flags & SOF_SUSPENDED)) {
6823 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6824
6825 /* If resume event was delivered before, reset it */
6826 kn->kn_hook32 &= ~NOTE_RESUME;
6827
6828 kn->kn_fflags |= NOTE_SUSPEND;
6829 level_trigger |= NOTE_SUSPEND;
6830 }
6831
6832 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6833 (so->so_flags & SOF_SUSPENDED) == 0) {
6834 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6835
6836 /* If suspend event was delivered before, reset it */
6837 kn->kn_hook32 &= ~NOTE_SUSPEND;
6838
6839 kn->kn_fflags |= NOTE_RESUME;
6840 level_trigger |= NOTE_RESUME;
6841 }
6842
6843 if (so->so_error != 0) {
6844 ret = 1;
6845 data = so->so_error;
6846 kn->kn_flags |= EV_EOF;
6847 } else {
6848 u_int32_t data32 = 0;
6849 get_sockev_state(so, &data32);
6850 data = data32;
6851 }
6852
6853 /* Reset any events that are not requested on this knote */
6854 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6855 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6856
6857 /* Find the level-triggered events that are already delivered */
6858 level_trigger &= kn->kn_hook32;
6859 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6860
6861 /* Do not deliver level-triggered events more than once */
6862 if ((kn->kn_fflags & ~level_trigger) != 0) {
6863 ret = 1;
6864 }
6865
6866 if (ret && kev) {
6867 /*
6868 * Store the state of the events being delivered. This
6869 * state can be used to deliver level-triggered events
6870 * at least once and still avoid waking up the application
6871 * multiple times as long as the event is active.
6872 */
6873 if (kn->kn_fflags != 0) {
6874 kn->kn_hook32 |= (kn->kn_fflags &
6875 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6876 }
6877
6878 /*
6879 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6880 * only one of them, and remember which one was
6881 * delivered last
6882 */
6883 if (kn->kn_fflags & NOTE_SUSPEND) {
6884 kn->kn_hook32 &= ~NOTE_RESUME;
6885 }
6886 if (kn->kn_fflags & NOTE_RESUME) {
6887 kn->kn_hook32 &= ~NOTE_SUSPEND;
6888 }
6889
6890 knote_fill_kevent(kn, kev, data);
6891 }
6892 return ret;
6893 }
6894
6895 static int
6896 filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6897 {
6898 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6899
6900 /* socket locked */
6901 kn->kn_hook32 = 0;
6902 if (KNOTE_ATTACH(&so->so_klist, kn)) {
6903 so->so_flags |= SOF_KNOTE;
6904 }
6905
6906 /* determine whether the event has already fired */
6907 return filt_sockev_common(kn, NULL, so, 0);
6908 }
6909
6910 static void
6911 filt_sockdetach(struct knote *kn)
6912 {
6913 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6914 socket_lock(so, 1);
6915
6916 if ((so->so_flags & SOF_KNOTE) != 0) {
6917 if (KNOTE_DETACH(&so->so_klist, kn)) {
6918 so->so_flags &= ~SOF_KNOTE;
6919 }
6920 }
6921 socket_unlock(so, 1);
6922 }
6923
6924 static int
6925 filt_sockev(struct knote *kn, long hint)
6926 {
6927 int ret = 0, locked = 0;
6928 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6929 long ev_hint = (hint & SO_FILT_HINT_EV);
6930
6931 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6932 socket_lock(so, 1);
6933 locked = 1;
6934 }
6935
6936 ret = filt_sockev_common(kn, NULL, so, ev_hint);
6937
6938 if (locked) {
6939 socket_unlock(so, 1);
6940 }
6941
6942 return ret;
6943 }
6944
6945
6946
6947 /*
6948 * filt_socktouch - update event state
6949 */
6950 static int
6951 filt_socktouch(
6952 struct knote *kn,
6953 struct kevent_qos_s *kev)
6954 {
6955 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6956 uint32_t changed_flags;
6957 int ret;
6958
6959 socket_lock(so, 1);
6960
6961 /* note which flags differ between the previous interest set and the delivered-event state */
6962 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
6963
6964 /* save off the new input fflags and data */
6965 kn->kn_sfflags = kev->fflags;
6966 kn->kn_sdata = kev->data;
6967
6968 /* restrict the current results to the (smaller?) set of new interest */
6969 /*
6970 * For compatibility with previous implementations, we leave kn_fflags
6971 * as they were before.
6972 */
6973 //kn->kn_fflags &= kev->fflags;
6974
6975 /*
6976 * Since we keep track of events that are already
6977 * delivered, if any of those events are not requested
6978 * anymore the state related to them can be reset
6979 */
6980 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6981
6982 /* determine if we have events to deliver */
6983 ret = filt_sockev_common(kn, NULL, so, 0);
6984
6985 socket_unlock(so, 1);
6986
6987 return ret;
6988 }
6989
6990 /*
6991 * filt_sockprocess - query event fired state and return data
6992 */
6993 static int
6994 filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
6995 {
6996 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
6997 int ret = 0;
6998
6999 socket_lock(so, 1);
7000
7001 ret = filt_sockev_common(kn, kev, so, 0);
7002
7003 socket_unlock(so, 1);
7004
7005 return ret;
7006 }
7007
7008 void
7009 get_sockev_state(struct socket *so, u_int32_t *statep)
7010 {
7011 u_int32_t state = *(statep);
7012
7013 /*
7014 * If the state variable has already been set by a previous event,
7015 * keep that value.
7016 */
7017 if (state != 0) {
7018 return;
7019 }
7020
7021 if (so->so_state & SS_ISCONNECTED) {
7022 state |= SOCKEV_CONNECTED;
7023 } else {
7024 state &= ~(SOCKEV_CONNECTED);
7025 }
7026 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7027 *(statep) = state;
7028 }
7029
7030 #define SO_LOCK_HISTORY_STR_LEN \
7031 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7032
7033 __private_extern__ const char *
7034 solockhistory_nr(struct socket *so)
7035 {
7036 size_t n = 0;
7037 int i;
7038 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7039
7040 bzero(lock_history_str, sizeof(lock_history_str));
7041 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7042 n += scnprintf(lock_history_str + n,
7043 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7044 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7045 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7046 }
7047 return lock_history_str;
7048 }
7049
7050 lck_mtx_t *
7051 socket_getlock(struct socket *so, int flags)
7052 {
7053 if (so->so_proto->pr_getlock != NULL) {
7054 return (*so->so_proto->pr_getlock)(so, flags);
7055 } else {
7056 return so->so_proto->pr_domain->dom_mtx;
7057 }
7058 }
7059
7060 void
7061 socket_lock(struct socket *so, int refcount)
7062 {
7063 void *lr_saved;
7064
7065 lr_saved = __builtin_return_address(0);
7066
7067 if (so->so_proto->pr_lock) {
7068 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7069 } else {
7070 #ifdef MORE_LOCKING_DEBUG
7071 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7072 LCK_MTX_ASSERT_NOTOWNED);
7073 #endif
7074 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7075 if (refcount) {
7076 so->so_usecount++;
7077 }
7078 so->lock_lr[so->next_lock_lr] = lr_saved;
7079 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7080 }
7081 }
7082
7083 void
7084 socket_lock_assert_owned(struct socket *so)
7085 {
7086 lck_mtx_t *mutex_held;
7087
7088 if (so->so_proto->pr_getlock != NULL) {
7089 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7090 } else {
7091 mutex_held = so->so_proto->pr_domain->dom_mtx;
7092 }
7093
7094 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7095 }
7096
7097 int
7098 socket_try_lock(struct socket *so)
7099 {
7100 lck_mtx_t *mtx;
7101
7102 if (so->so_proto->pr_getlock != NULL) {
7103 mtx = (*so->so_proto->pr_getlock)(so, 0);
7104 } else {
7105 mtx = so->so_proto->pr_domain->dom_mtx;
7106 }
7107
7108 return lck_mtx_try_lock(mtx);
7109 }
7110
7111 void
7112 socket_unlock(struct socket *so, int refcount)
7113 {
7114 void *lr_saved;
7115 lck_mtx_t *mutex_held;
7116
7117 lr_saved = __builtin_return_address(0);
7118
7119 if (so == NULL || so->so_proto == NULL) {
7120 panic("%s: null so_proto so=%p\n", __func__, so);
7121 /* NOTREACHED */
7122 }
7123
7124 if (so->so_proto->pr_unlock) {
7125 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7126 } else {
7127 mutex_held = so->so_proto->pr_domain->dom_mtx;
7128 #ifdef MORE_LOCKING_DEBUG
7129 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7130 #endif
7131 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7132 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7133
7134 if (refcount) {
7135 if (so->so_usecount <= 0) {
7136 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7137 "lrh=%s", __func__, so->so_usecount, so,
7138 SOCK_DOM(so), so->so_type,
7139 SOCK_PROTO(so), solockhistory_nr(so));
7140 /* NOTREACHED */
7141 }
7142
7143 so->so_usecount--;
7144 if (so->so_usecount == 0) {
7145 sofreelastref(so, 1);
7146 }
7147 }
7148 lck_mtx_unlock(mutex_held);
7149 }
7150 }
7151
7152 /* Called with socket locked, will unlock socket */
7153 void
7154 sofree(struct socket *so)
7155 {
7156 lck_mtx_t *mutex_held;
7157
7158 if (so->so_proto->pr_getlock != NULL) {
7159 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7160 } else {
7161 mutex_held = so->so_proto->pr_domain->dom_mtx;
7162 }
7163 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7164
7165 sofreelastref(so, 0);
7166 }
7167
7168 void
7169 soreference(struct socket *so)
7170 {
7171 socket_lock(so, 1); /* lock and take one reference on the socket */
7172 socket_unlock(so, 0); /* unlock only */
7173 }
7174
7175 void
7176 sodereference(struct socket *so)
7177 {
7178 socket_lock(so, 0);
7179 socket_unlock(so, 1);
7180 }
7181
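/*
 * Illustrative sketch (not part of this file, never compiled): the
 * usual pattern when the socket lock must be dropped temporarily is to
 * hold a use count across the window, exactly as soreference() and
 * sodereference() above do, so the socket cannot be freed underneath
 * the caller.  The function name is hypothetical.
 */
#if 0   /* example only */
static void
example_unlocked_work(struct socket *so)
{
	socket_lock(so, 1);     /* lock and take a use-count reference */
	/* ... examine or modify socket state ... */
	socket_unlock(so, 0);   /* drop the lock, keep the reference */

	/* ... do work that must not hold the socket lock ... */

	socket_lock(so, 0);     /* re-acquire the lock */
	socket_unlock(so, 1);   /* unlock and drop the reference */
}
#endif
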
7182 /*
7183 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7184 * possibility of using jumbo clusters. The caller must hold
7185 * the socket lock.
7186 */
7187 void
7188 somultipages(struct socket *so, boolean_t set)
7189 {
7190 if (set) {
7191 so->so_flags |= SOF_MULTIPAGES;
7192 } else {
7193 so->so_flags &= ~SOF_MULTIPAGES;
7194 }
7195 }
7196
7197 void
7198 soif2kcl(struct socket *so, boolean_t set)
7199 {
7200 if (set) {
7201 so->so_flags1 |= SOF1_IF_2KCL;
7202 } else {
7203 so->so_flags1 &= ~SOF1_IF_2KCL;
7204 }
7205 }
7206
7207 int
7208 so_isdstlocal(struct socket *so)
7209 {
7210 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7211
7212 if (SOCK_DOM(so) == PF_INET) {
7213 return inaddr_local(inp->inp_faddr);
7214 } else if (SOCK_DOM(so) == PF_INET6) {
7215 return in6addr_local(&inp->in6p_faddr);
7216 }
7217
7218 return 0;
7219 }
7220
7221 int
7222 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7223 {
7224 struct sockbuf *rcv, *snd;
7225 int err = 0, defunct;
7226
7227 rcv = &so->so_rcv;
7228 snd = &so->so_snd;
7229
7230 defunct = (so->so_flags & SOF_DEFUNCT);
7231 if (defunct) {
7232 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7233 panic("%s: SB_DROP not set", __func__);
7234 /* NOTREACHED */
7235 }
7236 goto done;
7237 }
7238
7239 if (so->so_flags & SOF_NODEFUNCT) {
7240 if (noforce) {
7241 err = EOPNOTSUPP;
7242 if (p != PROC_NULL) {
7243 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7244 "name %s level %d) so 0x%llx [%d,%d] "
7245 "is not eligible for defunct "
7246 "(%d)\n", __func__, proc_selfpid(),
7247 proc_best_name(current_proc()), proc_pid(p),
7248 proc_best_name(p), level,
7249 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7250 SOCK_DOM(so), SOCK_TYPE(so), err);
7251 }
7252 return err;
7253 }
7254 so->so_flags &= ~SOF_NODEFUNCT;
7255 if (p != PROC_NULL) {
7256 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7257 "name %s level %d) so 0x%llx [%d,%d] "
7258 "defunct by force "
7259 "(%d)\n", __func__, proc_selfpid(),
7260 proc_best_name(current_proc()), proc_pid(p),
7261 proc_best_name(p), level,
7262 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7263 SOCK_DOM(so), SOCK_TYPE(so), err);
7264 }
7265 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7266 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7267 struct ifnet *ifp = inp->inp_last_outifp;
7268
7269 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7270 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7271 } else if (so->so_flags & SOF_DELEGATED) {
7272 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7273 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7274 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7275 } else if (noforce && p != PROC_NULL) {
7276 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7277
7278 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7279 so->so_extended_bk_start = net_uptime();
7280 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7281
7282 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7283
7284 err = EOPNOTSUPP;
7285 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7286 "name %s level %d) so 0x%llx [%d,%d] "
7287 "extend bk idle "
7288 "(%d)\n", __func__, proc_selfpid(),
7289 proc_best_name(current_proc()), proc_pid(p),
7290 proc_best_name(p), level,
7291 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7292 SOCK_DOM(so), SOCK_TYPE(so), err);
7293 return err;
7294 } else {
7295 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7296 }
7297 }
7298
7299 so->so_flags |= SOF_DEFUNCT;
7300
7301 /* Prevent further data from being appended to the socket buffers */
7302 snd->sb_flags |= SB_DROP;
7303 rcv->sb_flags |= SB_DROP;
7304
7305 /* Flush any existing data in the socket buffers */
7306 if (rcv->sb_cc != 0) {
7307 rcv->sb_flags &= ~SB_SEL;
7308 selthreadclear(&rcv->sb_sel);
7309 sbrelease(rcv);
7310 }
7311 if (snd->sb_cc != 0) {
7312 snd->sb_flags &= ~SB_SEL;
7313 selthreadclear(&snd->sb_sel);
7314 sbrelease(snd);
7315 }
7316
7317 done:
7318 if (p != PROC_NULL) {
7319 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7320 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7321 proc_selfpid(), proc_best_name(current_proc()),
7322 proc_pid(p), proc_best_name(p), level,
7323 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7324 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7325 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7326 " extbkidle" : "");
7327 }
7328 return err;
7329 }
7330
7331 int
7332 sodefunct(struct proc *p, struct socket *so, int level)
7333 {
7334 struct sockbuf *rcv, *snd;
7335
7336 if (!(so->so_flags & SOF_DEFUNCT)) {
7337 panic("%s improperly called", __func__);
7338 /* NOTREACHED */
7339 }
7340 if (so->so_state & SS_DEFUNCT) {
7341 goto done;
7342 }
7343
7344 rcv = &so->so_rcv;
7345 snd = &so->so_snd;
7346
7347 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7348 char s[MAX_IPv6_STR_LEN];
7349 char d[MAX_IPv6_STR_LEN];
7350 struct inpcb *inp = sotoinpcb(so);
7351
7352 if (p != PROC_NULL) {
7353 SODEFUNCTLOG(
7354 "%s[%d, %s]: (target pid %d name %s level %d) "
7355 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7356 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7357 " snd_fl 0x%x]\n", __func__,
7358 proc_selfpid(), proc_best_name(current_proc()),
7359 proc_pid(p), proc_best_name(p), level,
7360 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7361 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7362 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7363 (void *)&inp->inp_laddr.s_addr :
7364 (void *)&inp->in6p_laddr),
7365 s, sizeof(s)), ntohs(inp->in6p_lport),
7366 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7367 (void *)&inp->inp_faddr.s_addr :
7368 (void *)&inp->in6p_faddr,
7369 d, sizeof(d)), ntohs(inp->in6p_fport),
7370 (uint32_t)rcv->sb_sel.si_flags,
7371 (uint32_t)snd->sb_sel.si_flags,
7372 rcv->sb_flags, snd->sb_flags);
7373 }
7374 } else if (p != PROC_NULL) {
7375 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7376 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7377 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7378 proc_selfpid(), proc_best_name(current_proc()),
7379 proc_pid(p), proc_best_name(p), level,
7380 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7381 SOCK_DOM(so), SOCK_TYPE(so),
7382 (uint32_t)rcv->sb_sel.si_flags,
7383 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7384 snd->sb_flags);
7385 }
7386
7387 /*
7388 * Unwedge threads blocked on sbwait() and sb_lock().
7389 */
7390 sbwakeup(rcv);
7391 sbwakeup(snd);
7392
7393 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7394 if (rcv->sb_flags & SB_LOCK) {
7395 sbunlock(rcv, TRUE); /* keep socket locked */
7396 }
7397 if (snd->sb_flags & SB_LOCK) {
7398 sbunlock(snd, TRUE); /* keep socket locked */
7399 }
7400 /*
7401 * Flush the buffers and disconnect. We explicitly call shutdown
7402 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7403 * states are set for the socket. This would also flush out data
7404 * hanging off the receive list of this socket.
7405 */
7406 (void) soshutdownlock_final(so, SHUT_RD);
7407 (void) soshutdownlock_final(so, SHUT_WR);
7408 (void) sodisconnectlocked(so);
7409
7410 /*
7411 * Explicitly handle connectionless-protocol disconnection
7412 * and release any remaining data in the socket buffers.
7413 */
7414 if (!(so->so_state & SS_ISDISCONNECTED)) {
7415 (void) soisdisconnected(so);
7416 }
7417
7418 if (so->so_error == 0) {
7419 so->so_error = EBADF;
7420 }
7421
7422 if (rcv->sb_cc != 0) {
7423 rcv->sb_flags &= ~SB_SEL;
7424 selthreadclear(&rcv->sb_sel);
7425 sbrelease(rcv);
7426 }
7427 if (snd->sb_cc != 0) {
7428 snd->sb_flags &= ~SB_SEL;
7429 selthreadclear(&snd->sb_sel);
7430 sbrelease(snd);
7431 }
7432 so->so_state |= SS_DEFUNCT;
7433 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7434
7435 done:
7436 return 0;
7437 }
7438
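/*
 * Illustrative sketch (not part of this file, never compiled):
 * defuncting is a two-step operation -- sosetdefunct() marks the
 * socket and checks eligibility, then sodefunct() tears down its
 * buffers.  The same pattern is used by so_stop_extended_bk_idle()
 * below.  The function name is hypothetical.
 */
#if 0   /* example only */
static void
example_force_defunct(struct socket *so)
{
	(void) sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		(void) sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
#endif
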
7439 int
7440 soresume(struct proc *p, struct socket *so, int locked)
7441 {
7442 if (locked == 0) {
7443 socket_lock(so, 1);
7444 }
7445
7446 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7447 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7448 "[%d,%d] resumed from bk idle\n",
7449 __func__, proc_selfpid(), proc_best_name(current_proc()),
7450 proc_pid(p), proc_best_name(p),
7451 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7452 SOCK_DOM(so), SOCK_TYPE(so));
7453
7454 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7455 so->so_extended_bk_start = 0;
7456 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7457
7458 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7459 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7460 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7461 }
7462 if (locked == 0) {
7463 socket_unlock(so, 1);
7464 }
7465
7466 return 0;
7467 }
7468
7469 /*
7470 * Does not attempt to account for sockets that are delegated from
7471 * the current process
7472 */
7473 int
7474 so_set_extended_bk_idle(struct socket *so, int optval)
7475 {
7476 int error = 0;
7477
7478 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7479 SOCK_PROTO(so) != IPPROTO_TCP) {
7480 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7481 error = EOPNOTSUPP;
7482 } else if (optval == 0) {
7483 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7484
7485 soresume(current_proc(), so, 1);
7486 } else {
7487 struct proc *p = current_proc();
7488 struct fileproc *fp;
7489 int count = 0;
7490
7491 /*
7492 * Unlock socket to avoid lock ordering issue with
7493 * the proc fd table lock
7494 */
7495 socket_unlock(so, 0);
7496
7497 proc_fdlock(p);
7498 fdt_foreach(fp, p) {
7499 struct socket *so2;
7500
7501 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7502 continue;
7503 }
7504
7505 so2 = (struct socket *)fp->fp_glob->fg_data;
7506 if (so != so2 &&
7507 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7508 count++;
7509 }
7510 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7511 break;
7512 }
7513 }
7514 proc_fdunlock(p);
7515
7516 socket_lock(so, 0);
7517
7518 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7519 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7520 error = EBUSY;
7521 } else if (so->so_flags & SOF_DELEGATED) {
7522 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7523 error = EBUSY;
7524 } else {
7525 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7526 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7527 }
7528 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7529 "%s marked for extended bk idle\n",
7530 __func__, proc_selfpid(), proc_best_name(current_proc()),
7531 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7532 SOCK_DOM(so), SOCK_TYPE(so),
7533 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7534 "is" : "not");
7535 }
7536
7537 return error;
7538 }
7539
7540 static void
7541 so_stop_extended_bk_idle(struct socket *so)
7542 {
7543 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7544 so->so_extended_bk_start = 0;
7545
7546 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7547 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7548 /*
7549 * Force defunct
7550 */
7551 sosetdefunct(current_proc(), so,
7552 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7553 if (so->so_flags & SOF_DEFUNCT) {
7554 sodefunct(current_proc(), so,
7555 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7556 }
7557 }
7558
7559 void
7560 so_drain_extended_bk_idle(struct socket *so)
7561 {
7562 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7563 /*
7564 * Only penalize sockets that have outstanding data
7565 */
7566 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7567 so_stop_extended_bk_idle(so);
7568
7569 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7570 }
7571 }
7572 }
7573
7574 /*
7575 * The return value tells whether the socket is still in extended background idle
7576 */
7577 int
7578 so_check_extended_bk_idle_time(struct socket *so)
7579 {
7580 int ret = 1;
7581
7582 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7583 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7584 __func__, proc_selfpid(), proc_best_name(current_proc()),
7585 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7586 SOCK_DOM(so), SOCK_TYPE(so));
7587 if (net_uptime() - so->so_extended_bk_start >
7588 soextbkidlestat.so_xbkidle_time) {
7589 so_stop_extended_bk_idle(so);
7590
7591 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7592
7593 ret = 0;
7594 } else {
7595 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7596
7597 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7598 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7599 }
7600 }
7601
7602 return ret;
7603 }
7604
7605 void
7606 resume_proc_sockets(proc_t p)
7607 {
7608 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7609 struct fileproc *fp;
7610 struct socket *so;
7611
7612 proc_fdlock(p);
7613 fdt_foreach(fp, p) {
7614 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7615 continue;
7616 }
7617
7618 so = (struct socket *)fp->fp_glob->fg_data;
7619 (void) soresume(p, so, 0);
7620 }
7621 proc_fdunlock(p);
7622
7623 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7624 }
7625 }
7626
7627 __private_extern__ int
7628 so_set_recv_anyif(struct socket *so, int optval)
7629 {
7630 int ret = 0;
7631
7632 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7633 if (optval) {
7634 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7635 } else {
7636 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7637 }
7638 }
7639
7640
7641 return ret;
7642 }
7643
7644 __private_extern__ int
7645 so_get_recv_anyif(struct socket *so)
7646 {
7647 int ret = 0;
7648
7649 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7650 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7651 }
7652
7653 return ret;
7654 }
7655
7656 int
7657 so_set_restrictions(struct socket *so, uint32_t vals)
7658 {
7659 int nocell_old, nocell_new;
7660 int noexpensive_old, noexpensive_new;
7661 int noconstrained_old, noconstrained_new;
7662
7663 /*
7664 * Deny-type restrictions are trapdoors; once set they cannot be
7665 * unset for the lifetime of the socket. This allows them to be
7666 * issued by a framework on behalf of the application without
7667 * having to worry that they can be undone.
7668 *
7669 * Note here that socket-level restrictions override any protocol-
7670 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7671 * restriction issued on the socket has a higher precedence
7672 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7673 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7674 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7675 */
7676 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7677 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7678 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7679 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7680 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7681 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7682 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7683 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7684 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7685
7686 /* we can only set, not clear restrictions */
7687 if ((nocell_new - nocell_old) == 0 &&
7688 (noexpensive_new - noexpensive_old) == 0 &&
7689 (noconstrained_new - noconstrained_old) == 0) {
7690 return 0;
7691 }
7692 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7693 if (nocell_new - nocell_old != 0) {
7694 /*
7695 * if deny cellular is now set, do what's needed
7696 * for INPCB
7697 */
7698 inp_set_nocellular(sotoinpcb(so));
7699 }
7700 if (noexpensive_new - noexpensive_old != 0) {
7701 inp_set_noexpensive(sotoinpcb(so));
7702 }
7703 if (noconstrained_new - noconstrained_old != 0) {
7704 inp_set_noconstrained(sotoinpcb(so));
7705 }
7706 }
7707
7708 if (SOCK_DOM(so) == PF_MULTIPATH) {
7709 mptcp_set_restrictions(so);
7710 }
7711
7712 return 0;
7713 }
7714
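/*
 * Illustrative sketch (not part of this file, never compiled):
 * deny-type restrictions are sticky.  Passing 0 on a later call does
 * not clear a previously set restriction, since so_set_restrictions()
 * only ORs new deny bits into so_restrictions.  The function name is
 * hypothetical.
 */
#if 0   /* example only */
static void
example_restrictions(struct socket *so)
{
	(void) so_set_restrictions(so, SO_RESTRICT_DENY_CELLULAR);
	(void) so_set_restrictions(so, 0);  /* no effect: still denied */
	/* so_get_restrictions(so) still includes SO_RESTRICT_DENY_CELLULAR */
}
#endif
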
7715 uint32_t
7716 so_get_restrictions(struct socket *so)
7717 {
7718 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7719 SO_RESTRICT_DENY_OUT |
7720 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7721 }
7722
7723 int
7724 so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7725 {
7726 struct proc *ep = PROC_NULL;
7727 int error = 0;
7728
7729 /* pid 0 is reserved for kernel */
7730 if (epid == 0) {
7731 error = EINVAL;
7732 goto done;
7733 }
7734
7735 /*
7736 * If this is an in-kernel socket, prevent its delegate
7737 * association from changing unless the socket option is
7738 * coming from within the kernel itself.
7739 */
7740 if (so->last_pid == 0 && p != kernproc) {
7741 error = EACCES;
7742 goto done;
7743 }
7744
7745 /*
7746 * If this is issued by a process that's recorded as the
7747 * real owner of the socket, or if the pid is the same as
7748 * the process's own pid, then proceed. Otherwise ensure
7749 * that the issuing process has the necessary privileges.
7750 */
7751 if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7752 if ((error = priv_check_cred(kauth_cred_get(),
7753 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7754 error = EACCES;
7755 goto done;
7756 }
7757 }
7758
7759 /* Find the process that corresponds to the effective pid */
7760 if ((ep = proc_find(epid)) == PROC_NULL) {
7761 error = ESRCH;
7762 goto done;
7763 }
7764
7765 /*
7766 * If a process tries to delegate the socket to itself, then
7767 * there's really nothing to do; treat it as a way for the
7768 * delegate association to be cleared. Note that we check
7769 * the passed-in proc rather than calling proc_selfpid(),
7770 * as we need to check the process issuing the socket option
7771 * which could be kernproc. Given that we don't allow 0 for
7772 * effective pid, it means that a delegated in-kernel socket
7773 * stays delegated during its lifetime (which is probably OK.)
7774 */
7775 if (epid == proc_pid(p)) {
7776 so->so_flags &= ~SOF_DELEGATED;
7777 so->e_upid = 0;
7778 so->e_pid = 0;
7779 uuid_clear(so->e_uuid);
7780 } else {
7781 so->so_flags |= SOF_DELEGATED;
7782 so->e_upid = proc_uniqueid(ep);
7783 so->e_pid = proc_pid(ep);
7784 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7785
7786 #if defined(XNU_TARGET_OS_OSX)
7787 if (ep->p_responsible_pid != so->e_pid) {
7788 proc_t rp = proc_find(ep->p_responsible_pid);
7789 if (rp != PROC_NULL) {
7790 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
7791 so->so_rpid = ep->p_responsible_pid;
7792 proc_rele(rp);
7793 } else {
7794 uuid_clear(so->so_ruuid);
7795 so->so_rpid = -1;
7796 }
7797 }
7798 #endif
7799 }
7800 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7801 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
7802 }
7803 done:
7804 if (error == 0 && net_io_policy_log) {
7805 uuid_string_t buf;
7806
7807 uuid_unparse(so->e_uuid, buf);
7808 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7809 "euuid %s%s\n", __func__, proc_name_address(p),
7810 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7811 SOCK_DOM(so), SOCK_TYPE(so),
7812 so->e_pid, proc_name_address(ep), buf,
7813 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7814 } else if (error != 0 && net_io_policy_log) {
7815 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7816 "ERROR (%d)\n", __func__, proc_name_address(p),
7817 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7818 SOCK_DOM(so), SOCK_TYPE(so),
7819 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7820 proc_name_address(ep), error);
7821 }
7822
7823 /* Update this socket's policy upon success */
7824 if (error == 0) {
7825 so->so_policy_gencnt *= -1;
7826 so_update_policy(so);
7827 #if NECP
7828 so_update_necp_policy(so, NULL, NULL);
7829 #endif /* NECP */
7830 }
7831
7832 if (ep != PROC_NULL) {
7833 proc_rele(ep);
7834 }
7835
7836 return error;
7837 }
7838
7839 int
7840 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
7841 {
7842 uuid_string_t buf;
7843 uuid_t uuid;
7844 int error = 0;
7845
7846 /* UUID must not be all-zeroes (reserved for kernel) */
7847 if (uuid_is_null(euuid)) {
7848 error = EINVAL;
7849 goto done;
7850 }
7851
7852 /*
7853 * If this is an in-kernel socket, prevent its delegate
7854 * association from changing unless the socket option is
7855 * coming from within the kernel itself.
7856 */
7857 if (so->last_pid == 0 && p != kernproc) {
7858 error = EACCES;
7859 goto done;
7860 }
7861
7862 /* Get the UUID of the issuing process */
7863 proc_getexecutableuuid(p, uuid, sizeof(uuid));
7864
7865 /*
7866 * If this is issued by a process that's recorded as the
7867 * real owner of the socket, or if the uuid is the same as
7868 * the process's own uuid, then proceed. Otherwise ensure
7869 * that the issuing process has the necessary privileges.
7870 */
7871 if (check_cred &&
7872 (uuid_compare(euuid, so->last_uuid) != 0 ||
7873 uuid_compare(euuid, uuid) != 0)) {
7874 if ((error = priv_check_cred(kauth_cred_get(),
7875 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7876 error = EACCES;
7877 goto done;
7878 }
7879 }
7880
7881 /*
7882 * If a process tries to delegate the socket to itself, then
7883 * there's really nothing to do; treat it as a way for the
7884 * delegate association to be cleared. Note that we check
7885 * the uuid of the passed-in proc rather than that of the
7886 * current process, as we need to check the process issuing
7887 * the socket option which could be kernproc itself. Given
7888 * that we don't allow 0 for effective uuid, it means that
7889 * a delegated in-kernel socket stays delegated during its
7890 * lifetime (which is okay.)
7891 */
7892 if (uuid_compare(euuid, uuid) == 0) {
7893 so->so_flags &= ~SOF_DELEGATED;
7894 so->e_upid = 0;
7895 so->e_pid = 0;
7896 uuid_clear(so->e_uuid);
7897 } else {
7898 so->so_flags |= SOF_DELEGATED;
7899 /*
7900 * Unlike so_set_effective_pid(), we only have the UUID
7901 * here and the process ID is not known. Inherit the
7902 * real {pid,upid} of the socket.
7903 */
7904 so->e_upid = so->last_upid;
7905 so->e_pid = so->last_pid;
7906 uuid_copy(so->e_uuid, euuid);
7907 }
7908 /*
7909 * The following will clear the effective process name as it's the same
7910 * as the real process
7911 */
7912 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7913 (*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
7914 }
7915 done:
7916 if (error == 0 && net_io_policy_log) {
7917 uuid_unparse(so->e_uuid, buf);
7918 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7919 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7920 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7921 SOCK_TYPE(so), so->e_pid, buf,
7922 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7923 } else if (error != 0 && net_io_policy_log) {
7924 uuid_unparse(euuid, buf);
7925 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7926 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7927 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7928 SOCK_TYPE(so), buf, error);
7929 }
7930
7931 /* Update this socket's policy upon success */
7932 if (error == 0) {
7933 so->so_policy_gencnt *= -1;
7934 so_update_policy(so);
7935 #if NECP
7936 so_update_necp_policy(so, NULL, NULL);
7937 #endif /* NECP */
7938 }
7939
7940 return error;
7941 }
7942
7943 void
7944 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7945 uint32_t ev_datalen)
7946 {
7947 struct kev_msg ev_msg;
7948
7949 /*
7950 * A netpolicy event always starts with a netpolicy_event_data
7951 * structure, but the caller can provide a longer event
7952 * structure to post, depending on the event code.
7953 */
7954 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
7955
7956 bzero(&ev_msg, sizeof(ev_msg));
7957 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7958 ev_msg.kev_class = KEV_NETWORK_CLASS;
7959 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7960 ev_msg.event_code = ev_code;
7961
7962 ev_msg.dv[0].data_ptr = ev_data;
7963 ev_msg.dv[0].data_length = ev_datalen;
7964
7965 kev_post_msg(&ev_msg);
7966 }
7967
7968 void
7969 socket_post_kev_msg(uint32_t ev_code,
7970 struct kev_socket_event_data *ev_data,
7971 uint32_t ev_datalen)
7972 {
7973 struct kev_msg ev_msg;
7974
7975 bzero(&ev_msg, sizeof(ev_msg));
7976 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7977 ev_msg.kev_class = KEV_NETWORK_CLASS;
7978 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7979 ev_msg.event_code = ev_code;
7980
7981 ev_msg.dv[0].data_ptr = ev_data;
7982 ev_msg.dv[0].data_length = ev_datalen;
7983
7984 kev_post_msg(&ev_msg);
7985 }
7986
7987 void
7988 socket_post_kev_msg_closed(struct socket *so)
7989 {
7990 struct kev_socket_closed ev = {};
7991 struct sockaddr *socksa = NULL, *peersa = NULL;
7992 int err;
7993
7994 if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
7995 return;
7996 }
7997 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7998 if (err == 0) {
7999 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8000 &peersa);
8001 if (err == 0) {
8002 memcpy(&ev.ev_data.kev_sockname, socksa,
8003 min(socksa->sa_len,
8004 sizeof(ev.ev_data.kev_sockname)));
8005 memcpy(&ev.ev_data.kev_peername, peersa,
8006 min(peersa->sa_len,
8007 sizeof(ev.ev_data.kev_peername)));
8008 socket_post_kev_msg(KEV_SOCKET_CLOSED,
8009 &ev.ev_data, sizeof(ev));
8010 }
8011 }
8012 if (socksa != NULL) {
8013 FREE(socksa, M_SONAME);
8014 }
8015 if (peersa != NULL) {
8016 FREE(peersa, M_SONAME);
8017 }
8018 }