2 * Copyright (c) 1998-2021 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
92 #include <sys/uio_internal.h>
94 #include <sys/kdebug.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/net_api_stats.h>
102 #include <net/ntstat.h>
103 #include <net/content_filter.h>
104 #include <netinet/in.h>
105 #include <netinet/in_pcb.h>
106 #include <netinet/in_tclass.h>
107 #include <netinet/in_var.h>
108 #include <netinet/tcp_var.h>
109 #include <netinet/ip6.h>
110 #include <netinet6/ip6_var.h>
111 #include <netinet/flow_divert.h>
112 #include <kern/zalloc.h>
113 #include <kern/locks.h>
114 #include <machine/limits.h>
115 #include <libkern/OSAtomic.h>
116 #include <pexpert/pexpert.h>
117 #include <kern/assert.h>
118 #include <kern/task.h>
119 #include <kern/policy_internal.h>
121 #include <sys/kpi_mbuf.h>
122 #include <sys/mcache.h>
123 #include <sys/unpcb.h>
124 #include <libkern/section_keywords.h>
127 #include <security/mac_framework.h>
131 #include <netinet/mp_pcb.h>
132 #include <netinet/mptcp_var.h>
133 #endif /* MULTIPATH */
135 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
137 #if DEBUG || DEVELOPMENT
138 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
140 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
143 /* TODO: this should be in a header file somewhere */
144 extern char *proc_name_address(void *p
);
146 static u_int32_t so_cache_hw
; /* High water mark for socache */
147 static u_int32_t so_cache_timeouts
; /* number of timeouts */
148 static u_int32_t so_cache_max_freed
; /* max freed per timeout */
149 static u_int32_t cached_sock_count
= 0;
150 STAILQ_HEAD(, socket
) so_cache_head
;
151 int max_cached_sock_count
= MAX_CACHED_SOCKETS
;
152 static u_int32_t so_cache_time
;
153 static int socketinit_done
;
154 static struct zone
*so_cache_zone
;
156 static LCK_GRP_DECLARE(so_cache_mtx_grp
, "so_cache");
157 static LCK_MTX_DECLARE(so_cache_mtx
, &so_cache_mtx_grp
);
159 #include <machine/limits.h>
161 static int filt_sorattach(struct knote
*kn
, struct kevent_qos_s
*kev
);
162 static void filt_sordetach(struct knote
*kn
);
163 static int filt_soread(struct knote
*kn
, long hint
);
164 static int filt_sortouch(struct knote
*kn
, struct kevent_qos_s
*kev
);
165 static int filt_sorprocess(struct knote
*kn
, struct kevent_qos_s
*kev
);
167 static int filt_sowattach(struct knote
*kn
, struct kevent_qos_s
*kev
);
168 static void filt_sowdetach(struct knote
*kn
);
169 static int filt_sowrite(struct knote
*kn
, long hint
);
170 static int filt_sowtouch(struct knote
*kn
, struct kevent_qos_s
*kev
);
171 static int filt_sowprocess(struct knote
*kn
, struct kevent_qos_s
*kev
);
173 static int filt_sockattach(struct knote
*kn
, struct kevent_qos_s
*kev
);
174 static void filt_sockdetach(struct knote
*kn
);
175 static int filt_sockev(struct knote
*kn
, long hint
);
176 static int filt_socktouch(struct knote
*kn
, struct kevent_qos_s
*kev
);
177 static int filt_sockprocess(struct knote
*kn
, struct kevent_qos_s
*kev
);
179 static int sooptcopyin_timeval(struct sockopt
*, struct timeval
*);
180 static int sooptcopyout_timeval(struct sockopt
*, const struct timeval
*);
182 SECURITY_READ_ONLY_EARLY(struct filterops
) soread_filtops
= {
184 .f_attach
= filt_sorattach
,
185 .f_detach
= filt_sordetach
,
186 .f_event
= filt_soread
,
187 .f_touch
= filt_sortouch
,
188 .f_process
= filt_sorprocess
,
191 SECURITY_READ_ONLY_EARLY(struct filterops
) sowrite_filtops
= {
193 .f_attach
= filt_sowattach
,
194 .f_detach
= filt_sowdetach
,
195 .f_event
= filt_sowrite
,
196 .f_touch
= filt_sowtouch
,
197 .f_process
= filt_sowprocess
,
200 SECURITY_READ_ONLY_EARLY(struct filterops
) sock_filtops
= {
202 .f_attach
= filt_sockattach
,
203 .f_detach
= filt_sockdetach
,
204 .f_event
= filt_sockev
,
205 .f_touch
= filt_socktouch
,
206 .f_process
= filt_sockprocess
,
209 SECURITY_READ_ONLY_EARLY(struct filterops
) soexcept_filtops
= {
211 .f_attach
= filt_sorattach
,
212 .f_detach
= filt_sordetach
,
213 .f_event
= filt_soread
,
214 .f_touch
= filt_sortouch
,
215 .f_process
= filt_sorprocess
,
218 SYSCTL_DECL(_kern_ipc
);
220 #define EVEN_MORE_LOCKING_DEBUG 0
222 int socket_debug
= 0;
223 SYSCTL_INT(_kern_ipc
, OID_AUTO
, socket_debug
,
224 CTLFLAG_RW
| CTLFLAG_LOCKED
, &socket_debug
, 0, "");
226 static unsigned long sodefunct_calls
= 0;
227 SYSCTL_LONG(_kern_ipc
, OID_AUTO
, sodefunct_calls
, CTLFLAG_LOCKED
,
228 &sodefunct_calls
, "");
230 ZONE_DECLARE(socket_zone
, "socket", sizeof(struct socket
), ZC_ZFREE_CLEARMEM
);
231 so_gen_t so_gencnt
; /* generation count for sockets */
233 MALLOC_DEFINE(M_SONAME
, "soname", "socket name");
234 MALLOC_DEFINE(M_PCB
, "pcb", "protocol control block");
236 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
237 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
238 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
239 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
240 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
241 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
242 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
243 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
244 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
248 int somaxconn
= SOMAXCONN
;
249 SYSCTL_INT(_kern_ipc
, KIPC_SOMAXCONN
, somaxconn
,
250 CTLFLAG_RW
| CTLFLAG_LOCKED
, &somaxconn
, 0, "");
252 /* Should we get a maximum also ??? */
253 static int sosendmaxchain
= 65536;
254 static int sosendminchain
= 16384;
255 static int sorecvmincopy
= 16384;
256 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendminchain
,
257 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sosendminchain
, 0, "");
258 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sorecvmincopy
,
259 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sorecvmincopy
, 0, "");
262 * Set to enable jumbo clusters (if available) for large writes when
263 * the socket is marked with SOF_MULTIPAGES; see below.
266 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendjcl
,
267 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sosendjcl
, 0, "");
270 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
271 * writes on the socket for all protocols on any network interfaces,
272 * depending upon sosendjcl above. Be extra careful when setting this
273 * to 1, because sending down packets that cross physical pages down to
274 * broken drivers (those that falsely assume that the physical pages
275 * are contiguous) might lead to system panics or silent data corruption.
276 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
277 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
278 * capable. Set this to 1 only for testing/debugging purposes.
280 int sosendjcl_ignore_capab
= 0;
281 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendjcl_ignore_capab
,
282 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sosendjcl_ignore_capab
, 0, "");
285 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
286 * writes on the socket for all protocols on any network interfaces.
287 * Be extra careful when setting this to 1, because sending down packets with
288 * clusters larger that 2 KB might lead to system panics or data corruption.
289 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
290 * on the outgoing interface
291 * Set this to 1 for testing/debugging purposes only.
293 int sosendbigcl_ignore_capab
= 0;
294 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendbigcl_ignore_capab
,
295 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sosendbigcl_ignore_capab
, 0, "");
297 int sodefunctlog
= 0;
298 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sodefunctlog
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
299 &sodefunctlog
, 0, "");
301 int sothrottlelog
= 0;
302 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sothrottlelog
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
303 &sothrottlelog
, 0, "");
305 int sorestrictrecv
= 1;
306 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sorestrictrecv
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
307 &sorestrictrecv
, 0, "Enable inbound interface restrictions");
309 int sorestrictsend
= 1;
310 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sorestrictsend
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
311 &sorestrictsend
, 0, "Enable outbound interface restrictions");
313 int soreserveheadroom
= 1;
314 SYSCTL_INT(_kern_ipc
, OID_AUTO
, soreserveheadroom
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
315 &soreserveheadroom
, 0, "To allocate contiguous datagram buffers");
317 #if (DEBUG || DEVELOPMENT)
318 int so_notsent_lowat_check
= 1;
319 SYSCTL_INT(_kern_ipc
, OID_AUTO
, notsent_lowat
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
320 &so_notsent_lowat_check
, 0, "enable/disable notsnet lowat check");
321 #endif /* DEBUG || DEVELOPMENT */
323 int so_accept_list_waits
= 0;
324 #if (DEBUG || DEVELOPMENT)
325 SYSCTL_INT(_kern_ipc
, OID_AUTO
, accept_list_waits
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
326 &so_accept_list_waits
, 0, "number of waits for listener incomp list");
327 #endif /* DEBUG || DEVELOPMENT */
329 extern struct inpcbinfo tcbinfo
;
331 /* TODO: these should be in header file */
332 extern int get_inpcb_str_size(void);
333 extern int get_tcp_str_size(void);
335 vm_size_t so_cache_zone_element_size
;
337 static int sodelayed_copy(struct socket
*, struct uio
*, struct mbuf
**,
339 static void cached_sock_alloc(struct socket
**, zalloc_flags_t
);
340 static void cached_sock_free(struct socket
*);
343 * Maximum of extended background idle sockets per process
344 * Set to zero to disable further setting of the option
347 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
348 #define SO_IDLE_BK_IDLE_TIME 600
349 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
351 struct soextbkidlestat soextbkidlestat
;
353 SYSCTL_UINT(_kern_ipc
, OID_AUTO
, maxextbkidleperproc
,
354 CTLFLAG_RW
| CTLFLAG_LOCKED
, &soextbkidlestat
.so_xbkidle_maxperproc
, 0,
355 "Maximum of extended background idle sockets per process");
357 SYSCTL_UINT(_kern_ipc
, OID_AUTO
, extbkidletime
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
358 &soextbkidlestat
.so_xbkidle_time
, 0,
359 "Time in seconds to keep extended background idle sockets");
361 SYSCTL_UINT(_kern_ipc
, OID_AUTO
, extbkidlercvhiwat
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
362 &soextbkidlestat
.so_xbkidle_rcvhiwat
, 0,
363 "High water mark for extended background idle sockets");
365 SYSCTL_STRUCT(_kern_ipc
, OID_AUTO
, extbkidlestat
, CTLFLAG_RD
| CTLFLAG_LOCKED
,
366 &soextbkidlestat
, soextbkidlestat
, "");
368 int so_set_extended_bk_idle(struct socket
*, int);
372 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
373 * setting the DSCP code on the packet based on the service class; see
374 * <rdar://problem/11277343> for details.
376 __private_extern__ u_int32_t sotcdb
= 0;
377 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sotcdb
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
383 _CASSERT(sizeof(so_gencnt
) == sizeof(uint64_t));
384 VERIFY(IS_P2ALIGNED(&so_gencnt
, sizeof(uint32_t)));
387 _CASSERT(sizeof(struct sa_endpoints
) == sizeof(struct user64_sa_endpoints
));
388 _CASSERT(offsetof(struct sa_endpoints
, sae_srcif
) == offsetof(struct user64_sa_endpoints
, sae_srcif
));
389 _CASSERT(offsetof(struct sa_endpoints
, sae_srcaddr
) == offsetof(struct user64_sa_endpoints
, sae_srcaddr
));
390 _CASSERT(offsetof(struct sa_endpoints
, sae_srcaddrlen
) == offsetof(struct user64_sa_endpoints
, sae_srcaddrlen
));
391 _CASSERT(offsetof(struct sa_endpoints
, sae_dstaddr
) == offsetof(struct user64_sa_endpoints
, sae_dstaddr
));
392 _CASSERT(offsetof(struct sa_endpoints
, sae_dstaddrlen
) == offsetof(struct user64_sa_endpoints
, sae_dstaddrlen
));
394 _CASSERT(sizeof(struct sa_endpoints
) == sizeof(struct user32_sa_endpoints
));
395 _CASSERT(offsetof(struct sa_endpoints
, sae_srcif
) == offsetof(struct user32_sa_endpoints
, sae_srcif
));
396 _CASSERT(offsetof(struct sa_endpoints
, sae_srcaddr
) == offsetof(struct user32_sa_endpoints
, sae_srcaddr
));
397 _CASSERT(offsetof(struct sa_endpoints
, sae_srcaddrlen
) == offsetof(struct user32_sa_endpoints
, sae_srcaddrlen
));
398 _CASSERT(offsetof(struct sa_endpoints
, sae_dstaddr
) == offsetof(struct user32_sa_endpoints
, sae_dstaddr
));
399 _CASSERT(offsetof(struct sa_endpoints
, sae_dstaddrlen
) == offsetof(struct user32_sa_endpoints
, sae_dstaddrlen
));
402 if (socketinit_done
) {
403 printf("socketinit: already called...\n");
408 PE_parse_boot_argn("socket_debug", &socket_debug
,
409 sizeof(socket_debug
));
411 STAILQ_INIT(&so_cache_head
);
413 so_cache_zone_element_size
= (vm_size_t
)(sizeof(struct socket
) + 4
414 + get_inpcb_str_size() + 4 + get_tcp_str_size());
416 so_cache_zone
= zone_create("socache zone", so_cache_zone_element_size
,
417 ZC_ZFREE_CLEARMEM
| ZC_NOENCRYPT
);
419 bzero(&soextbkidlestat
, sizeof(struct soextbkidlestat
));
420 soextbkidlestat
.so_xbkidle_maxperproc
= SO_IDLE_BK_IDLE_MAX_PER_PROC
;
421 soextbkidlestat
.so_xbkidle_time
= SO_IDLE_BK_IDLE_TIME
;
422 soextbkidlestat
.so_xbkidle_rcvhiwat
= SO_IDLE_BK_IDLE_RCV_HIWAT
;
425 socket_tclass_init();
428 #endif /* MULTIPATH */
432 cached_sock_alloc(struct socket
**so
, zalloc_flags_t how
)
437 lck_mtx_lock(&so_cache_mtx
);
439 if (!STAILQ_EMPTY(&so_cache_head
)) {
440 VERIFY(cached_sock_count
> 0);
442 *so
= STAILQ_FIRST(&so_cache_head
);
443 STAILQ_REMOVE_HEAD(&so_cache_head
, so_cache_ent
);
444 STAILQ_NEXT((*so
), so_cache_ent
) = NULL
;
447 lck_mtx_unlock(&so_cache_mtx
);
449 temp
= (*so
)->so_saved_pcb
;
450 bzero((caddr_t
)*so
, sizeof(struct socket
));
452 (*so
)->so_saved_pcb
= temp
;
454 lck_mtx_unlock(&so_cache_mtx
);
456 *so
= zalloc_flags(so_cache_zone
, how
| Z_ZERO
);
459 * Define offsets for extra structures into our
460 * single block of memory. Align extra structures
461 * on longword boundaries.
464 offset
= (uintptr_t)*so
;
465 offset
+= sizeof(struct socket
);
467 offset
= ALIGN(offset
);
469 (*so
)->so_saved_pcb
= (caddr_t
)offset
;
470 offset
+= get_inpcb_str_size();
472 offset
= ALIGN(offset
);
474 ((struct inpcb
*)(void *)(*so
)->so_saved_pcb
)->inp_saved_ppcb
=
478 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER
, &(*so
)->so_flags1
);
482 cached_sock_free(struct socket
*so
)
484 lck_mtx_lock(&so_cache_mtx
);
486 so_cache_time
= net_uptime();
487 if (++cached_sock_count
> max_cached_sock_count
) {
489 lck_mtx_unlock(&so_cache_mtx
);
490 zfree(so_cache_zone
, so
);
492 if (so_cache_hw
< cached_sock_count
) {
493 so_cache_hw
= cached_sock_count
;
496 STAILQ_INSERT_TAIL(&so_cache_head
, so
, so_cache_ent
);
498 so
->cache_timestamp
= so_cache_time
;
499 lck_mtx_unlock(&so_cache_mtx
);
504 so_update_last_owner_locked(struct socket
*so
, proc_t self
)
506 if (so
->last_pid
!= 0) {
508 * last_pid and last_upid should remain zero for sockets
509 * created using sock_socket. The check above achieves that
511 if (self
== PROC_NULL
) {
512 self
= current_proc();
515 if (so
->last_upid
!= proc_uniqueid(self
) ||
516 so
->last_pid
!= proc_pid(self
)) {
517 so
->last_upid
= proc_uniqueid(self
);
518 so
->last_pid
= proc_pid(self
);
519 proc_getexecutableuuid(self
, so
->last_uuid
,
520 sizeof(so
->last_uuid
));
521 if (so
->so_proto
!= NULL
&& so
->so_proto
->pr_update_last_owner
!= NULL
) {
522 (*so
->so_proto
->pr_update_last_owner
)(so
, self
, NULL
);
525 proc_pidoriginatoruuid(so
->so_vuuid
, sizeof(so
->so_vuuid
));
530 so_update_policy(struct socket
*so
)
532 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
) {
533 (void) inp_update_policy(sotoinpcb(so
));
539 so_update_necp_policy(struct socket
*so
, struct sockaddr
*override_local_addr
,
540 struct sockaddr
*override_remote_addr
)
542 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
) {
543 inp_update_necp_policy(sotoinpcb(so
), override_local_addr
,
544 override_remote_addr
, 0);
554 boolean_t rc
= FALSE
;
556 lck_mtx_lock(&so_cache_mtx
);
558 so_cache_time
= net_uptime();
560 while (!STAILQ_EMPTY(&so_cache_head
)) {
561 VERIFY(cached_sock_count
> 0);
562 p
= STAILQ_FIRST(&so_cache_head
);
563 if ((so_cache_time
- p
->cache_timestamp
) <
564 SO_CACHE_TIME_LIMIT
) {
568 STAILQ_REMOVE_HEAD(&so_cache_head
, so_cache_ent
);
571 zfree(so_cache_zone
, p
);
573 if (++n_freed
>= SO_CACHE_MAX_FREE_BATCH
) {
574 so_cache_max_freed
++;
579 /* Schedule again if there is more to cleanup */
580 if (!STAILQ_EMPTY(&so_cache_head
)) {
584 lck_mtx_unlock(&so_cache_mtx
);
589 * Get a socket structure from our zone, and initialize it.
590 * We don't implement `waitok' yet (see comments in uipc_domain.c).
591 * Note that it would probably be better to allocate socket
592 * and PCB at the same time, but I'm not convinced that all
593 * the protocols can be easily modified to do this.
596 soalloc(int waitok
, int dom
, int type
)
598 zalloc_flags_t how
= waitok
? Z_WAITOK
: Z_NOWAIT
;
601 if ((dom
== PF_INET
) && (type
== SOCK_STREAM
)) {
602 cached_sock_alloc(&so
, how
);
604 so
= zalloc_flags(socket_zone
, how
| Z_ZERO
);
607 so
->so_gencnt
= OSIncrementAtomic64((SInt64
*)&so_gencnt
);
610 * Increment the socket allocation statistics
612 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_alloc_total
);
619 socreate_internal(int dom
, struct socket
**aso
, int type
, int proto
,
620 struct proc
*p
, uint32_t flags
, struct proc
*ep
)
625 #if defined(XNU_TARGET_OS_OSX)
630 extern int tcpconsdebug
;
637 prp
= pffindproto(dom
, proto
, type
);
639 prp
= pffindtype(dom
, type
);
642 if (prp
== NULL
|| prp
->pr_usrreqs
->pru_attach
== NULL
) {
643 if (pffinddomain(dom
) == NULL
) {
647 if (pffindprotonotype(dom
, proto
) != NULL
) {
651 return EPROTONOSUPPORT
;
653 if (prp
->pr_type
!= type
) {
656 so
= soalloc(1, dom
, type
);
663 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_local_total
);
666 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_inet_total
);
667 if (type
== SOCK_STREAM
) {
668 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_inet_stream_total
);
670 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_inet_dgram_total
);
674 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_route_total
);
677 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_ndrv_total
);
680 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_key_total
);
683 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_inet6_total
);
684 if (type
== SOCK_STREAM
) {
685 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_inet6_stream_total
);
687 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_inet6_dgram_total
);
691 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_system_total
);
694 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_multipath_total
);
697 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_other_total
);
701 if (flags
& SOCF_MPTCP
) {
702 so
->so_state
|= SS_NBIO
;
705 TAILQ_INIT(&so
->so_incomp
);
706 TAILQ_INIT(&so
->so_comp
);
708 so
->last_upid
= proc_uniqueid(p
);
709 so
->last_pid
= proc_pid(p
);
710 proc_getexecutableuuid(p
, so
->last_uuid
, sizeof(so
->last_uuid
));
711 proc_pidoriginatoruuid(so
->so_vuuid
, sizeof(so
->so_vuuid
));
713 if (ep
!= PROC_NULL
&& ep
!= p
) {
714 so
->e_upid
= proc_uniqueid(ep
);
715 so
->e_pid
= proc_pid(ep
);
716 proc_getexecutableuuid(ep
, so
->e_uuid
, sizeof(so
->e_uuid
));
717 so
->so_flags
|= SOF_DELEGATED
;
718 #if defined(XNU_TARGET_OS_OSX)
719 if (ep
->p_responsible_pid
!= so
->e_pid
) {
720 rpid
= ep
->p_responsible_pid
;
725 #if defined(XNU_TARGET_OS_OSX)
726 if (rpid
< 0 && p
->p_responsible_pid
!= so
->last_pid
) {
727 rpid
= p
->p_responsible_pid
;
731 uuid_clear(so
->so_ruuid
);
733 proc_t rp
= proc_find(rpid
);
734 if (rp
!= PROC_NULL
) {
735 proc_getexecutableuuid(rp
, so
->so_ruuid
, sizeof(so
->so_ruuid
));
742 so
->so_cred
= kauth_cred_proc_ref(p
);
743 if (!suser(kauth_cred_get(), NULL
)) {
744 so
->so_state
|= SS_PRIV
;
748 so
->so_rcv
.sb_flags
|= SB_RECV
;
749 so
->so_rcv
.sb_so
= so
->so_snd
.sb_so
= so
;
750 so
->next_lock_lr
= 0;
751 so
->next_unlock_lr
= 0;
754 * Attachment will create the per pcb lock if necessary and
755 * increase refcount for creation, make sure it's done before
756 * socket is inserted in lists.
760 error
= (*prp
->pr_usrreqs
->pru_attach
)(so
, proto
, p
);
764 * If so_pcb is not zero, the socket will be leaked,
765 * so protocol attachment handler must be coded carefuly
767 so
->so_state
|= SS_NOFDREF
;
768 VERIFY(so
->so_usecount
> 0);
770 sofreelastref(so
, 1); /* will deallocate the socket */
775 * Note: needs so_pcb to be set after pru_attach
777 if (prp
->pr_update_last_owner
!= NULL
) {
778 (*prp
->pr_update_last_owner
)(so
, p
, ep
);
781 atomic_add_32(&prp
->pr_domain
->dom_refs
, 1);
783 /* Attach socket filters for this protocol */
786 if (tcpconsdebug
== 2) {
787 so
->so_options
|= SO_DEBUG
;
790 so_set_default_traffic_class(so
);
793 * If this thread or task is marked to create backgrounded sockets,
794 * mark the socket as background.
796 if (!(flags
& SOCF_MPTCP
) &&
797 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG
)) {
798 socket_set_traffic_mgt_flags(so
, TRAFFIC_MGT_SO_BACKGROUND
);
799 so
->so_background_thread
= current_thread();
804 * Don't mark Unix domain or system
805 * eligible for defunct by default.
809 so
->so_flags
|= SOF_NODEFUNCT
;
816 * Entitlements can't be checked at socket creation time except if the
817 * application requested a feature guarded by a privilege (c.f., socket
819 * The priv(9) and the Sandboxing APIs are designed with the idea that
820 * a privilege check should only be triggered by a userland request.
821 * A privilege check at socket creation time is time consuming and
822 * could trigger many authorisation error messages from the security
837 * <pru_attach>:ENOBUFS[AF_UNIX]
838 * <pru_attach>:ENOBUFS[TCP]
839 * <pru_attach>:ENOMEM[TCP]
840 * <pru_attach>:??? [other protocol families, IPSEC]
843 socreate(int dom
, struct socket
**aso
, int type
, int proto
)
845 return socreate_internal(dom
, aso
, type
, proto
, current_proc(), 0,
850 socreate_delegate(int dom
, struct socket
**aso
, int type
, int proto
, pid_t epid
)
853 struct proc
*ep
= PROC_NULL
;
855 if ((proc_selfpid() != epid
) && ((ep
= proc_find(epid
)) == PROC_NULL
)) {
860 error
= socreate_internal(dom
, aso
, type
, proto
, current_proc(), 0, ep
);
863 * It might not be wise to hold the proc reference when calling
864 * socreate_internal since it calls soalloc with M_WAITOK
867 if (ep
!= PROC_NULL
) {
876 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
877 * <pru_bind>:EAFNOSUPPORT Address family not supported
878 * <pru_bind>:EADDRNOTAVAIL Address not available.
879 * <pru_bind>:EINVAL Invalid argument
880 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
881 * <pru_bind>:EACCES Permission denied
882 * <pru_bind>:EADDRINUSE Address in use
883 * <pru_bind>:EAGAIN Resource unavailable, try again
884 * <pru_bind>:EPERM Operation not permitted
888 * Notes: It's not possible to fully enumerate the return codes above,
889 * since socket filter authors and protocol family authors may
890 * not choose to limit their error returns to those listed, even
891 * though this may result in some software operating incorrectly.
893 * The error codes which are enumerated above are those known to
894 * be returned by the tcp_usr_bind function supplied.
897 sobindlock(struct socket
*so
, struct sockaddr
*nam
, int dolock
)
899 struct proc
*p
= current_proc();
906 so_update_last_owner_locked(so
, p
);
907 so_update_policy(so
);
910 so_update_necp_policy(so
, nam
, NULL
);
914 * If this is a bind request on a socket that has been marked
915 * as inactive, reject it now before we go any further.
917 if (so
->so_flags
& SOF_DEFUNCT
) {
919 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
920 __func__
, proc_pid(p
), proc_best_name(p
),
921 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
922 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
927 error
= sflt_bind(so
, nam
);
930 error
= (*so
->so_proto
->pr_usrreqs
->pru_bind
)(so
, nam
, p
);
934 socket_unlock(so
, 1);
937 if (error
== EJUSTRETURN
) {
945 sodealloc(struct socket
*so
)
947 kauth_cred_unref(&so
->so_cred
);
949 /* Remove any filters */
953 cfil_sock_detach(so
);
954 #endif /* CONTENT_FILTER */
956 so
->so_gencnt
= OSIncrementAtomic64((SInt64
*)&so_gencnt
);
958 if (so
->so_flags1
& SOF1_CACHED_IN_SOCK_LAYER
) {
959 cached_sock_free(so
);
961 zfree(socket_zone
, so
);
969 * <pru_listen>:EINVAL[AF_UNIX]
970 * <pru_listen>:EINVAL[TCP]
971 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
972 * <pru_listen>:EINVAL[TCP] Invalid argument
973 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
974 * <pru_listen>:EACCES[TCP] Permission denied
975 * <pru_listen>:EADDRINUSE[TCP] Address in use
976 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
977 * <pru_listen>:EPERM[TCP] Operation not permitted
980 * Notes: Other <pru_listen> returns depend on the protocol family; all
981 * <sf_listen> returns depend on what the filter author causes
982 * their filter to return.
985 solisten(struct socket
*so
, int backlog
)
987 struct proc
*p
= current_proc();
992 so_update_last_owner_locked(so
, p
);
993 so_update_policy(so
);
996 so_update_necp_policy(so
, NULL
, NULL
);
999 if (so
->so_proto
== NULL
) {
1003 if ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) == 0) {
1009 * If the listen request is made on a socket that is not fully
1010 * disconnected, or on a socket that has been marked as inactive,
1011 * reject the request now.
1014 (SS_ISCONNECTED
| SS_ISCONNECTING
| SS_ISDISCONNECTING
)) ||
1015 (so
->so_flags
& SOF_DEFUNCT
)) {
1017 if (so
->so_flags
& SOF_DEFUNCT
) {
1018 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1019 "(%d)\n", __func__
, proc_pid(p
),
1021 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
1022 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
1027 if ((so
->so_restrictions
& SO_RESTRICT_DENY_IN
) != 0) {
1032 error
= sflt_listen(so
);
1034 error
= (*so
->so_proto
->pr_usrreqs
->pru_listen
)(so
, p
);
1038 if (error
== EJUSTRETURN
) {
1044 if (TAILQ_EMPTY(&so
->so_comp
)) {
1045 so
->so_options
|= SO_ACCEPTCONN
;
1048 * POSIX: The implementation may have an upper limit on the length of
1049 * the listen queue-either global or per accepting socket. If backlog
1050 * exceeds this limit, the length of the listen queue is set to the
1053 * If listen() is called with a backlog argument value that is less
1054 * than 0, the function behaves as if it had been called with a backlog
1055 * argument value of 0.
1057 * A backlog argument of 0 may allow the socket to accept connections,
1058 * in which case the length of the listen queue may be set to an
1059 * implementation-defined minimum value.
1061 if (backlog
<= 0 || backlog
> somaxconn
) {
1062 backlog
= somaxconn
;
1065 so
->so_qlimit
= backlog
;
1067 socket_unlock(so
, 1);
1072 * The "accept list lock" protects the fields related to the listener queues
1073 * because we can unlock a socket to respect the lock ordering between
1074 * the listener socket and its clients sockets. The lock ordering is first to
1075 * acquire the client socket before the listener socket.
1077 * The accept list lock serializes access to the following fields:
1078 * - of the listener socket:
1083 * - of client sockets that are in so_comp or so_incomp:
1087 * As one can see the accept list lock protects the consistent of the
1088 * linkage of the client sockets.
1090 * Note that those fields may be read without holding the accept list lock
1091 * for a preflight provided the accept list lock is taken when committing
1092 * to take an action based on the result of the preflight. The preflight
1093 * saves the cost of doing the unlock/lock dance.
1096 so_acquire_accept_list(struct socket
*head
, struct socket
*so
)
1098 lck_mtx_t
*mutex_held
;
1100 if (head
->so_proto
->pr_getlock
== NULL
) {
1103 mutex_held
= (*head
->so_proto
->pr_getlock
)(head
, PR_F_WILLUNLOCK
);
1104 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1106 if (!(head
->so_flags1
& SOF1_ACCEPT_LIST_HELD
)) {
1107 head
->so_flags1
|= SOF1_ACCEPT_LIST_HELD
;
1111 socket_unlock(so
, 0);
1113 while (head
->so_flags1
& SOF1_ACCEPT_LIST_HELD
) {
1114 so_accept_list_waits
+= 1;
1115 msleep((caddr_t
)&head
->so_incomp
, mutex_held
,
1116 PSOCK
| PCATCH
, __func__
, NULL
);
1118 head
->so_flags1
|= SOF1_ACCEPT_LIST_HELD
;
1120 socket_unlock(head
, 0);
1122 socket_lock(head
, 0);
1127 so_release_accept_list(struct socket
*head
)
1129 if (head
->so_proto
->pr_getlock
!= NULL
) {
1130 lck_mtx_t
*mutex_held
;
1132 mutex_held
= (*head
->so_proto
->pr_getlock
)(head
, 0);
1133 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1135 head
->so_flags1
&= ~SOF1_ACCEPT_LIST_HELD
;
1136 wakeup((caddr_t
)&head
->so_incomp
);
1141 sofreelastref(struct socket
*so
, int dealloc
)
1143 struct socket
*head
= so
->so_head
;
1145 /* Assume socket is locked */
1147 if (!(so
->so_flags
& SOF_PCBCLEARING
) || !(so
->so_state
& SS_NOFDREF
)) {
1148 selthreadclear(&so
->so_snd
.sb_sel
);
1149 selthreadclear(&so
->so_rcv
.sb_sel
);
1150 so
->so_rcv
.sb_flags
&= ~(SB_SEL
| SB_UPCALL
);
1151 so
->so_snd
.sb_flags
&= ~(SB_SEL
| SB_UPCALL
);
1152 so
->so_event
= sonullevent
;
1157 * Need to lock the listener when the protocol has
1160 if (head
->so_proto
->pr_getlock
!= NULL
) {
1161 socket_lock(head
, 1);
1162 so_acquire_accept_list(head
, so
);
1164 if (so
->so_state
& SS_INCOMP
) {
1165 so
->so_state
&= ~SS_INCOMP
;
1166 TAILQ_REMOVE(&head
->so_incomp
, so
, so_list
);
1171 if (head
->so_proto
->pr_getlock
!= NULL
) {
1172 so_release_accept_list(head
);
1173 socket_unlock(head
, 1);
1175 } else if (so
->so_state
& SS_COMP
) {
1176 if (head
->so_proto
->pr_getlock
!= NULL
) {
1177 so_release_accept_list(head
);
1178 socket_unlock(head
, 1);
1181 * We must not decommission a socket that's
1182 * on the accept(2) queue. If we do, then
1183 * accept(2) may hang after select(2) indicated
1184 * that the listening socket was ready.
1186 selthreadclear(&so
->so_snd
.sb_sel
);
1187 selthreadclear(&so
->so_rcv
.sb_sel
);
1188 so
->so_rcv
.sb_flags
&= ~(SB_SEL
| SB_UPCALL
);
1189 so
->so_snd
.sb_flags
&= ~(SB_SEL
| SB_UPCALL
);
1190 so
->so_event
= sonullevent
;
1193 if (head
->so_proto
->pr_getlock
!= NULL
) {
1194 so_release_accept_list(head
);
1195 socket_unlock(head
, 1);
1197 printf("sofree: not queued\n");
1204 if (so
->so_flags
& SOF_FLOW_DIVERT
) {
1205 flow_divert_detach(so
);
1207 #endif /* FLOW_DIVERT */
1209 /* 3932268: disable upcall */
1210 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
1211 so
->so_snd
.sb_flags
&= ~(SB_UPCALL
| SB_SNDBYTE_CNT
);
1212 so
->so_event
= sonullevent
;
1220 soclose_wait_locked(struct socket
*so
)
1222 lck_mtx_t
*mutex_held
;
1224 if (so
->so_proto
->pr_getlock
!= NULL
) {
1225 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, PR_F_WILLUNLOCK
);
1227 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1229 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1232 * Double check here and return if there's no outstanding upcall;
1233 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1235 if (!so
->so_upcallusecount
|| !(so
->so_flags
& SOF_UPCALLCLOSEWAIT
)) {
1238 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
1239 so
->so_snd
.sb_flags
&= ~SB_UPCALL
;
1240 so
->so_flags
|= SOF_CLOSEWAIT
;
1242 (void) msleep((caddr_t
)&so
->so_upcallusecount
, mutex_held
, (PZERO
- 1),
1243 "soclose_wait_locked", NULL
);
1244 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1245 so
->so_flags
&= ~SOF_CLOSEWAIT
;
1249 * Close a socket on last file table reference removal.
1250 * Initiate disconnect if connected.
1251 * Free socket when disconnect complete.
1254 soclose_locked(struct socket
*so
)
1259 if (so
->so_usecount
== 0) {
1260 panic("soclose: so=%p refcount=0\n", so
);
1264 sflt_notify(so
, sock_evt_closing
, NULL
);
1266 if (so
->so_upcallusecount
) {
1267 soclose_wait_locked(so
);
1272 * We have to wait until the content filters are done
1274 if ((so
->so_flags
& SOF_CONTENT_FILTER
) != 0) {
1275 cfil_sock_close_wait(so
);
1276 cfil_sock_is_closed(so
);
1277 cfil_sock_detach(so
);
1279 #endif /* CONTENT_FILTER */
1281 if (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_INPROG
) {
1282 soresume(current_proc(), so
, 1);
1283 so
->so_flags1
&= ~SOF1_EXTEND_BK_IDLE_WANTED
;
1286 if ((so
->so_options
& SO_ACCEPTCONN
)) {
1287 struct socket
*sp
, *sonext
;
1288 int persocklock
= 0;
1289 int incomp_overflow_only
;
1292 * We do not want new connection to be added
1293 * to the connection queues
1295 so
->so_options
&= ~SO_ACCEPTCONN
;
1298 * We can drop the lock on the listener once
1299 * we've acquired the incoming list
1301 if (so
->so_proto
->pr_getlock
!= NULL
) {
1303 so_acquire_accept_list(so
, NULL
);
1304 socket_unlock(so
, 0);
1307 incomp_overflow_only
= 1;
1309 TAILQ_FOREACH_SAFE(sp
, &so
->so_incomp
, so_list
, sonext
) {
1312 * skip sockets thrown away by tcpdropdropblreq
1313 * they will get cleanup by the garbage collection.
1314 * otherwise, remove the incomp socket from the queue
1315 * and let soabort trigger the appropriate cleanup.
1317 if (sp
->so_flags
& SOF_OVERFLOW
) {
1321 if (persocklock
!= 0) {
1327 * The extra reference for the list insure the
1328 * validity of the socket pointer when we perform the
1329 * unlock of the head above
1331 if (sp
->so_state
& SS_INCOMP
) {
1332 sp
->so_state
&= ~SS_INCOMP
;
1334 TAILQ_REMOVE(&so
->so_incomp
, sp
, so_list
);
1340 panic("%s sp %p in so_incomp but !SS_INCOMP",
1344 if (persocklock
!= 0) {
1345 socket_unlock(sp
, 1);
1349 TAILQ_FOREACH_SAFE(sp
, &so
->so_comp
, so_list
, sonext
) {
1350 /* Dequeue from so_comp since sofree() won't do it */
1351 if (persocklock
!= 0) {
1355 if (sp
->so_state
& SS_COMP
) {
1356 sp
->so_state
&= ~SS_COMP
;
1358 TAILQ_REMOVE(&so
->so_comp
, sp
, so_list
);
1363 panic("%s sp %p in so_comp but !SS_COMP",
1368 socket_unlock(sp
, 1);
1372 if (incomp_overflow_only
== 0 && !TAILQ_EMPTY(&so
->so_incomp
)) {
1373 #if (DEBUG | DEVELOPMENT)
1374 panic("%s head %p so_comp not empty\n", __func__
, so
);
1375 #endif /* (DEVELOPMENT || DEBUG) */
1380 if (!TAILQ_EMPTY(&so
->so_comp
)) {
1381 #if (DEBUG | DEVELOPMENT)
1382 panic("%s head %p so_comp not empty\n", __func__
, so
);
1383 #endif /* (DEVELOPMENT || DEBUG) */
1390 so_release_accept_list(so
);
1393 if (so
->so_pcb
== NULL
) {
1394 /* 3915887: mark the socket as ready for dealloc */
1395 so
->so_flags
|= SOF_PCBCLEARING
;
1398 if (so
->so_state
& SS_ISCONNECTED
) {
1399 if ((so
->so_state
& SS_ISDISCONNECTING
) == 0) {
1400 error
= sodisconnectlocked(so
);
1405 if (so
->so_options
& SO_LINGER
) {
1406 lck_mtx_t
*mutex_held
;
1408 if ((so
->so_state
& SS_ISDISCONNECTING
) &&
1409 (so
->so_state
& SS_NBIO
)) {
1412 if (so
->so_proto
->pr_getlock
!= NULL
) {
1413 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, PR_F_WILLUNLOCK
);
1415 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1417 while (so
->so_state
& SS_ISCONNECTED
) {
1418 ts
.tv_sec
= (so
->so_linger
/ 100);
1419 ts
.tv_nsec
= (so
->so_linger
% 100) *
1420 NSEC_PER_USEC
* 1000 * 10;
1421 error
= msleep((caddr_t
)&so
->so_timeo
,
1422 mutex_held
, PSOCK
| PCATCH
, "soclose", &ts
);
1425 * It's OK when the time fires,
1426 * don't report an error
1428 if (error
== EWOULDBLOCK
) {
1437 if (so
->so_usecount
== 0) {
1438 panic("soclose: usecount is zero so=%p\n", so
);
1441 if (so
->so_pcb
!= NULL
&& !(so
->so_flags
& SOF_PCBCLEARING
)) {
1442 int error2
= (*so
->so_proto
->pr_usrreqs
->pru_detach
)(so
);
1447 if (so
->so_usecount
<= 0) {
1448 panic("soclose: usecount is zero so=%p\n", so
);
1452 if (so
->so_pcb
!= NULL
&& !(so
->so_flags
& SOF_MP_SUBFLOW
) &&
1453 (so
->so_state
& SS_NOFDREF
)) {
1454 panic("soclose: NOFDREF");
1457 so
->so_state
|= SS_NOFDREF
;
1459 if ((so
->so_flags
& SOF_KNOTE
) != 0) {
1460 KNOTE(&so
->so_klist
, SO_FILT_HINT_LOCKED
);
1463 atomic_add_32(&so
->so_proto
->pr_domain
->dom_refs
, -1);
1465 VERIFY(so
->so_usecount
> 0);
1472 soclose(struct socket
*so
)
1477 if (so
->so_retaincnt
== 0) {
1478 error
= soclose_locked(so
);
1481 * if the FD is going away, but socket is
1482 * retained in kernel remove its reference
1485 if (so
->so_usecount
< 2) {
1486 panic("soclose: retaincnt non null and so=%p "
1487 "usecount=%d\n", so
, so
->so_usecount
);
1490 socket_unlock(so
, 1);
1495 * Must be called at splnet...
1497 /* Should already be locked */
1499 soabort(struct socket
*so
)
1503 #ifdef MORE_LOCKING_DEBUG
1504 lck_mtx_t
*mutex_held
;
1506 if (so
->so_proto
->pr_getlock
!= NULL
) {
1507 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
1509 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1511 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1514 if ((so
->so_flags
& SOF_ABORTED
) == 0) {
1515 so
->so_flags
|= SOF_ABORTED
;
1516 error
= (*so
->so_proto
->pr_usrreqs
->pru_abort
)(so
);
1526 soacceptlock(struct socket
*so
, struct sockaddr
**nam
, int dolock
)
1534 so_update_last_owner_locked(so
, PROC_NULL
);
1535 so_update_policy(so
);
1537 so_update_necp_policy(so
, NULL
, NULL
);
1540 if ((so
->so_state
& SS_NOFDREF
) == 0) {
1541 panic("soaccept: !NOFDREF");
1543 so
->so_state
&= ~SS_NOFDREF
;
1544 error
= (*so
->so_proto
->pr_usrreqs
->pru_accept
)(so
, nam
);
1547 socket_unlock(so
, 1);
1553 soaccept(struct socket
*so
, struct sockaddr
**nam
)
1555 return soacceptlock(so
, nam
, 1);
1559 soacceptfilter(struct socket
*so
, struct socket
*head
)
1561 struct sockaddr
*local
= NULL
, *remote
= NULL
;
1565 * Hold the lock even if this socket has not been made visible
1566 * to the filter(s). For sockets with global locks, this protects
1567 * against the head or peer going away
1570 if (sogetaddr_locked(so
, &remote
, 1) != 0 ||
1571 sogetaddr_locked(so
, &local
, 0) != 0) {
1572 so
->so_state
&= ~SS_NOFDREF
;
1573 socket_unlock(so
, 1);
1575 /* Out of resources; try it again next time */
1576 error
= ECONNABORTED
;
1580 error
= sflt_accept(head
, so
, local
, remote
);
1583 * If we get EJUSTRETURN from one of the filters, mark this socket
1584 * as inactive and return it anyway. This newly accepted socket
1585 * will be disconnected later before we hand it off to the caller.
1587 if (error
== EJUSTRETURN
) {
1589 (void) sosetdefunct(current_proc(), so
,
1590 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL
, FALSE
);
1595 * This may seem like a duplication to the above error
1596 * handling part when we return ECONNABORTED, except
1597 * the following is done while holding the lock since
1598 * the socket has been exposed to the filter(s) earlier.
1600 so
->so_state
&= ~SS_NOFDREF
;
1601 socket_unlock(so
, 1);
1603 /* Propagate socket filter's error code to the caller */
1605 socket_unlock(so
, 1);
1608 /* Callee checks for NULL pointer */
1609 sock_freeaddr(remote
);
1610 sock_freeaddr(local
);
1615 * Returns: 0 Success
1616 * EOPNOTSUPP Operation not supported on socket
1617 * EISCONN Socket is connected
1618 * <pru_connect>:EADDRNOTAVAIL Address not available.
1619 * <pru_connect>:EINVAL Invalid argument
1620 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1621 * <pru_connect>:EACCES Permission denied
1622 * <pru_connect>:EADDRINUSE Address in use
1623 * <pru_connect>:EAGAIN Resource unavailable, try again
1624 * <pru_connect>:EPERM Operation not permitted
1625 * <sf_connect_out>:??? [anything a filter writer might set]
1628 soconnectlock(struct socket
*so
, struct sockaddr
*nam
, int dolock
)
1631 struct proc
*p
= current_proc();
1637 so_update_last_owner_locked(so
, p
);
1638 so_update_policy(so
);
1641 so_update_necp_policy(so
, NULL
, nam
);
1645 * If this is a listening socket or if this is a previously-accepted
1646 * socket that has been marked as inactive, reject the connect request.
1648 if ((so
->so_options
& SO_ACCEPTCONN
) || (so
->so_flags
& SOF_DEFUNCT
)) {
1650 if (so
->so_flags
& SOF_DEFUNCT
) {
1651 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1652 "(%d)\n", __func__
, proc_pid(p
),
1654 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
1655 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
1658 socket_unlock(so
, 1);
1663 if ((so
->so_restrictions
& SO_RESTRICT_DENY_OUT
) != 0) {
1665 socket_unlock(so
, 1);
1671 * If protocol is connection-based, can only connect once.
1672 * Otherwise, if connected, try to disconnect first.
1673 * This allows user to disconnect by connecting to, e.g.,
1676 if (so
->so_state
& (SS_ISCONNECTED
| SS_ISCONNECTING
) &&
1677 ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
1678 (error
= sodisconnectlocked(so
)))) {
1682 * Run connect filter before calling protocol:
1683 * - non-blocking connect returns before completion;
1685 error
= sflt_connectout(so
, nam
);
1687 if (error
== EJUSTRETURN
) {
1691 error
= (*so
->so_proto
->pr_usrreqs
->pru_connect
)
1694 so
->so_state
&= ~SS_ISCONNECTING
;
1699 socket_unlock(so
, 1);
1705 soconnect(struct socket
*so
, struct sockaddr
*nam
)
1707 return soconnectlock(so
, nam
, 1);
1711 * Returns: 0 Success
1712 * <pru_connect2>:EINVAL[AF_UNIX]
1713 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1714 * <pru_connect2>:??? [other protocol families]
1716 * Notes: <pru_connect2> is not supported by [TCP].
1719 soconnect2(struct socket
*so1
, struct socket
*so2
)
1723 socket_lock(so1
, 1);
1724 if (so2
->so_proto
->pr_lock
) {
1725 socket_lock(so2
, 1);
1728 error
= (*so1
->so_proto
->pr_usrreqs
->pru_connect2
)(so1
, so2
);
1730 socket_unlock(so1
, 1);
1731 if (so2
->so_proto
->pr_lock
) {
1732 socket_unlock(so2
, 1);
1738 soconnectxlocked(struct socket
*so
, struct sockaddr
*src
,
1739 struct sockaddr
*dst
, struct proc
*p
, uint32_t ifscope
,
1740 sae_associd_t aid
, sae_connid_t
*pcid
, uint32_t flags
, void *arg
,
1741 uint32_t arglen
, uio_t auio
, user_ssize_t
*bytes_written
)
1745 so_update_last_owner_locked(so
, p
);
1746 so_update_policy(so
);
1749 * If this is a listening socket or if this is a previously-accepted
1750 * socket that has been marked as inactive, reject the connect request.
1752 if ((so
->so_options
& SO_ACCEPTCONN
) || (so
->so_flags
& SOF_DEFUNCT
)) {
1754 if (so
->so_flags
& SOF_DEFUNCT
) {
1755 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1756 "(%d)\n", __func__
, proc_pid(p
),
1758 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
1759 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
1764 if ((so
->so_restrictions
& SO_RESTRICT_DENY_OUT
) != 0) {
1769 * If protocol is connection-based, can only connect once
1770 * unless PR_MULTICONN is set. Otherwise, if connected,
1771 * try to disconnect first. This allows user to disconnect
1772 * by connecting to, e.g., a null address.
1774 if ((so
->so_state
& (SS_ISCONNECTED
| SS_ISCONNECTING
)) &&
1775 !(so
->so_proto
->pr_flags
& PR_MULTICONN
) &&
1776 ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
1777 (error
= sodisconnectlocked(so
)) != 0)) {
1780 if ((so
->so_proto
->pr_flags
& PR_DATA_IDEMPOTENT
) &&
1781 (flags
& CONNECT_DATA_IDEMPOTENT
)) {
1782 so
->so_flags1
|= SOF1_DATA_IDEMPOTENT
;
1784 if (flags
& CONNECT_DATA_AUTHENTICATED
) {
1785 so
->so_flags1
|= SOF1_DATA_AUTHENTICATED
;
1790 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1791 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1792 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1793 * Case 3 allows user to combine write with connect even if they have
1794 * no use for TFO (such as regular TCP, and UDP).
1795 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1797 if ((so
->so_proto
->pr_flags
& PR_PRECONN_WRITE
) &&
1798 ((flags
& CONNECT_RESUME_ON_READ_WRITE
) || auio
)) {
1799 so
->so_flags1
|= SOF1_PRECONNECT_DATA
;
1803 * If a user sets data idempotent and does not pass an uio, or
1804 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error, reset
1805 * SOF1_DATA_IDEMPOTENT.
1807 if (!(so
->so_flags1
& SOF1_PRECONNECT_DATA
) &&
1808 (so
->so_flags1
& SOF1_DATA_IDEMPOTENT
)) {
1809 /* We should return EINVAL instead perhaps. */
1810 so
->so_flags1
&= ~SOF1_DATA_IDEMPOTENT
;
1814 * Run connect filter before calling protocol:
1815 * - non-blocking connect returns before completion;
1817 error
= sflt_connectout(so
, dst
);
1819 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1820 so
->so_flags1
&= ~SOF1_PRECONNECT_DATA
;
1821 if (error
== EJUSTRETURN
) {
1825 error
= (*so
->so_proto
->pr_usrreqs
->pru_connectx
)
1826 (so
, src
, dst
, p
, ifscope
, aid
, pcid
,
1827 flags
, arg
, arglen
, auio
, bytes_written
);
1829 so
->so_state
&= ~SS_ISCONNECTING
;
1830 if (error
!= EINPROGRESS
) {
1831 so
->so_flags1
&= ~SOF1_PRECONNECT_DATA
;
1841 sodisconnectlocked(struct socket
*so
)
1845 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
1849 if (so
->so_state
& SS_ISDISCONNECTING
) {
1854 error
= (*so
->so_proto
->pr_usrreqs
->pru_disconnect
)(so
);
1856 sflt_notify(so
, sock_evt_disconnected
, NULL
);
1863 /* Locking version */
1865 sodisconnect(struct socket
*so
)
1870 error
= sodisconnectlocked(so
);
1871 socket_unlock(so
, 1);
1876 sodisconnectxlocked(struct socket
*so
, sae_associd_t aid
, sae_connid_t cid
)
1881 * Call the protocol disconnectx handler; let it handle all
1882 * matters related to the connection state of this session.
1884 error
= (*so
->so_proto
->pr_usrreqs
->pru_disconnectx
)(so
, aid
, cid
);
1887 * The event applies only for the session, not for
1888 * the disconnection of individual subflows.
1890 if (so
->so_state
& (SS_ISDISCONNECTING
| SS_ISDISCONNECTED
)) {
1891 sflt_notify(so
, sock_evt_disconnected
, NULL
);
1898 sodisconnectx(struct socket
*so
, sae_associd_t aid
, sae_connid_t cid
)
1903 error
= sodisconnectxlocked(so
, aid
, cid
);
1904 socket_unlock(so
, 1);
1908 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1911 * sosendcheck will lock the socket buffer if it isn't locked and
1912 * verify that there is space for the data being inserted.
1914 * Returns: 0 Success
1916 * sblock:EWOULDBLOCK
1923 sosendcheck(struct socket
*so
, struct sockaddr
*addr
, user_ssize_t resid
,
1924 int32_t clen
, int32_t atomic
, int flags
, int *sblocked
)
1931 if (*sblocked
== 0) {
1932 if ((so
->so_snd
.sb_flags
& SB_LOCK
) != 0 &&
1933 so
->so_send_filt_thread
!= 0 &&
1934 so
->so_send_filt_thread
== current_thread()) {
1936 * We're being called recursively from a filter,
1937 * allow this to continue. Radar 4150520.
1938 * Don't set sblocked because we don't want
1939 * to perform an unlock later.
1943 error
= sblock(&so
->so_snd
, SBLOCKWAIT(flags
));
1945 if (so
->so_flags
& SOF_DEFUNCT
) {
1955 * If a send attempt is made on a socket that has been marked
1956 * as inactive (disconnected), reject the request.
1958 if (so
->so_flags
& SOF_DEFUNCT
) {
1961 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1962 __func__
, proc_selfpid(), proc_best_name(current_proc()),
1963 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
1964 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
1968 if (so
->so_state
& SS_CANTSENDMORE
) {
1971 * Can re-inject data of half closed connections
1973 if ((so
->so_state
& SS_ISDISCONNECTED
) == 0 &&
1974 so
->so_snd
.sb_cfil_thread
== current_thread() &&
1975 cfil_sock_data_pending(&so
->so_snd
) != 0) {
1977 "so %llx ignore SS_CANTSENDMORE",
1978 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
));
1980 #endif /* CONTENT_FILTER */
1984 error
= so
->so_error
;
1989 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
1990 if ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) != 0) {
1991 if (((so
->so_state
& SS_ISCONFIRMING
) == 0) &&
1992 (resid
!= 0 || clen
== 0) &&
1993 !(so
->so_flags1
& SOF1_PRECONNECT_DATA
)) {
1996 } else if (addr
== 0) {
1997 return (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ?
1998 ENOTCONN
: EDESTADDRREQ
;
2002 space
= sbspace(&so
->so_snd
);
2004 if (flags
& MSG_OOB
) {
2007 if ((atomic
&& resid
> so
->so_snd
.sb_hiwat
) ||
2008 clen
> so
->so_snd
.sb_hiwat
) {
2012 if ((space
< resid
+ clen
&&
2013 (atomic
|| (space
< (int32_t)so
->so_snd
.sb_lowat
) ||
2015 (so
->so_type
== SOCK_STREAM
&& so_wait_for_if_feedback(so
))) {
2017 * don't block the connectx call when there's more data
2018 * than can be copied.
2020 if (so
->so_flags1
& SOF1_PRECONNECT_DATA
) {
2024 if (space
< (int32_t)so
->so_snd
.sb_lowat
) {
2028 if ((so
->so_state
& SS_NBIO
) || (flags
& MSG_NBIO
) ||
2032 sbunlock(&so
->so_snd
, TRUE
); /* keep socket locked */
2034 error
= sbwait(&so
->so_snd
);
2036 if (so
->so_flags
& SOF_DEFUNCT
) {
2048 * If send must go all at once and message is larger than
2049 * send buffering, then hard error.
2050 * Lock against other senders.
2051 * If must go all at once and not enough room now, then
2052 * inform user that this would block and do nothing.
2053 * Otherwise, if nonblocking, send as much as possible.
2054 * The data to be sent is described by "uio" if nonzero,
2055 * otherwise by the mbuf chain "top" (which must be null
2056 * if uio is not). Data provided in mbuf chain must be small
2057 * enough to send all at once.
2059 * Returns nonzero on error, timeout or signal; callers
2060 * must check for short counts if EINTR/ERESTART are returned.
2061 * Data and control buffers are freed on return.
2063 * Returns: 0 Success
2069 * sosendcheck:EWOULDBLOCK
2073 * sosendcheck:??? [value from so_error]
2074 * <pru_send>:ECONNRESET[TCP]
2075 * <pru_send>:EINVAL[TCP]
2076 * <pru_send>:ENOBUFS[TCP]
2077 * <pru_send>:EADDRINUSE[TCP]
2078 * <pru_send>:EADDRNOTAVAIL[TCP]
2079 * <pru_send>:EAFNOSUPPORT[TCP]
2080 * <pru_send>:EACCES[TCP]
2081 * <pru_send>:EAGAIN[TCP]
2082 * <pru_send>:EPERM[TCP]
2083 * <pru_send>:EMSGSIZE[TCP]
2084 * <pru_send>:EHOSTUNREACH[TCP]
2085 * <pru_send>:ENETUNREACH[TCP]
2086 * <pru_send>:ENETDOWN[TCP]
2087 * <pru_send>:ENOMEM[TCP]
2088 * <pru_send>:ENOBUFS[TCP]
2089 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2090 * <pru_send>:EINVAL[AF_UNIX]
2091 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2092 * <pru_send>:EPIPE[AF_UNIX]
2093 * <pru_send>:ENOTCONN[AF_UNIX]
2094 * <pru_send>:EISCONN[AF_UNIX]
2095 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2096 * <sf_data_out>:??? [whatever a filter author chooses]
2098 * Notes: Other <pru_send> returns depend on the protocol family; all
2099 * <sf_data_out> returns depend on what the filter author causes
2100 * their filter to return.
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
    struct mbuf *m, *freelist = NULL;
    user_ssize_t space, len, resid, orig_resid;
    int clen = 0, error, dontroute, mlen, sendflags;
    int atomic = sosendallatonce(so) || top;
    struct proc *p = current_proc();
    uint16_t headroom = 0;
    boolean_t en_tracing = FALSE;

    if (uio != NULL) {
        resid = uio_resid(uio);
    } else {
        resid = top->m_pkthdr.len;
    }

    KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
        so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

    /*
     * trace if tracing & network (vs. unix) sockets & and
     */
    if (ENTR_SHOULDTRACE &&
        (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
        struct inpcb *inp = sotoinpcb(so);
        if (inp->inp_last_outifp != NULL &&
            !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
            KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
                VM_KERNEL_ADDRPERM(so),
                ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),

    /*
     * Re-injection should not affect process accounting
     */
    if ((flags & MSG_SKIPCFIL) == 0) {
        so_update_last_owner_locked(so, p);
        so_update_policy(so);
        so_update_necp_policy(so, NULL, addr);

    if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {

    /*
     * In theory resid should be unsigned.
     * However, space must be signed, as it might be less than 0
     * if we over-committed, and we must use a signed comparison
     * of space and resid.  On the other hand, a negative resid
     * causes us to loop sending 0-length segments to the protocol.
     *
     * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
     *
     * Note: We limit resid to be a positive int value as we use
     * imin() to set bytes_to_copy -- radr://14558484
     */
    if (resid < 0 || resid > INT_MAX ||
        (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {

    dontroute = (flags & MSG_DONTROUTE) &&
        (so->so_options & SO_DONTROUTE) == 0 &&
        (so->so_proto->pr_flags & PR_ATOMIC);
    OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

    if (control != NULL) {
        clen = control->m_len;

    if (soreserveheadroom != 0) {
        headroom = so->so_pktheadroom;

    error = sosendcheck(so, addr, resid, clen, atomic, flags,
        &sblocked);

    space = sbspace(&so->so_snd) - clen;
    space += ((flags & MSG_OOB) ? 1024 : 0);

    /*
     * Data is prepackaged in "top".
     */
    if (flags & MSG_EOR) {
        top->m_flags |= M_EOR;

    bytes_to_copy = imin(resid, space);
    bytes_to_alloc = bytes_to_copy;
    bytes_to_alloc += headroom;

    if (sosendminchain > 0) {
        chainlength = sosendmaxchain;

    /*
     * Use big 4 KB cluster when the outgoing interface
     * does not prefer 2 KB clusters
     */
    bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
        sosendbigcl_ignore_capab;

    /*
     * Attempt to use larger than system page-size
     * clusters for large writes only if there is
     * a jumbo cluster pool and if the socket is
     * marked accordingly.
     */
    jumbocl = sosendjcl && njcl > 0 &&
        ((so->so_flags & SOF_MULTIPAGES) ||
        sosendjcl_ignore_capab) &&
        bigcl;

    socket_unlock(so, 0);

    int hdrs_needed = (top == NULL) ? 1 : 0;
    /*
     * try to maintain a local cache of mbuf
     * clusters needed to complete this
     * write; the list is further limited to
     * the number that are currently needed
     * to fill the socket; this mechanism
     * allows a large number of mbufs/
     * clusters to be grabbed under a single
     * mbuf lock... if we can't get any
     * clusters, then fall back to trying
     * for mbufs; if we fail early (or
     * miscalculate the number needed) make
     * sure to release any clusters we
     * haven't yet consumed.
     */
    if (freelist == NULL &&
        bytes_to_alloc > MBIGCLBYTES &&
        jumbocl) {
        num_needed = bytes_to_alloc / M16KCLBYTES;

        if ((bytes_to_alloc -
            (num_needed * M16KCLBYTES))

        freelist = m_getpackets_internal(
            (unsigned int *)&num_needed,
            hdrs_needed, M_WAIT, 0,
        /*
         * Fall back to 4K cluster size
         * if allocation failed
         */
    if (freelist == NULL &&
        bytes_to_alloc > MCLBYTES &&
        bigcl) {
        num_needed = bytes_to_alloc / MBIGCLBYTES;

        if ((bytes_to_alloc -
            (num_needed * MBIGCLBYTES)) >=

        freelist = m_getpackets_internal(
            (unsigned int *)&num_needed,
            hdrs_needed, M_WAIT, 0,
        /*
         * Fall back to cluster size
         * if allocation failed
         */
    /*
     * Allocate a cluster as we want to
     * avoid splitting the data in more
     * than one segment and using MINCLSIZE
     * would lead us to allocate two mbufs
     */
    if (soreserveheadroom != 0 &&
        bytes_to_alloc > _MHLEN) ||
        bytes_to_alloc > _MLEN)) {
        num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
        freelist = m_getpackets_internal(
            (unsigned int *)&num_needed,
            hdrs_needed, M_WAIT, 0,
        /*
         * Fall back to a single mbuf
         * if allocation failed
         */
    } else if (freelist == NULL &&
        bytes_to_alloc > MINCLSIZE) {
        num_needed = bytes_to_alloc / MCLBYTES;

        if ((bytes_to_alloc -
            (num_needed * MCLBYTES)) >=

        freelist = m_getpackets_internal(
            (unsigned int *)&num_needed,
            hdrs_needed, M_WAIT, 0,
        /*
         * Fall back to a single mbuf
         * if allocation failed
         */
    /*
     * For datagram protocols, leave
     * headroom for protocol headers
     * in the first cluster of the chain
     */
    if (freelist != NULL && atomic &&
        top == NULL && headroom > 0) {
        freelist->m_data += headroom;

    /*
     * Fall back to regular mbufs without
     * reserving the socket headroom
     */
    if (freelist == NULL) {
        if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {

    if (freelist == NULL) {

    /*
     * For datagram protocols,
     * leave room for protocol
     * headers in first mbuf.
     */
    if (atomic && top == NULL &&
        bytes_to_copy < MHLEN) {

    freelist = m->m_next;

    if ((m->m_flags & M_EXT)) {
        mlen = m->m_ext.ext_size -
    } else if ((m->m_flags & M_PKTHDR)) {
        mlen = MHLEN - M_LEADINGSPACE(m);
    mlen = MLEN - M_LEADINGSPACE(m);
    len = imin(mlen, bytes_to_copy);

    error = uiomove(mtod(m, caddr_t),

    resid = uio_resid(uio);

    top->m_pkthdr.len += len;

    if (flags & MSG_EOR) {
        top->m_flags |= M_EOR;

    bytes_to_copy = min(resid, space);
    } while (space > 0 &&
        (chainlength < sosendmaxchain || atomic ||
        resid < MINCLSIZE));

    so->so_options |= SO_DONTROUTE;

    /*
     * Compute flags here, for pru_send and NKEs
     *
     * If the user set MSG_EOF, the protocol understands this flag,
     * and there is nothing left to send, then use PRU_SEND_EOF
     * instead of PRU_SEND.
     */
    sendflags = (flags & MSG_OOB) ? PRUS_OOB :
        ((flags & MSG_EOF) &&
        (so->so_proto->pr_flags & PR_IMPLOPCL) &&
        (resid <= 0)) ? PRUS_EOF :
        /* If there is more to send set PRUS_MORETOCOME */
        (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

    if ((flags & MSG_SKIPCFIL) == 0) {
        /*
         * Socket filter processing
         */
        error = sflt_data_out(so, addr, &top,
            &control, (sendflags & MSG_OOB) ?
            sock_data_filt_flag_oob : 0);
        if (error == EJUSTRETURN) {
            goto packet_consumed;

        /*
         * Content filter processing
         */
        error = cfil_sock_data_out(so, addr, top,
            control, sendflags);
        if (error == EJUSTRETURN) {
            goto packet_consumed;
#endif /* CONTENT_FILTER */

    error = (*so->so_proto->pr_usrreqs->pru_send)
        (so, sendflags, top, addr, control, p);

    so->so_options &= ~SO_DONTROUTE;
    } while (resid && space > 0);

    sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
    socket_unlock(so, 1);

    if (control != NULL) {
    if (freelist != NULL) {
        m_freem_list(freelist);
    soclearfastopen(so);

    /* resid passed here is the bytes left in uio */
    KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(so),
        ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
        (int64_t)(orig_resid - resid));
    KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
        so->so_snd.sb_cc, space, error);
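/*
 * Illustrative sketch (not part of the original source): how a write of
 * "bytes_to_alloc" bytes maps onto a count of fixed-size mbuf clusters,
 * mirroring the 16 KB / 4 KB / 2 KB fallback order used by the freelist
 * pre-allocation above.  cluster_count() and min_tail are hypothetical
 * names; the constants stand in for M16KCLBYTES, MBIGCLBYTES and MCLBYTES.
 */
#if 0
static unsigned int
cluster_count(size_t bytes_to_alloc, size_t cluster_size, size_t min_tail)
{
    /* Whole clusters needed, plus one more if the remainder is large. */
    unsigned int num_needed = (unsigned int)(bytes_to_alloc / cluster_size);

    if (bytes_to_alloc - (num_needed * cluster_size) >= min_tail) {
        num_needed++;
    }
    return num_needed;
}
#endif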
static int
sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top,
    struct mbuf *control, uint32_t sendflags)
{
    struct mbuf *m0 = NULL, *control_end = NULL;

    socket_lock_assert_owned(so);

    /*
     * top must point to the mbuf chain to be sent.
     * If control is not NULL, top must be a packet header.
     */
    VERIFY(top != NULL &&
        (control == NULL || top->m_flags & M_PKTHDR));

    /*
     * If control is not passed in, see if we can get it
     */
    if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
        // Locate start of control if present and start of data
        for (m0 = top; m0 != NULL; m0 = m0->m_next) {
            if (m0->m_flags & M_PKTHDR) {
            } else if (m0->m_type == MT_CONTROL) {
                if (control == NULL) {
                    // Found start of control
                if (control != NULL && m0->m_next != NULL &&
                    m0->m_next->m_type != MT_CONTROL) {
                    // Found end of control

    if (control_end != NULL) {
        control_end->m_next = NULL;

    int error = (*so->so_proto->pr_usrreqs->pru_send)
        (so, sendflags, top, addr, control, current_proc());
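/*
 * Illustrative sketch (not part of the original source): a user-space
 * sendmsg(2) call that places ancillary data (SCM_RIGHTS) ahead of the
 * payload on an AF_UNIX socket; this is the kind of record whose control
 * and data portions sosend_reinject() has to tell apart.  "sock" and
 * "fd_to_pass" are assumed to be an existing socket descriptor and an
 * open file descriptor.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static ssize_t
send_fd(int sock, int fd_to_pass)
{
    char payload = 'x';
    struct iovec iov = { .iov_base = &payload, .iov_len = sizeof(payload) };
    union {
        char buf[CMSG_SPACE(sizeof(int))];
        struct cmsghdr align;
    } u;
    struct msghdr msg;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = u.buf;
    msg.msg_controllen = sizeof(u.buf);

    struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
    cm->cmsg_level = SOL_SOCKET;
    cm->cmsg_type = SCM_RIGHTS;
    cm->cmsg_len = CMSG_LEN(sizeof(int));
    memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));

    return sendmsg(sock, &msg, 0);
}
#endif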
/*
 * Supports only connected sockets (no address) without ancillary data
 * (control mbuf) for atomic protocols
 */
int
sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
{
    struct mbuf *m, *freelist = NULL;
    user_ssize_t len, resid;
    int error, dontroute, mlen;
    int atomic = sosendallatonce(so);
    struct proc *p = current_proc();
    struct mbuf *top = NULL;
    uint16_t headroom = 0;

    KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
        so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

    if (so->so_type != SOCK_DGRAM) {
    if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
        error = EPROTONOSUPPORT;
    if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {

    resid = uio_array_resid(uioarray, uiocnt);

    /*
     * In theory resid should be unsigned.
     * However, space must be signed, as it might be less than 0
     * if we over-committed, and we must use a signed comparison
     * of space and resid.  On the other hand, a negative resid
     * causes us to loop sending 0-length segments to the protocol.
     *
     * Note: We limit resid to be a positive int value as we use
     * imin() to set bytes_to_copy -- radr://14558484
     */
    if (resid < 0 || resid > INT_MAX) {

    so_update_last_owner_locked(so, p);
    so_update_policy(so);
    so_update_necp_policy(so, NULL, NULL);

    dontroute = (flags & MSG_DONTROUTE) &&
        (so->so_options & SO_DONTROUTE) == 0 &&
        (so->so_proto->pr_flags & PR_ATOMIC);
    OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

    error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);

    /*
     * Use big 4 KB clusters when the outgoing interface does not prefer
     * 2 KB clusters
     */
    bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;

    if (soreserveheadroom != 0) {
        headroom = so->so_pktheadroom;

    size_t maxpktlen = 0;

    if (sosendminchain > 0) {
        chainlength = sosendmaxchain;

    socket_unlock(so, 0);

    /*
     * Find a set of uio that fit in a reasonable number
     * of mbuf packets
     */
    for (i = uiofirst; i < uiocnt; i++) {
        struct uio *auio = uioarray[i];

        len = uio_resid(auio);

        /* Do nothing for empty messages */

        if (len > maxpktlen) {
        if (chainlength > sosendmaxchain) {

    /*
     * Nothing left to send
     */
    if (num_needed == 0) {

    /*
     * Allocate buffer large enough to include headroom space for
     * network and link header
     */
    bytes_to_alloc = maxpktlen + headroom;

    /*
     * Allocate a single contiguous buffer of the smallest available
     * size when possible
     */
    if (bytes_to_alloc > MCLBYTES &&
        bytes_to_alloc <= MBIGCLBYTES && bigcl) {
        freelist = m_getpackets_internal(
            (unsigned int *)&num_needed,
            num_needed, M_WAIT, 1,
    } else if (bytes_to_alloc > _MHLEN &&
        bytes_to_alloc <= MCLBYTES) {
        freelist = m_getpackets_internal(
            (unsigned int *)&num_needed,
            num_needed, M_WAIT, 1,
        freelist = m_allocpacket_internal(
            (unsigned int *)&num_needed,
            bytes_to_alloc, NULL, M_WAIT, 1, 0);

    if (freelist == NULL) {

    /*
     * Copy each uio of the set into its own mbuf packet
     */
    for (i = uiofirst, m = freelist;
        i < uiolast && m != NULL;
        struct uio *auio = uioarray[i];

        bytes_to_copy = uio_resid(auio);

        /* Do nothing for empty messages */
        if (bytes_to_copy == 0) {

        /*
         * Leave headroom for protocol headers
         * in the first mbuf of the chain
         */
        m->m_data += headroom;

        for (n = m; n != NULL; n = n->m_next) {
            if ((m->m_flags & M_EXT)) {
                mlen = m->m_ext.ext_size -
            } else if ((m->m_flags & M_PKTHDR)) {
                mlen = MHLEN - M_LEADINGSPACE(m);
            mlen = MLEN - M_LEADINGSPACE(m);
            len = imin(mlen, bytes_to_copy);

            /*
             * Note: uiomove() decrements the iovec
             */
            error = uiomove(mtod(n, caddr_t),

            m->m_pkthdr.len += len;
            VERIFY(m->m_pkthdr.len <= maxpktlen);

            bytes_to_copy -= len;

        if (m->m_pkthdr.len == 0) {
            "%s:%d so %llx pkt %llx type %u len null\n",
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(m),

    so->so_options |= SO_DONTROUTE;

    if ((flags & MSG_SKIPCFIL) == 0) {
        struct mbuf **prevnextp = NULL;

        for (i = uiofirst, m = top;
            i < uiolast && m != NULL;
            struct mbuf *nextpkt = m->m_nextpkt;

            /*
             * Socket filter processing
             */
            error = sflt_data_out(so, NULL, &m,
            if (error != 0 && error != EJUSTRETURN) {

            /*
             * Content filter processing
             */
            error = cfil_sock_data_out(so, NULL, m,
            if (error != 0 && error != EJUSTRETURN) {
#endif /* CONTENT_FILTER */
            /*
             * Remove packet from the list when
             * swallowed by a filter
             */
            if (error == EJUSTRETURN) {
                if (prevnextp != NULL) {
                    *prevnextp = nextpkt;

            prevnextp = &m->m_nextpkt;

    error = (*so->so_proto->pr_usrreqs->pru_send_list)
        (so, 0, top, NULL, NULL, p);

    so->so_options &= ~SO_DONTROUTE;
    } while (resid > 0 && error == 0);

    sbunlock(&so->so_snd, FALSE);   /* will unlock socket */
    socket_unlock(so, 1);

    if (freelist != NULL) {
        m_freem_list(freelist);

    KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
        so->so_snd.sb_cc, 0, error);
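/*
 * Illustrative sketch (not part of the original source): the unbatched
 * user-space equivalent of what sosend_list() amortizes in the kernel,
 * i.e. one send(2) per datagram on a connected SOCK_DGRAM socket.
 * "sock", "bufs", "lens" and "count" are assumed inputs.
 */
#if 0
#include <sys/socket.h>

static int
send_datagrams(int sock, const void *const *bufs, const size_t *lens, int count)
{
    for (int i = 0; i < count; i++) {
        /* Each datagram is sent atomically or not at all. */
        if (send(sock, bufs[i], lens[i], 0) == -1) {
            return -1;
        }
    }
    return 0;
}
#endif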
/*
 * May return ERESTART when packet is dropped by MAC policy check
 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
    struct mbuf *m = *mp;
    struct mbuf *nextrecord = *nextrecordp;

    KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
    /*
     * Call the MAC framework for policy checking if we're in
     * the user process context and the socket isn't connected.
     */
    if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
        struct mbuf *m0 = m;
        /*
         * Dequeue this record (temporarily) from the receive
         * list since we're about to drop the socket's lock
         * where a new record may arrive and be appended to
         * the list.  Upon MAC policy failure, the record
         * will be freed.  Otherwise, we'll add it back to
         * the head of the list.  We cannot rely on SB_LOCK
         * because append operation uses the socket's lock.
         */
            m->m_nextpkt = NULL;
            sbfree(&so->so_rcv, m);
        } while (m != NULL);

        so->so_rcv.sb_mb = nextrecord;
        SB_EMPTY_FIXUP(&so->so_rcv);
        SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
        socket_unlock(so, 0);

        if (mac_socket_check_received(proc_ucred(p), so,
            mtod(m, struct sockaddr *)) != 0) {
            /*
             * MAC policy failure; free this record and
             * process the next record (or block until
             * one is available).  We have adjusted sb_cc
             * and sb_mbcnt above so there is no need to
             * call sbfree() again.
             */
            /*
             * Clear SB_LOCK but don't unlock the socket.
             * Process the next record or wait for one.
             */
            sbunlock(&so->so_rcv, TRUE);    /* stay locked */

        /*
         * If the socket has been defunct'd, drop it.
         */
        if (so->so_flags & SOF_DEFUNCT) {

        /*
         * Re-adjust the socket receive list and re-enqueue
         * the record in front of any packets which may have
         * been appended while we dropped the lock.
         */
        for (m = m0; m->m_next != NULL; m = m->m_next) {
            sballoc(&so->so_rcv, m);
        sballoc(&so->so_rcv, m);
        if (so->so_rcv.sb_mb == NULL) {
            so->so_rcv.sb_lastrecord = m0;
            so->so_rcv.sb_mbtail = m;
        nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
        so->so_rcv.sb_mb = m;
        SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
#endif /* CONFIG_MACF_SOCKET_SUBSET */

    *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
    if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
        error = EWOULDBLOCK;
    if (flags & MSG_PEEK) {
        sbfree(&so->so_rcv, m);
        if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
            panic("%s: about to create invalid socketbuf",
        MFREE(m, so->so_rcv.sb_mb);
        m = so->so_rcv.sb_mb;
        m->m_nextpkt = nextrecord;
        so->so_rcv.sb_mb = nextrecord;
        SB_EMPTY_FIXUP(&so->so_rcv);

    *nextrecordp = nextrecord;
/*
 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created
 * so clear the data portion in order not to leak the file pointers
 */
static void
sopeek_scm_rights(struct mbuf *rights)
{
    struct cmsghdr *cm = mtod(rights, struct cmsghdr *);

    if (cm->cmsg_type == SCM_RIGHTS) {
        memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
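/*
 * Illustrative sketch (not part of the original source): the user-space view
 * of an SCM_RIGHTS message.  A regular recvmsg(2) yields a usable descriptor
 * in the cmsg payload, whereas a MSG_PEEK pass over the same record sees the
 * payload zeroed by sopeek_scm_rights() above, because the descriptors have
 * not been externalized yet.  "sock" is an assumed AF_UNIX socket.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
recv_fd(int sock)
{
    char payload;
    struct iovec iov = { .iov_base = &payload, .iov_len = sizeof(payload) };
    union {
        char buf[CMSG_SPACE(sizeof(int))];
        struct cmsghdr align;
    } u;
    struct msghdr msg;
    int fd = -1;

    memset(&msg, 0, sizeof(msg));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = u.buf;
    msg.msg_controllen = sizeof(u.buf);

    if (recvmsg(sock, &msg, 0) == -1) {
        return -1;
    }
    struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
    if (cm != NULL && cm->cmsg_level == SOL_SOCKET &&
        cm->cmsg_type == SCM_RIGHTS) {
        memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
    }
    return fd;
}
#endif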
/*
 * Process one or more MT_CONTROL mbufs present before any data mbufs
 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 * just copy the data; if !MSG_PEEK, we call into the protocol to
 * perform externalization.
 */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
    struct mbuf *cm = NULL, *cmn;
    struct mbuf **cme = &cm;
    struct sockbuf *sb_rcv = &so->so_rcv;
    struct mbuf **msgpcm = NULL;
    struct mbuf *m = *mp;
    struct mbuf *nextrecord = *nextrecordp;
    struct protosw *pr = so->so_proto;

    /*
     * Externalizing the control messages would require us to
     * drop the socket's lock below.  Once we re-acquire the
     * lock, the mbuf chain might change.  In order to preserve
     * consistency, we unlink all control messages from the
     * first mbuf chain in one shot and link them separately
     * onto a different chain.
     */
    if (flags & MSG_PEEK) {
        if (controlp != NULL) {
            if (*controlp == NULL) {
            *controlp = m_copy(m, 0, m->m_len);
            /*
             * If we failed to allocate an mbuf,
             * release any previously allocated
             * mbufs for control data.  Return
             * an error.  Keep the mbufs in the
             * socket as this is using MSG_PEEK.
             */
            if (*controlp == NULL) {

            sopeek_scm_rights(*controlp);

            controlp = &(*controlp)->m_next;
        m->m_nextpkt = NULL;
        sb_rcv->sb_mb = m->m_next;
        cme = &(*cme)->m_next;
    } while (m != NULL && m->m_type == MT_CONTROL);

    if (!(flags & MSG_PEEK)) {
        if (sb_rcv->sb_mb != NULL) {
            sb_rcv->sb_mb->m_nextpkt = nextrecord;
            sb_rcv->sb_mb = nextrecord;
            SB_EMPTY_FIXUP(sb_rcv);
        if (nextrecord == NULL) {
            sb_rcv->sb_lastrecord = m;

    SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
    SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

    while (cm != NULL) {
        cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

        /*
         * Call the protocol to externalize SCM_RIGHTS message
         * and return the modified message to the caller upon
         * success.  Otherwise, all other control messages are
         * returned unmodified to the caller.  Note that we
         * only get into this loop if MSG_PEEK is not set.
         */
        if (pr->pr_domain->dom_externalize != NULL &&
            cmsg_type == SCM_RIGHTS) {
            /*
             * Release socket lock: see 3903171.  This
             * would also allow more records to be appended
             * to the socket buffer.  We still have SB_LOCK
             * set on it, so we can be sure that the head
             * of the mbuf chain won't change.
             */
            socket_unlock(so, 0);
            error = (*pr->pr_domain->dom_externalize)(cm);

        if (controlp != NULL && error == 0) {
            controlp = &(*controlp)->m_next;

    /*
     * Update the value of nextrecord in case we received new
     * records when the socket was unlocked above for
     * externalizing SCM_RIGHTS.
     */
        nextrecord = sb_rcv->sb_mb->m_nextpkt;
        nextrecord = sb_rcv->sb_mb;

    *nextrecordp = nextrecord;
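/*
 * Illustrative sketch (not part of the original source): how a user-space
 * caller walks the control messages that soreceive_ctl() copied or
 * externalized, using the standard CMSG_FIRSTHDR/CMSG_NXTHDR iteration over
 * a filled-in msghdr "msg".
 */
#if 0
#include <sys/socket.h>

static void
walk_cmsgs(struct msghdr *msg)
{
    for (struct cmsghdr *cm = CMSG_FIRSTHDR(msg); cm != NULL;
        cm = CMSG_NXTHDR(msg, cm)) {
        /* cm->cmsg_level and cm->cmsg_type identify the message;
         * CMSG_DATA(cm) points at its payload. */
    }
}
#endif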
/*
 * If we have less data than requested, block awaiting more
 * (subject to any timeout) if:
 *   1. the current count is less than the low water mark, or
 *   2. MSG_WAITALL is set, and it is possible to do the entire
 *      receive operation at once if we block (resid <= hiwat).
 *   3. MSG_DONTWAIT is not set
 * If MSG_WAITALL is set but resid is larger than the receive buffer,
 * we have to do the receive in sections, and thus risk returning
 * a short count if a timeout or signal occurs after we start.
 */
so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
{
    struct protosw *pr = so->so_proto;

    /* No mbufs in the receive-queue? Wait! */

    /* Not enough data in the receive socket-buffer - we may have to wait */
    if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
        m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
        /*
         * Application did set the lowater-mark, so we should wait for
         * this data to be present.
         */
        if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {

        /*
         * Application wants all the data - so let's try to do the
         * receive-operation at once by waiting for everything to
         * be there.
         */
        if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
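/*
 * Illustrative sketch (not part of the original source): the two user-visible
 * knobs that feed the wait decision above.  SO_RCVLOWAT raises the low-water
 * mark that sb_cc is compared against, and MSG_WAITALL asks for the whole
 * request before returning.  "sock", "buf" and "len" are assumed inputs.
 */
#if 0
#include <sys/socket.h>

static ssize_t
recv_at_least(int sock, void *buf, size_t len, int lowat)
{
    /* Don't wake the reader until at least "lowat" bytes are queued. */
    (void)setsockopt(sock, SOL_SOCKET, SO_RCVLOWAT, &lowat, sizeof(lowat));

    /* Block until the full "len" bytes arrive, an error, or EOF. */
    return recv(sock, buf, len, MSG_WAITALL);
}
#endif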
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 *
 * Returns:	0			Success
 *
 *	sblock:EWOULDBLOCK
 *	sodelayed_copy:EFAULT
 *	<pru_rcvoob>:EINVAL[TCP]
 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
 *	<pr_domain->dom_externalize>:???
 *
 * Notes:	Additional return values from calls through <pru_rcvoob> and
 *		<pr_domain->dom_externalize> depend on protocols other than
 *		TCP or AF_UNIX, which are documented above.
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
    struct mbuf *m, **mp, *ml = NULL;
    struct mbuf *nextrecord, *free_list;
    int flags, error, offset;
    struct protosw *pr = so->so_proto;
    user_ssize_t orig_resid = uio_resid(uio);
    user_ssize_t delayed_copy_len;
    struct proc *p = current_proc();
    boolean_t en_tracing = FALSE;

    /*
     * Sanity check on the length passed by caller as we are making 'int'
     * comparisons
     */
    if (orig_resid < 0 || orig_resid > INT_MAX) {

    KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
        uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
        so->so_rcv.sb_hiwat);

    so_update_last_owner_locked(so, p);
    so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
    if (so->so_usecount == 1) {
        panic("%s: so=%x no other reference on socket\n", __func__, so);

    if (controlp != NULL) {
    if (flagsp != NULL) {
        flags = *flagsp & ~MSG_EOR;

    /*
     * If a recv attempt is made on a previously-accepted socket
     * that has been marked as inactive (disconnected), reject
     * the request.
     */
    if (so->so_flags & SOF_DEFUNCT) {
        struct sockbuf *sb = &so->so_rcv;

        SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
            __func__, proc_pid(p), proc_best_name(p),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so), error);
        /*
         * This socket should have been disconnected and flushed
         * prior to being returned from sodefunct(); there should
         * be no data on its receive list, so panic otherwise.
         */
        if (so->so_state & SS_DEFUNCT) {
            sb_empty_assert(sb, __func__);
        socket_unlock(so, 1);

    if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
        pr->pr_usrreqs->pru_preconnect) {
        /*
         * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
         * call write() right after this.  *If* the app calls a read
         * we do not want to block this read indefinitely.  Thus,
         * we trigger a connect so that the session gets initiated.
         */
        error = (*pr->pr_usrreqs->pru_preconnect)(so);
            socket_unlock(so, 1);

    if (ENTR_SHOULDTRACE &&
        (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
        /*
         * enable energy tracing for inet sockets that go over
         * non-loopback interfaces only.
         */
        struct inpcb *inp = sotoinpcb(so);
        if (inp->inp_last_outifp != NULL &&
            !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
            KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
                VM_KERNEL_ADDRPERM(so),
                ((so->so_state & SS_NBIO) ?
                kEnTrFlagNonBlocking : 0),
                (int64_t)orig_resid);

    /*
     * When SO_WANTOOBFLAG is set we try to get out-of-band data
     * regardless of the flags argument.  Here is the case where
     * out-of-band data is not inline.
     */
    if ((flags & MSG_OOB) ||
        ((so->so_options & SO_WANTOOBFLAG) != 0 &&
        (so->so_options & SO_OOBINLINE) == 0 &&
        (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
        m = m_get(M_WAIT, MT_DATA);
            socket_unlock(so, 1);
            KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
                ENOBUFS, 0, 0, 0, 0);
        error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);

            socket_unlock(so, 0);
            error = uiomove(mtod(m, caddr_t),
                imin(uio_resid(uio), m->m_len), uio);
        } while (uio_resid(uio) && error == 0 && m != NULL);

        if ((so->so_options & SO_WANTOOBFLAG) != 0) {
            if (error == EWOULDBLOCK || error == EINVAL) {
                /*
                 * Let's try to get normal data:
                 * EWOULDBLOCK: out-of-band data not
                 * received yet.  EINVAL: out-of-band data
                 * already read.
                 */
        } else if (error == 0 && flagsp != NULL) {
        socket_unlock(so, 1);
            KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
                VM_KERNEL_ADDRPERM(so), 0,
                (int64_t)(orig_resid - uio_resid(uio)));
        KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,

    if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
        (*pr->pr_usrreqs->pru_rcvd)(so, 0);

    delayed_copy_len = 0;

#ifdef MORE_LOCKING_DEBUG
    if (so->so_usecount <= 1) {
        printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);

    /*
     * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
     * and if so just return to the caller.  This could happen when
     * soreceive() is called by a socket upcall function during the
     * time the socket is freed.  The socket buffer would have been
     * locked across the upcall, therefore we cannot put this thread
     * to sleep (else we will deadlock) or return EWOULDBLOCK (else
     * we may livelock), because the lock on the socket buffer will
     * only be released when the upcall routine returns to its caller.
     * Because the socket has been officially closed, there can be
     * no further read on it.
     *
     * A multipath subflow socket would have its SS_NOFDREF set by
     * default, so check for SOF_MP_SUBFLOW socket flag; when the
     * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
     */
    if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
        (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
        socket_unlock(so, 1);

    error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
        socket_unlock(so, 1);
        KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
            KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
                VM_KERNEL_ADDRPERM(so), 0,
                (int64_t)(orig_resid - uio_resid(uio)));

    m = so->so_rcv.sb_mb;
    if (so_should_wait(so, uio, m, flags)) {
        /*
         * Panic if we notice inconsistencies in the socket's
         * receive list; both sb_mb and sb_cc should correctly
         * reflect the contents of the list, otherwise we may
         * end up with false positives during select() or poll()
         * which could put the application in a bad state.
         */
        SB_MB_CHECK(&so->so_rcv);

            error = so->so_error;
            if ((flags & MSG_PEEK) == 0) {
        if (so->so_state & SS_CANTRCVMORE) {
            /*
             * Deal with half closed connections
             */
            if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
                cfil_sock_data_pending(&so->so_rcv) != 0) {
                    "so %llx ignore SS_CANTRCVMORE",
                    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
#endif /* CONTENT_FILTER */
        for (; m != NULL; m = m->m_next) {
            if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
                m = so->so_rcv.sb_mb;
        if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
            (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
        if (uio_resid(uio) == 0) {

        if ((so->so_state & SS_NBIO) ||
            (flags & (MSG_DONTWAIT | MSG_NBIO))) {
            error = EWOULDBLOCK;
        SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
        sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
            printf("Waiting for socket data\n");

        /*
         * Depending on the protocol (e.g. TCP), the following
         * might cause the socket lock to be dropped and later
         * be reacquired, and more data could have arrived and
         * have been appended to the receive socket buffer by
         * the time it returns.  Therefore, we sleep in
         * sbwait() below only if the wait-condition is still
         * satisfied.
         */
        if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
            (*pr->pr_usrreqs->pru_rcvd)(so, flags);

        if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
            error = sbwait(&so->so_rcv);
#if EVEN_MORE_LOCKING_DEBUG
            printf("SORECEIVE - sbwait returned %d\n", error);
        if (so->so_usecount < 1) {
            panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
                __func__, so, so->so_usecount);
            socket_unlock(so, 1);
            KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
                KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
                    VM_KERNEL_ADDRPERM(so), 0,
                    (int64_t)(orig_resid - uio_resid(uio)));

    OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
    SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
    SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
    nextrecord = m->m_nextpkt;

    if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
        error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
        if (error == ERESTART) {
        } else if (error != 0) {

    /*
     * Process one or more MT_CONTROL mbufs present before any data mbufs
     * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
     * just copy the data; if !MSG_PEEK, we call into the protocol to
     * perform externalization.
     */
    if (m != NULL && m->m_type == MT_CONTROL) {
        error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);

        if (!(flags & MSG_PEEK)) {
            /*
             * We get here because m points to an mbuf following
             * any MT_SONAME or MT_CONTROL mbufs which have been
             * processed above.  In any case, m should be pointing
             * to the head of the mbuf chain, and the nextrecord
             * should be either NULL or equal to m->m_nextpkt.
             * See comments above about SB_LOCK.
             */
            if (m != so->so_rcv.sb_mb ||
                m->m_nextpkt != nextrecord) {
                panic("%s: post-control !sync so=%p m=%p "
                    "nextrecord=%p\n", __func__, so, m,
            if (nextrecord == NULL) {
                so->so_rcv.sb_lastrecord = m;

        if (type == MT_OOBDATA) {
        if (!(flags & MSG_PEEK)) {
            SB_EMPTY_FIXUP(&so->so_rcv);
        SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

    if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {

        (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
        if (m->m_type == MT_OOBDATA) {
            if (type != MT_OOBDATA) {
        } else if (type == MT_OOBDATA) {
        if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
            m->m_type != MT_HEADER) {
        /*
         * Make sure to always set MSG_OOB event when getting
         * out of band data inline.
         */
        if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
            (so->so_options & SO_OOBINLINE) != 0 &&
            (so->so_state & SS_RCVATMARK) != 0) {

        so->so_state &= ~SS_RCVATMARK;
        len = uio_resid(uio) - delayed_copy_len;
        if (so->so_oobmark && len > so->so_oobmark - offset) {
            len = so->so_oobmark - offset;
        if (len > m->m_len - moff) {
            len = m->m_len - moff;
        /*
         * If mp is set, just pass back the mbufs.
         * Otherwise copy them out via the uio, then free.
         * Sockbuf must be consistent here (points to current mbuf,
         * it points to next record) when we drop priority;
         * we must note any additions to the sockbuf when we
         * block interrupts again.
         */
            SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
            SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
            if (can_delay && len == m->m_len) {
                /*
                 * only delay the copy if we're consuming the
                 * mbuf and we're NOT in MSG_PEEK mode
                 * and we have enough data to make it worthwhile
                 * to drop and retake the lock... can_delay
                 * reflects the state of the 2 latter
                 * constraints; moff should always be zero
                 * in these cases
                 */
                delayed_copy_len += len;
                if (delayed_copy_len) {
                    error = sodelayed_copy(so, uio,
                        &free_list, &delayed_copy_len);
                    /*
                     * can only get here if MSG_PEEK is not
                     * set; therefore, m should point at the
                     * head of the rcv queue; if it doesn't,
                     * it means something drastically
                     * changed while we were out from behind
                     * the lock in sodelayed_copy.  perhaps
                     * a RST on the stream.  in any event,
                     * the stream has been interrupted.  it's
                     * probably best just to return whatever
                     * data we've moved and let the caller
                     * sort it out...
                     */
                    if (m != so->so_rcv.sb_mb) {

                socket_unlock(so, 0);
                error = uiomove(mtod(m, caddr_t) + moff,

            uio_setresid(uio, (uio_resid(uio) - len));

        if (len == m->m_len - moff) {
            if (m->m_flags & M_EOR) {
            if (flags & MSG_PEEK) {
                nextrecord = m->m_nextpkt;
                sbfree(&so->so_rcv, m);
                m->m_nextpkt = NULL;

                    so->so_rcv.sb_mb = m = m->m_next;
                    if (free_list == NULL) {
                    so->so_rcv.sb_mb = m = m->m_next;
                    m->m_nextpkt = nextrecord;
                    if (nextrecord == NULL) {
                        so->so_rcv.sb_lastrecord = m;
                    so->so_rcv.sb_mb = nextrecord;
                    SB_EMPTY_FIXUP(&so->so_rcv);
                SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
                SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
            if (flags & MSG_PEEK) {
                if (flags & MSG_DONTWAIT) {
                    copy_flag = M_DONTWAIT;
                *mp = m_copym(m, 0, len, copy_flag);
                    /*
                     * Failed to allocate an mbuf?
                     * Adjust uio_resid back, it was
                     * adjusted down by len bytes which
                     * we didn't copy over.
                     */
                        (uio_resid(uio) + len));
                so->so_rcv.sb_cc -= len;

        if (so->so_oobmark) {
            if ((flags & MSG_PEEK) == 0) {
                so->so_oobmark -= len;
                if (so->so_oobmark == 0) {
                    so->so_state |= SS_RCVATMARK;
                if (offset == so->so_oobmark) {
        if (flags & MSG_EOR) {
        /*
         * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
         * (for non-atomic socket), we must not quit until
         * "uio->uio_resid == 0" or an error termination.
         * If a signal/timeout occurs, return with a short
         * count but without error.  Keep sockbuf locked
         * against other readers.
         */
        while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
            (uio_resid(uio) - delayed_copy_len) > 0 &&
            !sosendallatonce(so) && !nextrecord) {
            if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
                && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */

            /*
             * Depending on the protocol (e.g. TCP), the following
             * might cause the socket lock to be dropped and later
             * be reacquired, and more data could have arrived and
             * have been appended to the receive socket buffer by
             * the time it returns.  Therefore, we sleep in
             * sbwait() below only if the socket buffer is
             * empty, in order to avoid a false sleep.
             */
            if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
                (*pr->pr_usrreqs->pru_rcvd)(so, flags);

            SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
            SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

            if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
            /*
             * have to wait until after we get back from the sbwait
             * to do the copy because we will drop the lock if we
             * have enough data that has been delayed... by dropping
             * the lock we open up a window allowing the netisr
             * thread to process the incoming packets and to change
             * the state of this socket... we're issuing the sbwait
             * because the socket is empty and we're expecting the
             * netisr thread to wake us up when more packets arrive;
             * if we allow that processing to happen and then sbwait
             * we could stall forever with packets sitting in the
             * socket if no further packets arrive from the remote
             * side.
             *
             * we want to copy before we've collected all the data
             * to satisfy this request to allow the copy to overlap
             * the incoming packet processing on an MP system
             */
            if (delayed_copy_len > sorecvmincopy &&
                (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
                error = sodelayed_copy(so, uio,
                    &free_list, &delayed_copy_len);
            m = so->so_rcv.sb_mb;
                nextrecord = m->m_nextpkt;
            SB_MB_CHECK(&so->so_rcv);

#ifdef MORE_LOCKING_DEBUG
    if (so->so_usecount <= 1) {
        panic("%s: after big while so=%p ref=%d on socket\n",
            __func__, so, so->so_usecount);

    if (m != NULL && pr->pr_flags & PR_ATOMIC) {
        if (so->so_options & SO_DONTTRUNC) {
            flags |= MSG_RCVMORE;
        if ((flags & MSG_PEEK) == 0) {
            (void) sbdroprecord(&so->so_rcv);

    /*
     * pru_rcvd below (for TCP) may cause more data to be received
     * if the socket lock is dropped prior to sending the ACK; some
     * legacy OpenTransport applications don't handle this well
     * (if it receives less data than requested while MSG_HAVEMORE
     * is set), and so we set the flag now based on what we know
     * prior to calling pru_rcvd.
     */
    if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
        flags |= MSG_HAVEMORE;

    if ((flags & MSG_PEEK) == 0) {
        so->so_rcv.sb_mb = nextrecord;
        /*
         * First part is an inline SB_EMPTY_FIXUP().  Second
         * part makes sure sb_lastrecord is up-to-date if
         * there is still data in the socket buffer.
         */
        if (so->so_rcv.sb_mb == NULL) {
            so->so_rcv.sb_mbtail = NULL;
            so->so_rcv.sb_lastrecord = NULL;
        } else if (nextrecord->m_nextpkt == NULL) {
            so->so_rcv.sb_lastrecord = nextrecord;
        SB_MB_CHECK(&so->so_rcv);
        SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
        if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
            (*pr->pr_usrreqs->pru_rcvd)(so, flags);

    if (delayed_copy_len) {
        error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
    if (free_list != NULL) {
        m_freem_list(free_list);

    if (orig_resid == uio_resid(uio) && orig_resid &&
        (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
        sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */

    if (flagsp != NULL) {

#ifdef MORE_LOCKING_DEBUG
    if (so->so_usecount <= 1) {
        panic("%s: release so=%p ref=%d on socket\n", __func__,
            so, so->so_usecount);

    if (delayed_copy_len) {
        error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
    if (free_list != NULL) {
        m_freem_list(free_list);

    sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */

    KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
        VM_KERNEL_ADDRPERM(so),
        ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
        (int64_t)(orig_resid - uio_resid(uio)));
    KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
        so->so_rcv.sb_cc, 0, error);
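/*
 * Illustrative sketch (not part of the original source): the user-space side
 * of the out-of-band handling at the top of soreceive().  With SO_OOBINLINE
 * off, urgent TCP data is fetched separately with MSG_OOB; with it on, the
 * byte is delivered in the normal stream.  "sock" is an assumed connected
 * TCP socket.
 */
#if 0
#include <sys/socket.h>

static int
read_urgent_byte(int sock, char *out)
{
    int inline_off = 0;

    (void)setsockopt(sock, SOL_SOCKET, SO_OOBINLINE, &inline_off,
        sizeof(inline_off));
    /* EINVAL or EWOULDBLOCK here mean no urgent data is pending yet. */
    return (int)recv(sock, out, 1, MSG_OOB);
}
#endif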
/*
 * Returns:	0			Success
 */
static int
sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
    user_ssize_t *resid)
{
    socket_unlock(so, 0);

    while (m != NULL && error == 0) {
        error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);

    m_freem_list(*free_list);
static int
sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
    u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
{
    struct mbuf *ml, *m;

    for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
        ml = ml->m_nextpkt, i++) {
        auio = msgarray[i].uio;
        for (m = ml; m != NULL; m = m->m_next) {
            error = uiomove(mtod(m, caddr_t), m->m_len, auio);

    m_freem_list(*free_list);
int
soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
    int *flagsp)
{
    struct mbuf *nextrecord;
    struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
    user_ssize_t len, pktlen, delayed_copy_len = 0;
    struct protosw *pr = so->so_proto;
    struct proc *p = current_proc();
    struct uio *auio = NULL;
    struct sockaddr **psa = NULL;
    struct mbuf **controlp = NULL;
    struct mbuf *free_others = NULL;

    KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
        so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);

    /*
     * - Only supports don't wait flags
     * - Only support datagram sockets (could be extended to raw)
     * - Protocol must support packet chains
     * - The uio array is NULL (should we panic?)
     */
    if (flagsp != NULL) {
    if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
        printf("%s invalid flags 0x%x\n", __func__, flags);
    if (so->so_type != SOCK_DGRAM) {
    if (sosendallatonce(so) == 0) {
    if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
        error = EPROTONOSUPPORT;
    if (msgarray == NULL) {
        printf("%s uioarray is NULL\n", __func__);
        printf("%s uiocnt is 0\n", __func__);

    /*
     * Sanity check on the length passed by caller as we are making 'int'
     * comparisons
     */
    resid = recv_msg_array_resid(msgarray, uiocnt);
    if (resid < 0 || resid > INT_MAX) {

    if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {

    so_update_last_owner_locked(so, p);
    so_update_policy(so);
    so_update_necp_policy(so, NULL, NULL);

    /*
     * If a recv attempt is made on a previously-accepted socket
     * that has been marked as inactive (disconnected), reject
     * the request.
     */
    if (so->so_flags & SOF_DEFUNCT) {
        struct sockbuf *sb = &so->so_rcv;

        SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
            __func__, proc_pid(p), proc_best_name(p),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so), error);
        /*
         * This socket should have been disconnected and flushed
         * prior to being returned from sodefunct(); there should
         * be no data on its receive list, so panic otherwise.
         */
        if (so->so_state & SS_DEFUNCT) {
            sb_empty_assert(sb, __func__);

    /*
     * The uio may be empty
     */
    if (npkts >= uiocnt) {

    /*
     * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
     * and if so just return to the caller.  This could happen when
     * soreceive() is called by a socket upcall function during the
     * time the socket is freed.  The socket buffer would have been
     * locked across the upcall, therefore we cannot put this thread
     * to sleep (else we will deadlock) or return EWOULDBLOCK (else
     * we may livelock), because the lock on the socket buffer will
     * only be released when the upcall routine returns to its caller.
     * Because the socket has been officially closed, there can be
     * no further read on it.
     */
    if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
        (SS_NOFDREF | SS_CANTRCVMORE)) {

    error = sblock(&so->so_rcv, SBLOCKWAIT(flags));

    m = so->so_rcv.sb_mb;
    /*
     * Block awaiting more datagram if needed
     */
    if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
        (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
        ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
        /*
         * Panic if we notice inconsistencies in the socket's
         * receive list; both sb_mb and sb_cc should correctly
         * reflect the contents of the list, otherwise we may
         * end up with false positives during select() or poll()
         * which could put the application in a bad state.
         */
        SB_MB_CHECK(&so->so_rcv);

            error = so->so_error;
            if ((flags & MSG_PEEK) == 0) {
        if (so->so_state & SS_CANTRCVMORE) {
        if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
            (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
        if ((so->so_state & SS_NBIO) ||
            (flags & (MSG_DONTWAIT | MSG_NBIO))) {
            error = EWOULDBLOCK;
        /*
         * Do not block if we got some data
         */
        if (free_list != NULL) {

        SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

        sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
        error = sbwait(&so->so_rcv);

    OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
    SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
    SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");

    /*
     * Consume the current uio index as we have a datagram
     */
    auio = msgarray[npkts].uio;
    resid = uio_resid(auio);
    msgarray[npkts].which |= SOCK_MSG_DATA;
    psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
        &msgarray[npkts].psa : NULL;
    controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
        &msgarray[npkts].controlp : NULL;

    nextrecord = m->m_nextpkt;

    if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
        error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
        if (error == ERESTART) {
        } else if (error != 0) {

    if (m != NULL && m->m_type == MT_CONTROL) {
        error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);

    if (m->m_pkthdr.len == 0) {
        printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(m),

    /*
     * Loop to copy the mbufs of the current record
     * Support zero length packets
     */
    while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
        if (m->m_len == 0) {
            panic("%p m_len zero", m);
        if (m->m_type == 0) {
            panic("%p m_type zero", m);
        /*
         * Clip to the residual length
         */
        if (len > m->m_len) {
        /*
         * Copy the mbufs via the uio or delay the copy
         * Sockbuf must be consistent here (points to current mbuf,
         * it points to next record) when we drop priority;
         * we must note any additions to the sockbuf when we
         * block interrupts again.
         */
        if (len > 0 && can_delay == 0) {
            socket_unlock(so, 0);
            error = uiomove(mtod(m, caddr_t), (int)len, auio);
            delayed_copy_len += len;

        if (len == m->m_len) {
            /*
             * m was entirely copied
             */
            sbfree(&so->so_rcv, m);
            nextrecord = m->m_nextpkt;
            m->m_nextpkt = NULL;

            /*
             * Set the first packet to the head of the free list
             */
            if (free_list == NULL) {
            /*
             * Link current packet to tail of free list
             */
            if (free_tail != NULL) {
                free_tail->m_nextpkt = m;
            /*
             * Link current mbuf to last mbuf of current packet
             */
            /*
             * Move next buf to head of socket buffer
             */
            so->so_rcv.sb_mb = m = ml->m_next;

                m->m_nextpkt = nextrecord;
                if (nextrecord == NULL) {
                    so->so_rcv.sb_lastrecord = m;
                so->so_rcv.sb_mb = nextrecord;
                SB_EMPTY_FIXUP(&so->so_rcv);
            SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
            SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
            /*
             * Stop the loop on partial copy
             */

#ifdef MORE_LOCKING_DEBUG
    if (so->so_usecount <= 1) {
        panic("%s: after big while so=%llx ref=%d on socket\n",
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);

    /*
     * Tell the caller we made a partial copy
     */
        if (so->so_options & SO_DONTTRUNC) {
            /*
             * Copyout first the freelist then the partial mbuf
             */
            socket_unlock(so, 0);
            if (delayed_copy_len) {
                error = sodelayed_copy_list(so, msgarray,
                    uiocnt, &free_list, &delayed_copy_len);
                error = uiomove(mtod(m, caddr_t), (int)len,
            so->so_rcv.sb_cc -= len;
            flags |= MSG_RCVMORE;
            (void) sbdroprecord(&so->so_rcv);
            nextrecord = so->so_rcv.sb_mb;

        so->so_rcv.sb_mb = nextrecord;
        /*
         * First part is an inline SB_EMPTY_FIXUP().  Second
         * part makes sure sb_lastrecord is up-to-date if
         * there is still data in the socket buffer.
         */
        if (so->so_rcv.sb_mb == NULL) {
            so->so_rcv.sb_mbtail = NULL;
            so->so_rcv.sb_lastrecord = NULL;
        } else if (nextrecord->m_nextpkt == NULL) {
            so->so_rcv.sb_lastrecord = nextrecord;
        SB_MB_CHECK(&so->so_rcv);
        SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");

    /*
     * We can continue to the next packet as long as:
     * - We haven't exhausted the uio array
     * - There was no error
     * - A packet was not truncated
     * - We can still receive more data
     */
    if (npkts < uiocnt && error == 0 &&
        (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
        (so->so_state & SS_CANTRCVMORE) == 0) {
        sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */

    if (flagsp != NULL) {

    /*
     * pru_rcvd may cause more data to be received if the socket lock
     * is dropped so we set MSG_HAVEMORE now based on what we know.
     * That way the caller won't be surprised if it receives less data
     * than requested.
     */
    if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
        flags |= MSG_HAVEMORE;
    if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
        (*pr->pr_usrreqs->pru_rcvd)(so, flags);

    sbunlock(&so->so_rcv, FALSE);   /* will unlock socket */
    socket_unlock(so, 1);

    if (delayed_copy_len) {
        error = sodelayed_copy_list(so, msgarray, uiocnt,
            &free_list, &delayed_copy_len);

    /*
     * Amortize the cost of freeing the mbufs
     */
    if (free_list != NULL) {
        m_freem_list(free_list);
    if (free_others != NULL) {
        m_freem_list(free_others);

    KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
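/*
 * Illustrative sketch (not part of the original source): the unbatched
 * user-space pattern that soreceive_list() optimizes, i.e. draining a
 * SOCK_DGRAM socket one datagram at a time without blocking.  "sock",
 * "handle" (a hypothetical callback), "buf" and "buflen" are assumed.
 */
#if 0
#include <sys/socket.h>
#include <errno.h>

static void
drain_datagrams(int sock, void (*handle)(const void *, size_t),
    void *buf, size_t buflen)
{
    for (;;) {
        ssize_t n = recv(sock, buf, buflen, MSG_DONTWAIT);
        if (n < 0) {
            if (errno == EWOULDBLOCK || errno == EAGAIN) {
                break;  /* queue is empty */
            }
            break;      /* real error; caller can re-check */
        }
        handle(buf, (size_t)n);
    }
}
#endif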
so_statistics_event_to_nstat_event(int64_t *input_options,
    uint64_t *nstat_event)
{
    switch (*input_options) {
    case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
        *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
    case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
        *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
#if (DEBUG || DEVELOPMENT)
    case SO_STATISTICS_EVENT_RESERVED_1:
        *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
    case SO_STATISTICS_EVENT_RESERVED_2:
        *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
#endif /* (DEBUG || DEVELOPMENT) */
/*
 * Returns:	0			Success
 *
 *	<pru_shutdown>:EINVAL
 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:EHOSTUNREACH[TCP]
 *	<pru_shutdown>:ENETUNREACH[TCP]
 *	<pru_shutdown>:ENETDOWN[TCP]
 *	<pru_shutdown>:ENOMEM[TCP]
 *	<pru_shutdown>:EACCES[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_shutdown>:???		[other protocol families]
 */
int
soshutdown(struct socket *so, int how)
{
    KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);

        (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
        error = soshutdownlock(so, how);
        socket_unlock(so, 1);

    KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
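/*
 * Illustrative sketch (not part of the original source): the user-space
 * half-close that ends up in soshutdown()/soshutdownlock_final().  Shutting
 * down the write side sends a FIN on TCP while the read side stays usable
 * until the peer closes.  "sock" is an assumed connected socket.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static int
half_close_and_drain(int sock, char *buf, size_t buflen)
{
    if (shutdown(sock, SHUT_WR) == -1) {    /* no more writes from us */
        return -1;
    }
    while (read(sock, buf, buflen) > 0) {
        /* keep reading whatever the peer still has to say */
    }
    return close(sock);
}
#endif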
int
soshutdownlock_final(struct socket *so, int how)
{
    struct protosw *pr = so->so_proto;

    sflt_notify(so, sock_evt_shutdown, &how);

    if (how != SHUT_WR) {
        if ((so->so_state & SS_CANTRCVMORE) != 0) {
            /* read already shut down */

    if (how != SHUT_RD) {
        if ((so->so_state & SS_CANTSENDMORE) != 0) {
            /* write already shut down */
        error = (*pr->pr_usrreqs->pru_shutdown)(so);

    KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);

int
soshutdownlock(struct socket *so, int how)
{
    /*
     * A content filter may delay the actual shutdown until it
     * has processed the pending data
     */
    if (so->so_flags & SOF_CONTENT_FILTER) {
        error = cfil_sock_shutdown(so, &how);
        if (error == EJUSTRETURN) {
        } else if (error != 0) {
#endif /* CONTENT_FILTER */

    error = soshutdownlock_final(so, how);
void
sowflush(struct socket *so)
{
    struct sockbuf *sb = &so->so_snd;

    /*
     * Obtain lock on the socket buffer (SB_LOCK).  This is required
     * to prevent the socket buffer from being unexpectedly altered
     * while it is used by another thread in socket send/receive.
     *
     * sblock() must not fail here, hence the assertion.
     */
    (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
    VERIFY(sb->sb_flags & SB_LOCK);

    sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
    sb->sb_flags |= SB_DROP;
    sb->sb_upcall = NULL;
    sb->sb_upcallarg = NULL;

    sbunlock(sb, TRUE);     /* keep socket locked */

    selthreadclear(&sb->sb_sel);
void
sorflush(struct socket *so)
{
    struct sockbuf *sb = &so->so_rcv;
    struct protosw *pr = so->so_proto;
    lck_mtx_t *mutex_held;

    /*
     * XXX: This code is currently commented out, because we may get here
     * as part of sofreelastref(), and at that time, pr_getlock() may no
     * longer be able to return us the lock; this will be fixed in future.
     */
    if (so->so_proto->pr_getlock != NULL) {
        mutex_held = (*so->so_proto->pr_getlock)(so, 0);
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

    sflt_notify(so, sock_evt_flush_read, NULL);

    /*
     * Obtain lock on the socket buffer (SB_LOCK).  This is required
     * to prevent the socket buffer from being unexpectedly altered
     * while it is used by another thread in socket send/receive.
     *
     * sblock() must not fail here, hence the assertion.
     */
    (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
    VERIFY(sb->sb_flags & SB_LOCK);

    /*
     * Copy only the relevant fields from "sb" to "asb" which we
     * need for sbrelease() to function.  In particular, skip
     * sb_sel as it contains the wait queue linkage, which would
     * wreak havoc if we were to issue selthreadclear() on "asb".
     * Make sure to not carry over SB_LOCK in "asb", as we need
     * to acquire it later as part of sbrelease().
     */
    bzero(&asb, sizeof(asb));
    asb.sb_cc = sb->sb_cc;
    asb.sb_hiwat = sb->sb_hiwat;
    asb.sb_mbcnt = sb->sb_mbcnt;
    asb.sb_mbmax = sb->sb_mbmax;
    asb.sb_ctl = sb->sb_ctl;
    asb.sb_lowat = sb->sb_lowat;
    asb.sb_mb = sb->sb_mb;
    asb.sb_mbtail = sb->sb_mbtail;
    asb.sb_lastrecord = sb->sb_lastrecord;
    asb.sb_so = sb->sb_so;
    asb.sb_flags = sb->sb_flags;
    asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
    asb.sb_flags |= SB_DROP;

    /*
     * Ideally we'd bzero() these and preserve the ones we need;
     * but to do that we'd need to shuffle things around in the
     * sockbuf, and we can't do it now because there are KEXTS
     * that are directly referring to the socket structure.
     *
     * Setting SB_DROP acts as a barrier to prevent further appends.
     * Clearing SB_SEL is done for selthreadclear() below.
     */
    sb->sb_mbtail = NULL;
    sb->sb_lastrecord = NULL;
    sb->sb_timeo.tv_sec = 0;
    sb->sb_timeo.tv_usec = 0;
    sb->sb_upcall = NULL;
    sb->sb_upcallarg = NULL;
    sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
    sb->sb_flags |= SB_DROP;

    sbunlock(sb, TRUE);     /* keep socket locked */

    /*
     * Note that selthreadclear() is called on the original "sb" and
     * not the local "asb" because of the way wait queue linkage is
     * implemented.  Given that selwakeup() may be triggered, SB_SEL
     * should no longer be set (cleared above.)
     */
    selthreadclear(&sb->sb_sel);

    if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
        (*pr->pr_domain->dom_dispose)(asb.sb_mb);
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Returns:	0			Success
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
    /*
     * If the user gives us more than we wanted, we ignore it,
     * but if we don't get the minimum length the caller
     * wants, we return EINVAL.  On success, sopt->sopt_valsize
     * is set to however much we actually retrieved.
     */
    if ((valsize = sopt->sopt_valsize) < minlen) {
    if (valsize > len) {
        sopt->sopt_valsize = valsize = len;

    if (sopt->sopt_p != kernproc) {
        return copyin(sopt->sopt_val, buf, valsize);

    bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
/*
 * sooptcopyin_timeval
 *	Copy in a timeval value into tv_p, and take into account whether the
 *	calling process is 64-bit or 32-bit.  Moved the sanity checking
 *	code here so that we can verify the 64-bit tv_sec value before we lose
 *	the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
 */
static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
{
	int error;

	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval tv64;

		if (sopt->sopt_valsize < sizeof(tv64)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
			if (error != 0) {
				return error;
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
			    sizeof(tv64));
		}
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
			return EDOM;
		}

		tv_p->tv_sec = tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		struct user32_timeval tv32;

		if (sopt->sopt_valsize < sizeof(tv32)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
			if (error != 0) {
				return error;
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
			    sizeof(tv32));
		}

		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
			return EDOM;
		}

		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
	return 0;
}
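
/*
 * Illustrative, non-normative sketch of the user-space side of the checks
 * above: a timeout whose tv_usec lies outside [0, 1000000) is rejected.
 * Ordinary POSIX calls only; kept under #if 0 so it is never compiled here.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>
#include <stdio.h>

static int
set_recv_timeout(int fd, time_t sec, suseconds_t usec)
{
	struct timeval tv = { .tv_sec = sec, .tv_usec = usec };

	/* an out-of-range tv_usec surfaces as EDOM from the kernel checks */
	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1) {
		perror("setsockopt(SO_RCVTIMEO)");
		return -1;
	}
	return 0;
}
#endif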
static int
soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
    boolean_t ignore_delegate)
{
	kauth_cred_t cred = NULL;
	proc_t ep = PROC_NULL;
	uid_t uid;
	int error = 0;

	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
		ep = proc_find(so->e_pid);
		if (ep != PROC_NULL) {
			cred = kauth_cred_proc_ref(ep);
		}
	}

	uid = kauth_cred_getuid(cred ? cred : so->so_cred);

	/* uid is 0 for root */
	if (uid != 0 || !allow_root) {
		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
	}

	if (cred) {
		kauth_cred_unref(&cred);
	}
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
/*
 * Returns:	0		Success
 *	sooptcopyin:EINVAL
 *	sooptcopyin:EFAULT
 *	sooptcopyin_timeval:EINVAL
 *	sooptcopyin_timeval:EFAULT
 *	sooptcopyin_timeval:EDOM
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	sflt_attach_private:???		[whatever a filter author chooses]
 *	<sf_setoption>:???		[whatever a filter author chooses]
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	int64_t long_optval;
	struct linger l;
	struct timeval tv;

	if (sopt->sopt_dir != SOPT_SET) {
		sopt->sopt_dir = SOPT_SET;
	}

	if (dolock) {
		socket_lock(so, 1);
	}

	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
		/* the socket has been shutdown, no more sockopt's */
		error = EINVAL;
		goto out;
	}

	error = sflt_setsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0) {
			goto out;
		}

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
			if (error != 0) {
				goto out;
			}

			so->so_linger = (sopt->sopt_name == SO_LINGER) ?
			    l.l_linger : l.l_linger * hz;
			if (l.l_onoff != 0) {
				so->so_options |= SO_LINGER;
			} else {
				so->so_options &= ~SO_LINGER;
			}
			break;

		case SO_USELOOPBACK:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval) {
				so->so_options |= sopt->sopt_name;
			} else {
				so->so_options &= ~sopt->sopt_name;
			}
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto out;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF: {
				struct sockbuf *sb =
				    (sopt->sopt_name == SO_SNDBUF) ?
				    &so->so_snd : &so->so_rcv;
				if (sbreserve(sb, (u_int32_t)optval) == 0) {
					error = ENOBUFS;
					goto out;
				}
				sb->sb_flags |= SB_USRSIZE;
				sb->sb_flags &= ~SB_AUTOSIZE;
				sb->sb_idealsize = (u_int32_t)optval;
				break;
			}
			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT: {
				int space = sbspace(&so->so_snd);
				u_int32_t hiwat = so->so_snd.sb_hiwat;

				if (so->so_snd.sb_flags & SB_UNIX) {
					struct unpcb *unp =
					    (struct unpcb *)(so->so_pcb);
					if (unp != NULL &&
					    unp->unp_conn != NULL) {
						hiwat += unp->unp_conn->unp_cc;
					}
				}

				so->so_snd.sb_lowat =
				    (optval > hiwat) ? hiwat : optval;

				if (space >= so->so_snd.sb_lowat) {
					sowwakeup(so);
				}
				break;
			}
			case SO_RCVLOWAT: {
				int64_t data_len;

				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				data_len = so->so_rcv.sb_cc
				    - so->so_rcv.sb_ctl;
				if (data_len >= so->so_rcv.sb_lowat) {
					sorwakeup(so);
				}
				break;
			}
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin_timeval(sopt, &tv);
			if (error != 0) {
				goto out;
			}

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = tv;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = tv;
				break;
			}
			break;

		case SO_NKE: {
			struct so_nke nke;

			error = sooptcopyin(sopt, &nke, sizeof(nke),
			    sizeof(nke));
			if (error != 0) {
				goto out;
			}

			error = sflt_attach_internal(so, nke.nke_handle);
			break;
		}

		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_NOSIGPIPE;
			} else {
				so->so_flags &= ~SOF_NOSIGPIPE;
			}
			break;

		case SO_NOADDRERR:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_NOADDRAVAIL;
			} else {
				so->so_flags &= ~SOF_NOADDRAVAIL;
			}
			break;

		case SO_REUSESHAREUID:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_REUSESHAREUID;
			} else {
				so->so_flags &= ~SOF_REUSESHAREUID;
			}
			break;

		case SO_NOTIFYCONFLICT:
			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_NOTIFYCONFLICT;
			} else {
				so->so_flags &= ~SOF_NOTIFYCONFLICT;
			}
			break;

		case SO_RESTRICTIONS:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}

			error = so_set_restrictions(so, optval);
			break;

		case SO_AWDL_UNRESTRICTED:
			if (SOCK_DOM(so) != PF_INET &&
			    SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_AWDL, false, false);
				if (error == 0) {
					inp_set_awdl_unrestricted(
						sotoinpcb(so));
				}
			} else {
				inp_clear_awdl_unrestricted(sotoinpcb(so));
			}
			break;

		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0 &&
			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_INTCOPROC, false, false);
				if (error == 0) {
					inp_set_intcoproc_allowed(
						sotoinpcb(so));
				}
			} else if (optval == 0) {
				inp_clear_intcoproc_allowed(sotoinpcb(so));
			}
			break;

		case SO_UPCALLCLOSEWAIT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_UPCALLCLOSEWAIT;
			} else {
				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
			}
			break;

		case SO_RANDOMPORT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval != 0) {
				so->so_flags |= SOF_BINDRANDOMPORT;
			} else {
				so->so_flags &= ~SOF_BINDRANDOMPORT;
			}
			break;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
			    sizeof(sonpx));
			if (error != 0) {
				goto out;
			}
			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
				error = EINVAL;
				goto out;
			}
			/*
			 * Only one bit defined for now
			 */
			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
				if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
					so->so_flags |= SOF_NPX_SETOPTSHUT;
				} else {
					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
				}
			}
			break;
		}

		case SO_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
				error = so_set_net_service_type(so, netsvc);
				goto out;
			}
			error = so_set_traffic_class(so, optval);
			if (error != 0) {
				goto out;
			}
			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
			break;
		}

		case SO_RECV_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
			} else {
				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
			}
			break;
		}

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG: {
			struct so_tcdbg so_tcdbg;

			error = sooptcopyin(sopt, &so_tcdbg,
			    sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
			if (error != 0) {
				goto out;
			}
			error = so_set_tcdbg(so, &so_tcdbg);
			break;
		}
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
			if (error != 0) {
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
			} else {
				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
			}
			break;

#if (DEVELOPMENT || DEBUG)
		case SO_DEFUNCTIT:
			error = sosetdefunct(current_proc(), so, 0, FALSE);
			if (error == 0) {
				error = sodefunct(current_proc(), so, 0);
			}
			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_DEFUNCTOK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
				if (error == 0) {
					error = EBADF;
				}
				goto out;
			}
			/*
			 * Any process can set SO_DEFUNCTOK (clear
			 * SOF_NODEFUNCT), but only root can clear
			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
			 */
			if (optval == 0 &&
			    kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			if (optval) {
				so->so_flags &= ~SOF_NODEFUNCT;
			} else {
				so->so_flags |= SOF_NODEFUNCT;
			}

			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				char s[MAX_IPv6_STR_LEN];
				char d[MAX_IPv6_STR_LEN];
				struct inpcb *inp = sotoinpcb(so);

				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
				    "[%s %s:%d -> %s:%d] is now marked "
				    "as %seligible for "
				    "defunct\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    (SOCK_TYPE(so) == SOCK_STREAM) ?
				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
				    ((SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_laddr.s_addr :
				    (void *)&inp->in6p_laddr), s, sizeof(s)),
				    ntohs(inp->in6p_lport),
				    inet_ntop(SOCK_DOM(so),
				    (SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_faddr.s_addr :
				    (void *)&inp->in6p_faddr, d, sizeof(d)),
				    ntohs(inp->in6p_fport),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			} else {
				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
				    "is now marked as %seligible for "
				    "defunct\n",
				    __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    SOCK_DOM(so), SOCK_TYPE(so),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			}
			break;

		case SO_ISDEFUNCT:
			/* This option is not settable */
			error = EINVAL;
			break;

		case SO_OPPORTUNISTIC:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error == 0) {
				error = so_set_opportunistic(so, optval);
			}
			break;

		case SO_FLUSH:
			/* This option is handled by lower layer(s) */
			error = 0;
			break;

		case SO_RECV_ANYIF:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error == 0) {
				error = so_set_recv_anyif(so, optval);
			}
			break;

		case SO_TRAFFIC_MGT_BACKGROUND: {
			/* This option is handled by lower layer(s) */
			error = 0;
			break;
		}

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_set(so, sopt);
			break;
#endif /* FLOW_DIVERT */

		case SO_DELEGATED:
			if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval))) != 0) {
				break;
			}

			error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
			break;

		case SO_DELEGATED_UUID: {
			uuid_t euuid;

			if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
			    sizeof(euuid))) != 0) {
				break;
			}

			error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
			break;
		}

		case SO_NECP_ATTRIBUTES:
			error = necp_set_socket_attributes(so, sopt);
			break;

		case SO_NECP_CLIENTUUID: {
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			struct inpcb *inp = sotoinpcb(so);
			if (!uuid_is_null(inp->necp_client_uuid)) {
				// Clear out the old client UUID if present
				necp_inpcb_remove_cb(inp);
			}

			error = sooptcopyin(sopt, &inp->necp_client_uuid,
			    sizeof(uuid_t), sizeof(uuid_t));
			if (error != 0) {
				goto out;
			}

			if (uuid_is_null(inp->necp_client_uuid)) {
				error = EINVAL;
				goto out;
			}

			pid_t current_pid = proc_pid(current_proc());
			error = necp_client_register_socket_flow(current_pid,
			    inp->necp_client_uuid, inp);
			if (error != 0) {
				uuid_clear(inp->necp_client_uuid);
				goto out;
			}

			if (inp->inp_lport != 0) {
				// There is a bound local port, so this is not
				// a fresh socket. Assign to the client.
				necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
			}
			break;
		}

		case SO_NECP_LISTENUUID: {
			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			struct inpcb *inp = sotoinpcb(so);
			if (!uuid_is_null(inp->necp_client_uuid)) {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyin(sopt, &inp->necp_client_uuid,
			    sizeof(uuid_t), sizeof(uuid_t));
			if (error != 0) {
				goto out;
			}

			if (uuid_is_null(inp->necp_client_uuid)) {
				error = EINVAL;
				goto out;
			}

			error = necp_client_register_socket_listener(proc_pid(current_proc()),
			    inp->necp_client_uuid, inp);
			if (error != 0) {
				uuid_clear(inp->necp_client_uuid);
				goto out;
			}

			// Mark that the port registration is held by NECP
			inp->inp_flags2 |= INP2_EXTERNAL_PORT;
			break;
		}

		case SO_EXTENDED_BK_IDLE:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error == 0) {
				error = so_set_extended_bk_idle(so, optval);
			}
			break;

		case SO_MARK_CELLFALLBACK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_CELLFALLBACK;
			} else {
				so->so_flags1 |= SOF1_CELLFALLBACK;
			}
			break;

		case SO_STATISTICS_EVENT:
			error = sooptcopyin(sopt, &long_optval,
			    sizeof(long_optval), sizeof(long_optval));
			if (error != 0) {
				goto out;
			}
			u_int64_t nstat_event = 0;
			error = so_statistics_event_to_nstat_event(
				&long_optval, &nstat_event);
			if (error != 0) {
				goto out;
			}
			nstat_pcb_event(sotoinpcb(so), nstat_event);
			break;

		case SO_NET_SERVICE_TYPE: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			error = so_set_net_service_type(so, optval);
			break;
		}

		case SO_QOSMARKING_POLICY_OVERRIDE:
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
			if (error != 0) {
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
			} else {
				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
			}
			break;

		case SO_MPKL_SEND_INFO: {
			struct so_mpkl_send_info so_mpkl_send_info;

			error = sooptcopyin(sopt, &so_mpkl_send_info,
			    sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
			if (error != 0) {
				goto out;
			}
			uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
			so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;

			if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
				so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
			} else {
				so->so_flags1 |= SOF1_MPKL_SEND_INFO;
			}
			break;
		}

		case SO_WANT_KEV_SOCKET_CLOSED: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval == 0) {
				so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
			} else {
				so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
			}
			break;
		}

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) so->so_proto->pr_ctloutput(so, sopt);
		}
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
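
/*
 * Illustrative, non-normative sketch of a user-space caller of the
 * SO_LINGER handling above: the switch stores l_linger as given for
 * SO_LINGER and scales it by hz for SO_LINGER_SEC.  Ordinary POSIX
 * calls only; kept under #if 0 so it is never compiled here.
 */
#if 0
#include <sys/socket.h>
#include <string.h>

static int
enable_linger(int fd, int linger_time)
{
	struct linger l;

	memset(&l, 0, sizeof(l));
	l.l_onoff = 1;
	l.l_linger = linger_time;
	return setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
}
#endif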
/* Helper routines for getsockopt */
int
sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
{
	int error = 0;
	size_t valsize;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc) {
			error = copyout(buf, sopt->sopt_val, valsize);
		} else {
			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
		}
	}
	return error;
}
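
/*
 * Illustrative, non-normative sketch: a protocol-level getsockopt handler
 * returning an int via sooptcopyout().  "my_proto_getopt" and
 * "MYPROTO_OPT_FOO" are hypothetical names.  Kept under #if 0 so it is
 * never compiled here.
 */
#if 0
static int
my_proto_getopt(struct socket *so, struct sockopt *sopt)
{
	int optval;

	switch (sopt->sopt_name) {
	case MYPROTO_OPT_FOO:
		optval = 1;	/* in real code, computed from the pcb */
		/* always returns a value, possibly truncated, per above */
		return sooptcopyout(sopt, &optval, sizeof(optval));
	default:
		return ENOPROTOOPT;
	}
}
#endif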
static int
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
{
	int error = 0;
	size_t len;
	struct user64_timeval tv64 = {};
	struct user32_timeval tv32 = {};
	const void *val;
	size_t valsize;

	if (proc_is64bit(sopt->sopt_p)) {
		len = sizeof(tv64);
		tv64.tv_sec = tv_p->tv_sec;
		tv64.tv_usec = tv_p->tv_usec;
		val = &tv64;
	} else {
		len = sizeof(tv32);
		tv32.tv_sec = tv_p->tv_sec;
		tv32.tv_usec = tv_p->tv_usec;
		val = &tv32;
	}
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc) {
			error = copyout(val, sopt->sopt_val, valsize);
		} else {
			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
		}
	}
	return error;
}
/*
 * Returns:	0		Success
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	<sf_getoption>:???
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error = 0, optval;
	struct linger l;
	struct timeval tv;

	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;
	}

	if (dolock) {
		socket_lock(so, 1);
	}

	error = sflt_getsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0) {
			goto out;
		}

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof(l));
			break;

		case SO_USELOOPBACK:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof(optval));
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_NREAD:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int pkt_total = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m1->m_type == MT_DATA ||
					    m1->m_type == MT_HEADER ||
					    m1->m_type == MT_OOBDATA) {
						pkt_total += m1->m_len;
					}
					m1 = m1->m_next;
				}
				optval = pkt_total;
			} else {
				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
			}
			goto integer;

		case SO_NUMRCVPKT:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int cnt = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					cnt += 1;
					m1 = m1->m_nextpkt;
				}
				optval = cnt;
				goto integer;
			} else {
				error = ENOPROTOOPT;
				break;
			}

		case SO_NWRITE:
			optval = so->so_snd.sb_cc;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF: {
			u_int32_t hiwat = so->so_snd.sb_hiwat;

			if (so->so_snd.sb_flags & SB_UNIX) {
				struct unpcb *unp =
				    (struct unpcb *)(so->so_pcb);
				if (unp != NULL && unp->unp_conn != NULL) {
					hiwat += unp->unp_conn->unp_cc;
				}
			}

			optval = hiwat;
			goto integer;
		}
		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			error = sooptcopyout_timeval(sopt, &tv);
			break;

		case SO_NOSIGPIPE:
			optval = (so->so_flags & SOF_NOSIGPIPE);
			goto integer;

		case SO_NOADDRERR:
			optval = (so->so_flags & SOF_NOADDRAVAIL);
			goto integer;

		case SO_REUSESHAREUID:
			optval = (so->so_flags & SOF_REUSESHAREUID);
			goto integer;

		case SO_NOTIFYCONFLICT:
			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
			goto integer;

		case SO_RESTRICTIONS:
			optval = so_get_restrictions(so);
			goto integer;

		case SO_AWDL_UNRESTRICTED:
			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_awdl_unrestricted(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_intcoproc_allowed(
					sotoinpcb(so));
				goto integer;
			} else {
				error = EOPNOTSUPP;
			}
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
			goto integer;
#endif /* __APPLE_API_PRIVATE */

		case SO_RANDOMPORT:
			optval = (so->so_flags & SOF_BINDRANDOMPORT);
			goto integer;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx = {};

			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
			    SONPX_SETOPTSHUT : 0;
			sonpx.npx_mask = SONPX_MASK_VALID;

			error = sooptcopyout(sopt, &sonpx,
			    sizeof(struct so_np_extensions));
			break;
		}

		case SO_TRAFFIC_CLASS:
			optval = so->so_traffic_class;
			goto integer;

		case SO_RECV_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
			goto integer;

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG:
			error = sogetopt_tcdbg(so, sopt);
			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
			goto integer;

		case SO_DEFUNCTOK:
			optval = !(so->so_flags & SOF_NODEFUNCT);
			goto integer;

		case SO_ISDEFUNCT:
			optval = (so->so_flags & SOF_DEFUNCT);
			goto integer;

		case SO_OPPORTUNISTIC:
			optval = so_get_opportunistic(so);
			goto integer;

		case SO_FLUSH:
			/* This option is not gettable */
			error = EINVAL;
			break;

		case SO_RECV_ANYIF:
			optval = so_get_recv_anyif(so);
			goto integer;

		case SO_TRAFFIC_MGT_BACKGROUND:
			/* This option is handled by lower layer(s) */
			if (so->so_proto != NULL &&
			    so->so_proto->pr_ctloutput != NULL) {
				(void) so->so_proto->pr_ctloutput(so, sopt);
			}
			break;

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_get(so, sopt);
			break;
#endif /* FLOW_DIVERT */

		case SO_NECP_ATTRIBUTES:
			error = necp_get_socket_attributes(so, sopt);
			break;

		case SO_NECP_CLIENTUUID: {
			uuid_t *ncu;

			if (SOCK_DOM(so) == PF_MULTIPATH) {
				ncu = &mpsotomppcb(so)->necp_client_uuid;
			} else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				ncu = &sotoinpcb(so)->necp_client_uuid;
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
			break;
		}

		case SO_NECP_LISTENUUID: {
			uuid_t *nlu;

			if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
				if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
					nlu = &sotoinpcb(so)->necp_client_uuid;
				} else {
					error = ENOENT;
					goto out;
				}
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
			break;
		}

#if CONTENT_FILTER
		case SO_CFIL_SOCK_ID: {
			cfil_sock_id_t sock_id;

			sock_id = cfil_sock_id_from_socket(so);

			error = sooptcopyout(sopt, &sock_id,
			    sizeof(cfil_sock_id_t));
			break;
		}
#endif /* CONTENT_FILTER */

		case SO_EXTENDED_BK_IDLE:
			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
			goto integer;

		case SO_MARK_CELLFALLBACK:
			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
			    ? 1 : 0;
			goto integer;

		case SO_NET_SERVICE_TYPE: {
			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
				optval = so->so_netsvctype;
			} else {
				optval = NET_SERVICE_TYPE_BE;
			}
			goto integer;
		}

		case SO_NETSVC_MARKING_LEVEL:
			optval = so_get_netsvc_marking_level(so);
			goto integer;

		case SO_MPKL_SEND_INFO: {
			struct so_mpkl_send_info so_mpkl_send_info;

			uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
			so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
			error = sooptcopyout(sopt, &so_mpkl_send_info,
			    sizeof(struct so_mpkl_send_info));
			break;
		}

		default:
			error = ENOPROTOOPT;
			break;
		}
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}
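
/*
 * Illustrative, non-normative user-space counterpart of the get path
 * above: SO_ERROR is commonly read after a non-blocking connect() becomes
 * writable.  Ordinary POSIX calls only; kept under #if 0 so it is never
 * compiled here.
 */
#if 0
#include <sys/socket.h>

static int
pending_connect_error(int fd)
{
	int err = 0;
	socklen_t len = sizeof(err);

	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == -1) {
		return -1;
	}
	return err;	/* 0 means the connect succeeded */
}
#endif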
/*
 * The size limits on our soopt_getm are different from those on FreeBSD.
 * We limit the size of options to MCLBYTES.  This will have to change
 * if we need to define options that need more space than MCLBYTES.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;
	int how;

	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
		return EMSGSIZE;
	}

	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (m == NULL) {
		return ENOBUFS;
	}
	if (sopt_size > MLEN) {
		MCLGET(m, how);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, how);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(*mp);
				m_freem(m);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}
/* copyin sopt data into mbuf chain */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return 0;
	}
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return error;
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
			    mtod(m, char *), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		m = m->m_next;
	}
	/* should be allocated with enough space at ip6_sooptmcopyin() */
	if (m != NULL) {
		panic("soopt_mcopyin");
		/* NOTREACHED */
	}
	return 0;
}
/* copyout mbuf chain data into soopt */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return 0;
	}
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return error;
			}
		} else {
			bcopy(mtod(m, char *),
			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return EINVAL;
	}
	sopt->sopt_valsize = valsize;
	return 0;
}
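
/*
 * Minimal, non-normative sketch of how a caller might pair these helpers
 * (the IPv6 option path referenced in the panic string above follows the
 * same allocate-then-copy pattern); "copy_option_into_mbufs" is an
 * illustrative name.  Kept under #if 0 so it is never compiled here.
 */
#if 0
static int
copy_option_into_mbufs(struct sockopt *sopt)
{
	struct mbuf *m = NULL;
	int error;

	error = soopt_getm(sopt, &m);		/* allocate the chain */
	if (error != 0) {
		return error;
	}
	error = soopt_mcopyin(sopt, m);		/* fill it from sopt_val */
	if (error != 0) {
		return error;			/* chain freed on error */
	}
	/* ... use the chain ... */
	m_freem(m);
	return 0;
}
#endif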
void
sohasoutofband(struct socket *so)
{
	if (so->so_pgid < 0) {
		gsignal(-so->so_pgid, SIGURG);
	} else if (so->so_pgid > 0) {
		proc_signal(so->so_pgid, SIGURG);
	}
	selwakeup(&so->so_rcv.sb_sel);
	if (so->so_rcv.sb_flags & SB_KNOTE) {
		KNOTE(&so->so_rcv.sb_sel.si_note,
		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
	}
}
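
/*
 * Illustrative, non-normative receiver setup for the SIGURG delivery
 * above: the process (or process group, with a negative argument) that
 * owns the socket via F_SETOWN is the target of gsignal()/proc_signal().
 * Ordinary POSIX calls only; kept under #if 0 so it is never compiled here.
 */
#if 0
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void
on_urg(int sig)
{
	(void)sig;	/* handle the out-of-band notification */
}

static int
claim_oob_signals(int fd)
{
	signal(SIGURG, on_urg);
	return fcntl(fd, F_SETOWN, getpid());
}
#endif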
int
sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
{
#pragma unused(cred)
	struct proc *p = current_proc();
	int revents = 0;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

	if (events & (POLLIN | POLLRDNORM)) {
		if (soreadable(so)) {
			revents |= events & (POLLIN | POLLRDNORM);
		}
	}

	if (events & (POLLOUT | POLLWRNORM)) {
		if (sowriteable(so)) {
			revents |= events & (POLLOUT | POLLWRNORM);
		}
	}

	if (events & (POLLPRI | POLLRDBAND)) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			revents |= events & (POLLPRI | POLLRDBAND);
		}
	}

	if (revents == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_rcv.sb_flags |= SB_SEL;
			selrecord(p, &so->so_rcv.sb_sel, wql);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_snd.sb_flags |= SB_SEL;
			selrecord(p, &so->so_snd.sb_sel, wql);
		}
	}

	socket_unlock(so, 1);
	return revents;
}
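
/*
 * Illustrative, non-normative user-space view of the poll() support above:
 * POLLPRI/POLLRDBAND report the out-of-band mark.  Ordinary POSIX calls
 * only; kept under #if 0 so it is never compiled here.
 */
#if 0
#include <poll.h>

static int
wait_readable_or_oob(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
	int n = poll(&pfd, 1, timeout_ms);

	if (n > 0 && (pfd.revents & POLLPRI)) {
		return 2;	/* at or before the OOB mark */
	}
	return n;		/* 1 readable, 0 timeout, -1 error */
}
#endif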
int
soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)fp->fp_glob->fg_data;
	int result;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_filtid = EVFILTID_SOREAD;
		break;
	case EVFILT_WRITE:
		kn->kn_filtid = EVFILTID_SOWRITE;
		break;
	case EVFILT_SOCK:
		kn->kn_filtid = EVFILTID_SCK;
		break;
	case EVFILT_EXCEPT:
		kn->kn_filtid = EVFILTID_SOEXCEPT;
		break;
	default:
		socket_unlock(so, 1);
		knote_set_error(kn, EINVAL);
		return 0;
	}

	/*
	 * call the appropriate sub-filter attach
	 * with the socket still locked
	 */
	result = knote_fops(kn)->f_attach(kn, kev);

	socket_unlock(so, 1);

	return result;
}
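
/*
 * Illustrative, non-normative user-space registration that exercises the
 * filter dispatch above.  Standard kqueue API only; kept under #if 0 so it
 * is never compiled here.
 */
#if 0
#include <sys/event.h>
#include <sys/time.h>
#include <stddef.h>

static int
watch_socket(int kq, int fd)
{
	struct kevent kev[2];

	EV_SET(&kev[0], fd, EVFILT_READ, EV_ADD, 0, 0, NULL);
	EV_SET(&kev[1], fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
	return kevent(kq, kev, 2, NULL, 0, NULL);
}
#endif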
static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int retval = 0;
	int64_t data = 0;

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter.  This allows calling listen()
		 * after registering the kqueue EVFILT_READ.
		 */
		retval = !TAILQ_EMPTY(&so->so_comp);
		goto out;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data.  We therefore exclude any
	 * control bytes.
	 */
	data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			data -= so->so_oobmark;
			retval = 1;
			goto out;
		}
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		retval = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		retval = 1;
		goto out;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * While the `data` field is the amount of data to read,
	 * 0-sized packets need to wake up the kqueue, see 58140856,
	 * so we need to take control bytes into account too.
	 */
	retval = (so->so_rcv.sb_cc >= lowwat);

out:
	if (retval && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return retval;
}
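
/*
 * Illustrative, non-normative sketch of registering EVFILT_READ with
 * NOTE_LOWAT: the kevent "data" field becomes kn_sdata above and is
 * clamped to the receive buffer's high-water mark.  Standard kqueue API
 * only; kept under #if 0 so it is never compiled here.
 */
#if 0
#include <sys/event.h>
#include <sys/time.h>
#include <stddef.h>
#include <stdint.h>

static int
watch_readable_lowat(int kq, int fd, intptr_t lowat)
{
	struct kevent kev;

	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	return kevent(kq, &kev, 1, NULL, 0, NULL);
}
#endif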
static int
filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;

	/* socket locked */

	/*
	 * If the caller explicitly asked for OOB results (e.g. poll())
	 * from EVFILT_READ, then save that off in the hookid field
	 * and reserve the kn_flags EV_OOBAND bit for output only.
	 */
	if (kn->kn_filter == EVFILT_READ &&
	    kn->kn_flags & EV_OOBAND) {
		kn->kn_flags &= ~EV_OOBAND;
		kn->kn_hook32 = EV_OOBAND;
	} else {
		kn->kn_hook32 = 0;
	}
	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
		so->so_rcv.sb_flags |= SB_KNOTE;
	}

	/* indicate if event is already fired */
	return filt_soread_common(kn, NULL, so);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;

	socket_lock(so, 1);
	if (so->so_rcv.sb_flags & SB_KNOTE) {
		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
			so->so_rcv.sb_flags &= ~SB_KNOTE;
		}
	}
	socket_unlock(so, 1);
}

static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
	int retval;

	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
		socket_lock(so, 1);
	}

	retval = filt_soread_common(kn, NULL, so);

	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
		socket_unlock(so, 1);
	}

	return retval;
}
static int
filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
	int retval;

	socket_lock(so, 1);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	/* determine if changes result in fired events */
	retval = filt_soread_common(kn, NULL, so);

	socket_unlock(so, 1);

	return retval;
}

static int
filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
	int retval;

	socket_lock(so, 1);
	retval = filt_soread_common(kn, kev, so);
	socket_unlock(so, 1);

	return retval;
}
static int
so_wait_for_if_feedback(struct socket *so)
{
	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
	    (so->so_state & SS_ISCONNECTED)) {
		struct inpcb *inp = sotoinpcb(so);
		if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
			return 1;
		}
	}
	return 0;
}
static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
	int ret = 0;
	int64_t data = sbspace(&so->so_snd);

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		ret = 1;
		goto out;
	}

	if (so->so_error) {     /* temporary udp error */
		ret = 1;
		goto out;
	}

	if (!socanwrite(so)) {
		ret = 0;
		goto out;
	}

	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		ret = 1;
		goto out;
	}

	int64_t lowwat = so->so_snd.sb_lowat;

	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_snd.sb_hiwat) {
			lowwat = so->so_snd.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	if (data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			} else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			} else {
				ret = 1;
				goto out;
			}
		} else {
			ret = 1;
		}
	}
	if (so_wait_for_if_feedback(so)) {
		ret = 0;
	}

out:
	if (ret && kev) {
		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
static int
filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;

	/* socket locked */
	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
		so->so_snd.sb_flags |= SB_KNOTE;
	}

	/* determine if it is already fired */
	return filt_sowrite_common(kn, NULL, so);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;

	socket_lock(so, 1);
	if (so->so_snd.sb_flags & SB_KNOTE) {
		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
			so->so_snd.sb_flags &= ~SB_KNOTE;
		}
	}
	socket_unlock(so, 1);
}

static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
	int ret;

	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
		socket_lock(so, 1);
	}

	ret = filt_sowrite_common(kn, NULL, so);

	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
		socket_unlock(so, 1);
	}

	return ret;
}

static int
filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
	int ret;

	socket_lock(so, 1);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	/* determine if these changes result in a triggered event */
	ret = filt_sowrite_common(kn, NULL, so);

	socket_unlock(so, 1);

	return ret;
}

static int
filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
	int ret;

	socket_lock(so, 1);
	ret = filt_sowrite_common(kn, kev, so);
	socket_unlock(so, 1);

	return ret;
}
static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
	int ret = 0;
	int64_t data = 0;
	uint32_t level_trigger = 0;

	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO)) {
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
		}
	}
	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hook32 &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	if (so->so_error != 0) {
		ret = 1;
		data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		u_int32_t data32 = 0;
		get_sockev_state(so, &data32);
		data = data32;
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggered events that are already delivered */
	level_trigger &= kn->kn_hook32;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggered events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0) {
		ret = 1;
	}

	if (ret && kev) {
		/*
		 * Store the state of the events being delivered.  This
		 * state can be used to deliver level triggered events
		 * at least once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0) {
			kn->kn_hook32 |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);
		}

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND) {
			kn->kn_hook32 &= ~NOTE_RESUME;
		}
		if (kn->kn_fflags & NOTE_RESUME) {
			kn->kn_hook32 &= ~NOTE_SUSPEND;
		}

		knote_fill_kevent(kn, kev, data);
	}
	return ret;
}
static int
filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;

	/* socket locked */
	if (KNOTE_ATTACH(&so->so_klist, kn)) {
		so->so_flags |= SOF_KNOTE;
	}

	/* determine if event already fired */
	return filt_sockev_common(kn, NULL, so, 0);
}

static void
filt_sockdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;

	socket_lock(so, 1);

	if ((so->so_flags & SOF_KNOTE) != 0) {
		if (KNOTE_DETACH(&so->so_klist, kn)) {
			so->so_flags &= ~SOF_KNOTE;
		}
	}
	socket_unlock(so, 1);
}

static int
filt_sockev(struct knote *kn, long hint)
{
	int ret = 0, locked = 0;
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
	long ev_hint = (hint & SO_FILT_HINT_EV);

	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
		socket_lock(so, 1);
		locked = 1;
	}

	ret = filt_sockev_common(kn, NULL, so, ev_hint);

	if (locked) {
		socket_unlock(so, 1);
	}

	return ret;
}
/*
 * filt_socktouch - update event state
 */
static int
filt_socktouch(
	struct knote *kn,
	struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
	uint32_t changed_flags;
	int ret;

	socket_lock(so, 1);

	/* save off the [result] data and fflags */
	changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kev->fflags;

	/*
	 * Since we keep track of events that are already
	 * delivered, if any of those events are not requested
	 * anymore the state related to them can be reset
	 */
	kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);

	/* determine if we have events to deliver */
	ret = filt_sockev_common(kn, NULL, so, 0);

	socket_unlock(so, 1);

	return ret;
}

/*
 * filt_sockprocess - query event fired state and return data
 */
static int
filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
	int ret = 0;

	socket_lock(so, 1);

	ret = filt_sockev_common(kn, kev, so, 0);

	socket_unlock(so, 1);

	return ret;
}
void
get_sockev_state(struct socket *so, u_int32_t *statep)
{
	u_int32_t state = *(statep);

	/*
	 * If the state variable is already used by a previous event,
	 * reset it.
	 */
	if (state != 0) {
		return;
	}

	if (so->so_state & SS_ISCONNECTED) {
		state |= SOCKEV_CONNECTED;
	} else {
		state &= ~(SOCKEV_CONNECTED);
	}
	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
	*(statep) = state;
}
#define SO_LOCK_HISTORY_STR_LEN \
	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof(lock_history_str));
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += scnprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}

	return lock_history_str;
}
lck_mtx_t *
socket_getlock(struct socket *so, int flags)
{
	if (so->so_proto->pr_getlock != NULL) {
		return (*so->so_proto->pr_getlock)(so, flags);
	} else {
		return so->so_proto->pr_domain->dom_mtx;
	}
}

void
socket_lock(struct socket *so, int refcount)
{
	void *lr_saved;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount) {
			so->so_usecount++;
		}
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}
}

void
socket_lock_assert_owned(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
}

int
socket_try_lock(struct socket *so)
{
	lck_mtx_t *mtx;

	if (so->so_proto->pr_getlock != NULL) {
		mtx = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mtx = so->so_proto->pr_domain->dom_mtx;
	}

	return lck_mtx_try_lock(mtx);
}
void
socket_unlock(struct socket *so, int refcount)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = __builtin_return_address(0);

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p\n", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0) {
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}

/* Called with socket locked, will unlock socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}

void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* locks & take one reference on socket */
	socket_unlock(so, 0);   /* unlock only */
}

void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
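
/*
 * Minimal, non-normative sketch of the locking/refcount pattern
 * implemented above: take the lock and a use count, work on the socket,
 * then drop both; the last reference is reaped in socket_unlock() via
 * sofreelastref().  "with_socket_locked" is an illustrative name.  Kept
 * under #if 0 so it is never compiled here.
 */
#if 0
static void
with_socket_locked(struct socket *so, void (*work)(struct socket *))
{
	socket_lock(so, 1);	/* lock and take a reference */
	work(so);
	socket_unlock(so, 1);	/* unlock and drop the reference */
}
#endif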
/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters.  Caller must ensure to hold
 * the socket lock.
 */
void
somultipages(struct socket *so, boolean_t set)
{
	if (set) {
		so->so_flags |= SOF_MULTIPAGES;
	} else {
		so->so_flags &= ~SOF_MULTIPAGES;
	}
}

void
soif2kcl(struct socket *so, boolean_t set)
{
	if (set) {
		so->so_flags1 |= SOF1_IF_2KCL;
	} else {
		so->so_flags1 &= ~SOF1_IF_2KCL;
	}
}

int
so_isdstlocal(struct socket *so)
{
	struct inpcb *inp = (struct inpcb *)so->so_pcb;

	if (SOCK_DOM(so) == PF_INET) {
		return inaddr_local(inp->inp_faddr);
	} else if (SOCK_DOM(so) == PF_INET6) {
		return in6addr_local(&inp->in6p_faddr);
	}

	return 0;
}
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llx [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			goto done;
		}
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "is now eligible for defunct "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			goto done;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}

	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}
/*
 * Does not attempt to account for sockets that are delegated from
 * the current process
 */
int
so_set_extended_bk_idle(struct socket *so, int optval)
{
	int error = 0;

	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
	    SOCK_PROTO(so) != IPPROTO_TCP) {
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
		error = EOPNOTSUPP;
	} else if (optval == 0) {
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;

		soresume(current_proc(), so, 1);
	} else {
		struct proc *p = current_proc();
		struct fileproc *fp;
		int count = 0;

		/*
		 * Unlock socket to avoid lock ordering issue with
		 * the proc fd table lock
		 */
		socket_unlock(so, 0);

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			struct socket *so2;

			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so2 = (struct socket *)fp->fp_glob->fg_data;
			if (so != so2 &&
			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
				count++;
			}
			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
				break;
			}
		}
		proc_fdunlock(p);

		socket_lock(so, 0);

		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
			error = EBUSY;
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
			error = EBUSY;
		} else {
			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
		}
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
		    "%s marked for extended bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    "is" : "not");
	}

	return error;
}
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);

	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}

void
so_drain_extended_bk_idle(struct socket *so)
{
	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		/*
		 * Only penalize sockets that have outstanding data
		 */
		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
		}
	}
}

/*
 * Return value tells if socket is still in extended background idle
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return ret;
}
void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct fileproc *fp;
		struct socket *so;

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so = (struct socket *)fp->fp_glob->fg_data;
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}

__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
{
	int ret = 0;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (optval) {
			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
		} else {
			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
		}
	}

	return ret;
}

__private_extern__ int
so_get_recv_anyif(struct socket *so)
{
	int ret = 0;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
	}

	return ret;
}
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}

uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}

int
so_set_effective_pid(struct socket *so, int epid, struct proc *p,
    boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 * (See the usage sketch following this function.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid,
				    sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
			}
		}
#endif /* XNU_TARGET_OS_OSX */
	}
	if (so->so_proto != NULL &&
	    so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
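
/*
 * Illustrative sketch (not part of this file's build): delegating a
 * socket by pid from userspace.  This assumes the private SO_DELEGATED
 * socket option from <sys/socket.h> is the path into
 * so_set_effective_pid(); the option name and the use of a plain int
 * pid are assumptions, so treat this as a sketch only.
 *
 *	int epid = delegate_pid;   // pid of the app the traffic is done for
 *	if (setsockopt(fd, SOL_SOCKET, SO_DELEGATED,
 *	    &epid, sizeof(epid)) == -1) {
 *		// fails unless the caller owns the socket, targets its own
 *		// pid, or holds PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
 *	}
 */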

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p,
    boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)  (See the usage sketch following
	 * this function.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the
	 * same as the real process.
	 */
	if (so->so_proto != NULL &&
	    so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
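
/*
 * Illustrative sketch (not part of this file's build): delegating a
 * socket by UUID when the delegate's pid is not known.  This assumes
 * the private SO_DELEGATED_UUID socket option from <sys/socket.h> is
 * what routes into so_set_effective_uuid(); the option name is an
 * assumption, so treat this as a sketch only.
 *
 *	uuid_t euuid;              // executable UUID of the delegate,
 *	                           // obtained out of band
 *	if (setsockopt(fd, SOL_SOCKET, SO_DELEGATED_UUID,
 *	    euuid, sizeof(euuid)) == -1) {
 *		// fails unless the caller owns the socket, passes its own
 *		// UUID, or holds PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
 *	}
 */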

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide a longer event structure
	 * to post, depending on the event code.  (See the sketch
	 * following this function.)
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
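
/*
 * Illustrative sketch (not part of this file's build): posting a longer
 * netpolicy event, as described above.  The extended structure and the
 * event code below are hypothetical; the only requirement enforced by
 * netpolicy_post_msg() is that the payload begin with a
 * struct netpolicy_event_data header and that ev_datalen cover at least
 * that header.
 *
 *	struct my_netpolicy_event {                   // hypothetical
 *		struct netpolicy_event_data ev_data;  // must come first
 *		uint32_t ev_reason;                   // event-specific data
 *	} ev = { .ev_reason = 1 };
 *
 *	netpolicy_post_msg(MY_EV_CODE, &ev.ev_data, sizeof(ev));
 */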

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
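
/*
 * Illustrative sketch (not part of this file's build): how a userspace
 * observer could receive the socket-class kernel events posted above,
 * using the kernel event interface from <sys/kern_event.h>.  The filter
 * values mirror what socket_post_kev_msg() sets; error handling is
 * omitted and the buffer size is arbitrary.
 *
 *	int kev_fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code  = KEV_VENDOR_APPLE,
 *		.kev_class    = KEV_NETWORK_CLASS,
 *		.kev_subclass = KEV_SOCKET_SUBCLASS,
 *	};
 *	(void) ioctl(kev_fd, SIOCSKEVFILT, &req);
 *
 *	char buf[1024];
 *	ssize_t n = recv(kev_fd, buf, sizeof(buf), 0);
 *	struct kern_event_msg *msg = (struct kern_event_msg *)buf;
 *	// msg->event_code is e.g. KEV_SOCKET_CLOSED; the event payload
 *	// starts at msg->event_data.
 */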

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	FREE(socksa, M_SONAME);
	FREE(peersa, M_SONAME);