/*
 * Copyright (c) 1998-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio_internal.h>
#include <sys/kdebug.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#include <security/mac_framework.h>

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */
#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))

#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif
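/*
 * Worked example (editor's note, not part of the original source): with the
 * ROUNDUP() macro above, ROUNDUP(1500, 2048) computes (1500 + 2047) & ~2047,
 * i.e. 3547 masked down to 2048; a value already on the boundary, e.g.
 * ROUNDUP(4096, 2048), stays 4096.  The second argument must be a power of
 * two for the mask trick to hold.
 */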
/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t so_cache_hw;		/* High water mark for socache */
static u_int32_t so_cache_timeouts;	/* number of timeouts */
static u_int32_t so_cache_max_freed;	/* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;

static lck_grp_t *so_cache_mtx_grp;
static lck_attr_t *so_cache_mtx_attr;
static lck_grp_attr_t *so_cache_mtx_grp_attr;
static lck_mtx_t *so_cache_mtx;
#include <machine/limits.h>
static int filt_sorattach(struct knote *kn, struct kevent_internal_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
static int filt_sorprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev);

static int filt_sowattach(struct knote *kn, struct kevent_internal_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
static int filt_sowprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev);

static int filt_sockattach(struct knote *kn, struct kevent_internal_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
static int filt_sockprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};
SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

static int socket_zone = M_SOCKET;
so_gen_t so_gencnt;		/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST	NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST	NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)
int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger than 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */
int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */
extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);
/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC	1
#define SO_IDLE_BK_IDLE_TIME		600
#define SO_IDLE_BK_IDLE_RCV_HIWAT	131072

struct soextbkidlestat soextbkidlestat;
SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");
int so_set_extended_bk_idle(struct socket *, int);

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");
void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
	if (so_cache_mtx == NULL) {
		panic("%s: unable to allocate so_cache_mtx\n", __func__);
		/* NOTREACHED */
	}
	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zinit(so_cache_zone_element_size,
	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	socket_tclass_init();
#if MULTIPATH
	mp_pcbinit();
#endif /* MULTIPATH */
}
static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(so_cache_mtx);

		if (waitok) {
			*so = (struct socket *)zalloc(so_cache_zone);
		} else {
			*so = (struct socket *)zalloc_noblock(so_cache_zone);
		}

		if (*so == NULL) {
			return;
		}

		bzero((caddr_t)*so, sizeof(struct socket));

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(so_cache_mtx);
	}
}
void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}
static void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		(void) inp_update_policy(sotoinpcb(so));
	}
}
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
	}
}
static boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(so_cache_mtx);

	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		cached_sock_count--;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(so_cache_mtx);

	return rc;
}
/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
static struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone,
		    M_WAITOK);
		if (so != NULL) {
			bzero(so, sizeof(*so));
		}
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
		so->so_zone = socket_zone;

		/*
		 * Increment the socket allocation statistics
		 */
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);

#if CONFIG_MACF_SOCKET
		/* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return NULL;
		}
#endif /* MAC_SOCKET */
	}

	return so;
}
static int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;

	extern int tcpconsdebug;

	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	switch (SOCK_DOM(so)) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_ASYNC) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully
		 */
		so->so_state |= SS_NOFDREF;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return error;
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (proc_get_effective_thread_policy(current_thread(),
	    TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain, system or multipath sockets as
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
	case PF_MULTIPATH:
		break;
	default:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * type privilege).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * subsystem.
	 */

	*aso = so;

	return 0;
}
/*
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
	    PROC_NULL);
}
int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK
	 */
done:
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
/*
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	so_update_necp_policy(so, nam, NULL);

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}
*so
)
962 kauth_cred_unref(&so
->so_cred
);
964 /* Remove any filters */
968 cfil_sock_detach(so
);
969 #endif /* CONTENT_FILTER */
971 /* Delete the state allocated for msg queues on a socket */
972 if (so
->so_flags
& SOF_ENABLE_MSGS
) {
973 FREE(so
->so_msg_state
, M_TEMP
);
974 so
->so_msg_state
= NULL
;
976 VERIFY(so
->so_msg_state
== NULL
);
978 so
->so_gencnt
= OSIncrementAtomic64((SInt64
*)&so_gencnt
);
980 #if CONFIG_MACF_SOCKET
981 mac_socket_label_destroy(so
);
982 #endif /* MAC_SOCKET */
984 if (so
->so_flags1
& SOF1_CACHED_IN_SOCK_LAYER
) {
985 cached_sock_free(so
);
987 FREE_ZONE(so
, sizeof(*so
), so
->so_zone
);
/*
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	so_update_necp_policy(so, NULL, NULL);

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = backlog;
out:
	socket_unlock(so, 1);
	return error;
}
/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its clients sockets. The lock ordering is first to
 * acquire the client socket before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_incqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 */
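/*
 * Editor's sketch (not part of the original file) of the preflight pattern
 * described above, assuming the caller already holds the listener lock:
 * read the queue fields first, and only do the acquire/release dance when
 * committing to dequeue something.
 *
 *	if (!TAILQ_EMPTY(&head->so_comp)) {		// preflight, no list lock
 *		so_acquire_accept_list(head, NULL);	// commit: serialize
 *		if (!TAILQ_EMPTY(&head->so_comp)) {	// re-check after waiting
 *			// ... dequeue from so_comp ...
 *		}
 *		so_release_accept_list(head);
 *	}
 */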
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}
void
so_release_accept_list(struct socket *head)
{
	if (head->so_proto->pr_getlock != NULL) {
		lck_mtx_t *mutex_held;

		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
		wakeup((caddr_t)&head->so_incomp);
	}
}
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif	/* FLOW_DIVERT */

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}
static void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0\n", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount) {
		soclose_wait_locked(so);
	}

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int persocklock = 0;
		int incomp_overflow_only;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		/*
		 * We can drop the lock on the listener once
		 * we've acquired the incoming list
		 */
		if (so->so_proto->pr_getlock != NULL) {
			persocklock = 1;
			so_acquire_accept_list(so, NULL);
			socket_unlock(so, 0);
		}

		incomp_overflow_only = 1;

		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
			/*
			 * skip sockets thrown away by tcpdropdropblreq;
			 * they will get cleaned up by the garbage collection.
			 * otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW) {
				continue;
			}

			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			/*
			 * The extra reference for the list insures the
			 * validity of the socket pointer when we perform the
			 * unlock of the head above
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_incomp but !SS_INCOMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_comp but !SS_COMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_incomp not empty\n", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */
		}

		if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty\n", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */
		}

		if (persocklock) {
			socket_lock(so, 0);
			so_release_accept_list(so);
		}
	}

	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error) {
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			lck_mtx_t *mutex_held;

			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO)) {
				goto drop;
			}
			if (so->so_proto->pr_getlock != NULL) {
				mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
			} else {
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			}
			while (so->so_state & SS_ISCONNECTED) {
				ts.tv_sec = (so->so_linger / 100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the time fires,
					 * don't report an error
					 */
					if (error == EWOULDBLOCK) {
						error = 0;
					}
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0) {
			error = error2;
		}
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if ((so->so_flags & SOF_KNOTE) != 0) {
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
	}

	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);

	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return error;
}
*so
)
1504 if (so
->so_retaincnt
== 0) {
1505 error
= soclose_locked(so
);
1508 * if the FD is going away, but socket is
1509 * retained in kernel remove its reference
1512 if (so
->so_usecount
< 2) {
1513 panic("soclose: retaincnt non null and so=%p "
1514 "usecount=%d\n", so
, so
->so_usecount
);
1517 socket_unlock(so
, 1);
1522 * Must be called at splnet...
1524 /* Should already be locked */
1526 soabort(struct socket
*so
)
1530 #ifdef MORE_LOCKING_DEBUG
1531 lck_mtx_t
*mutex_held
;
1533 if (so
->so_proto
->pr_getlock
!= NULL
) {
1534 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
1536 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1538 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1541 if ((so
->so_flags
& SOF_ABORTED
) == 0) {
1542 so
->so_flags
|= SOF_ABORTED
;
1543 error
= (*so
->so_proto
->pr_usrreqs
->pru_abort
)(so
);
int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

	so_update_necp_policy(so, NULL, NULL);

	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return soacceptlock(so, nam, 1);
}
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);

	return error;
}
/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	so_update_necp_policy(so, NULL, nam);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock) {
			socket_unlock(so, 1);
		}
		return error;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock) {
			socket_unlock(so, 1);
		}
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
		}
	}
	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return soconnectlock(so, nam, 1);
}
/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_lock(so2, 1);
	}

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock) {
		socket_unlock(so2, 1);
	}
	return error;
}
int
soconnectxlocked(struct socket *so, struct sockaddr *src,
    struct sockaddr *dst, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return error;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		return EPERM;
	}

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set.  Otherwise, if connected,
	 * try to disconnect first.  This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, dst);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			if (error == EJUSTRETURN) {
				error = 0;
			}
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src, dst, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
		}
	}

	return error;
}
*so
)
1830 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
1834 if (so
->so_state
& SS_ISDISCONNECTING
) {
1839 error
= (*so
->so_proto
->pr_usrreqs
->pru_disconnect
)(so
);
1841 sflt_notify(so
, sock_evt_disconnected
, NULL
);
1848 /* Locking version */
1850 sodisconnect(struct socket
*so
)
1855 error
= sodisconnectlocked(so
);
1856 socket_unlock(so
, 1);
int
sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
{
	int error;

	/*
	 * Call the protocol disconnectx handler; let it handle all
	 * matters related to the connection state of this session.
	 */
	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
	if (error == 0) {
		/*
		 * The event applies only for the session, not for
		 * the disconnection of individual subflows.
		 */
		if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
			sflt_notify(so, sock_evt_disconnected, NULL);
		}
	}
	return error;
}
int
sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectxlocked(so, aid, cid);
	socket_unlock(so, 1);

	return error;
}

#define SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * Returns:	0			Success
 *		sblock:EWOULDBLOCK
 */
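/*
 * Editor's note: the typical call pattern (see sosend() below) passes a
 * caller-owned "sblocked" flag so that the send buffer lock acquired here
 * can be dropped exactly once on the way out, roughly:
 *
 *	int sblocked = 0;
 *	error = sosendcheck(so, addr, resid, clen, atomic, flags,
 *	    &sblocked, control);
 *	// ... on exit ...
 *	if (sblocked)
 *		sbunlock(&so->so_snd, FALSE);	// sketch; actual cleanup lives in sosend()
 */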
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked,
    struct mbuf *control)
{
	int error = 0;
	int32_t space;
	int assumelock = 0;

restart:
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				if (so->so_flags & SOF_DEFUNCT) {
					goto defunct;
				}
				return error;
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		error = EPIPE;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		return error;
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0) {
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
		} else
#endif /* CONTENT_FILTER */
		{
			return EPIPE;
		}
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return error;
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0) &&
			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
				return ENOTCONN;
			}
		} else if (addr == 0 && !(flags & MSG_HOLD)) {
			return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			    ENOTCONN : EDESTADDRREQ;
		}
	}

	if (so->so_flags & SOF_ENABLE_MSGS) {
		space = msgq_sbspace(so, control);
	} else {
		space = sbspace(&so->so_snd);
	}

	if (flags & MSG_OOB) {
		space += 1024;
	}
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat) {
		return EMSGSIZE;
	}

	if ((space < resid + clen &&
	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		/*
		 * don't block the connectx call when there's more data
		 * than can be copied.
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			if (space == 0) {
				return EWOULDBLOCK;
			}
			if (space < (int32_t)so->so_snd.sb_lowat) {
				return EWOULDBLOCK;
			}
		}
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return EWOULDBLOCK;
		}
		sbunlock(&so->so_snd, TRUE);	/* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT) {
				goto defunct;
			}
			return error;
		}
		goto restart;
	}
	return 0;
}
2044 * The data to be sent is described by "uio" if nonzero,
2045 * otherwise by the mbuf chain "top" (which must be null
2046 * if uio is not). Data provided in mbuf chain must be small
2047 * enough to send all at once.
2049 * Returns nonzero on error, timeout or signal; callers
2050 * must check for short counts if EINTR/ERESTART are returned.
2051 * Data and control buffers are freed on return.
2053 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
2054 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
2055 * point at the mbuf chain being constructed and go from there.
2057 * Returns: 0 Success
2063 * sosendcheck:EWOULDBLOCK
2067 * sosendcheck:??? [value from so_error]
2068 * <pru_send>:ECONNRESET[TCP]
2069 * <pru_send>:EINVAL[TCP]
2070 * <pru_send>:ENOBUFS[TCP]
2071 * <pru_send>:EADDRINUSE[TCP]
2072 * <pru_send>:EADDRNOTAVAIL[TCP]
2073 * <pru_send>:EAFNOSUPPORT[TCP]
2074 * <pru_send>:EACCES[TCP]
2075 * <pru_send>:EAGAIN[TCP]
2076 * <pru_send>:EPERM[TCP]
2077 * <pru_send>:EMSGSIZE[TCP]
2078 * <pru_send>:EHOSTUNREACH[TCP]
2079 * <pru_send>:ENETUNREACH[TCP]
2080 * <pru_send>:ENETDOWN[TCP]
2081 * <pru_send>:ENOMEM[TCP]
2082 * <pru_send>:ENOBUFS[TCP]
2083 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2084 * <pru_send>:EINVAL[AF_UNIX]
2085 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2086 * <pru_send>:EPIPE[AF_UNIX]
2087 * <pru_send>:ENOTCONN[AF_UNIX]
2088 * <pru_send>:EISCONN[AF_UNIX]
2089 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2090 * <sf_data_out>:??? [whatever a filter author chooses]
2092 * Notes: Other <pru_send> returns depend on the protocol family; all
2093 * <sf_data_out> returns depend on what the filter author causes
2094 * their filter to return.
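/*
 * Editor's sketch, not from the original source: an in-kernel caller that has
 * already built an mbuf chain hands it to sosend() with uio left NULL, while
 * a uio-based write leaves "top" NULL; the two are mutually exclusive as the
 * comment above states.  Variable names below are assumptions for
 * illustration only.
 *
 *	// datagram already assembled in "m" (chain fits the send buffer)
 *	error = sosend(so, (struct sockaddr *)&sin, NULL, m, NULL, 0);
 *
 *	// stream write described by a uio
 *	error = sosend(so, NULL, uio, NULL, NULL, 0);
 */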
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct mbuf **mp;
	struct mbuf *m, *freelist = NULL;
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, mlen, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	struct mbuf *control_copy = NULL;
	uint16_t headroom = 0;
	boolean_t en_tracing = FALSE;

	if (uio != NULL) {
		resid = uio_resid(uio);
	} else {
		resid = top->m_pkthdr.len;
	}

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	/*
	 * trace if tracing & network (vs. unix) sockets and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
		}
	}

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

		so_update_necp_policy(so, NULL, addr);
	}

	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		goto out_locked;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 * But it will be used by sockets doing message delivery.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
	    !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out_locked;
	}

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL) {
		clen = control->m_len;
	}

	if (soreserveheadroom != 0) {
		headroom = so->so_pktheadroom;
	}

	do {
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked, control);
		if (error) {
			goto out_locked;
		}

		mp = &top;
		if (so->so_flags & SOF_ENABLE_MSGS) {
			space = msgq_sbspace(so, control);
		} else {
			space = sbspace(&so->so_snd) - clen;
		}
		space += ((flags & MSG_OOB) ? 1024 : 0);

		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR) {
					top->m_flags |= M_EOR;
				}
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;
				int bytes_to_alloc;

				bytes_to_copy = imin(resid, space);

				bytes_to_alloc = bytes_to_copy;
				if (top == NULL) {
					bytes_to_alloc += headroom;
				}

				if (sosendminchain > 0) {
					chainlength = 0;
				} else {
					chainlength = sosendmaxchain;
				}

				/*
				 * Use big 4 KB cluster when the outgoing interface
				 * does not prefer 2 KB clusters
				 */
				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = sosendjcl && njcl > 0 &&
				    ((so->so_flags & SOF_MULTIPAGES) ||
				    sosendjcl_ignore_capab) &&
				    bigcl;

				socket_unlock(so, 0);

				do {
					int num_needed = 0;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write; the list is further limited to
					 * the number that are currently needed
					 * to fill the socket.  this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, then fall back to trying
					 * for mbufs.  if we fail early (or
					 * miscalculate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_alloc > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_alloc / M16KCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_alloc > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_alloc / MBIGCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					/*
					 * Allocate a cluster as we want to
					 * avoid splitting the data in more
					 * than one segment; using MINCLSIZE
					 * would lead us to allocate two mbufs
					 */
					if (soreserveheadroom != 0 &&
					    freelist == NULL &&
					    ((top == NULL &&
					    bytes_to_alloc > _MHLEN) ||
					    bytes_to_alloc > _MLEN)) {
						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
						    MCLBYTES;
						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					} else if (freelist == NULL &&
					    bytes_to_alloc > MINCLSIZE) {
						num_needed =
						    bytes_to_alloc / MCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE) {
							num_needed++;
						}

						freelist =
						    m_getpackets_internal(
							(unsigned int *)&num_needed,
							hdrs_needed, M_WAIT, 0,
							MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					/*
					 * For datagram protocols, leave
					 * headroom for protocol headers
					 * in the first cluster of the chain
					 */
					if (freelist != NULL && atomic &&
					    top == NULL && headroom > 0) {
						freelist->m_data += headroom;
					}

					/*
					 * Fall back to regular mbufs without
					 * reserving the socket headroom
					 */
					if (freelist == NULL) {
						if (top == NULL) {
							MGETHDR(freelist,
							    M_WAIT, MT_DATA);
						} else {
							MGET(freelist,
							    M_WAIT, MT_DATA);
						}

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto out_locked;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					if ((m->m_flags & M_EXT)) {
						mlen = m->m_ext.ext_size -
						    M_LEADINGSPACE(m);
					} else if ((m->m_flags & M_PKTHDR)) {
						mlen =
						    MHLEN - M_LEADINGSPACE(m);
					} else {
						mlen = MLEN - M_LEADINGSPACE(m);
					}
					len = imin(mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					m->m_len = len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error) {
						break;
					}
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR) {
							top->m_flags |= M_EOR;
						}
						break;
					}
					bytes_to_copy = min(resid, space);
				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error) {
					goto out_locked;
				}
			}

			if (flags & (MSG_HOLD | MSG_SEND)) {
				/* Enqueue for later, go away if HOLD */
				struct mbuf *mb1;

				if (so->so_temp && (flags & MSG_FLUSH)) {
					m_freem(so->so_temp);
					so->so_temp = NULL;
				}
				if (so->so_temp) {
					so->so_tail->m_next = top;
				} else {
					so->so_temp = top;
				}
				mb1 = top;
				while (mb1->m_next) {
					mb1 = mb1->m_next;
				}
				so->so_tail = mb1;
				if (flags & MSG_HOLD) {
					top = NULL;
					goto out_locked;
				}
				top = so->so_temp;
			}
			if (dontroute) {
				so->so_options |= SO_DONTROUTE;
			}

			/*
			 * Compute flags here, for pru_send and NKEs
			 */
2479 * If the user set MSG_EOF, the protocol
2480 * understands this flag and nothing left to
2481 * send then use PRU_SEND_EOF instead of PRU_SEND.
2483 sendflags
= (flags
& MSG_OOB
) ? PRUS_OOB
:
2484 ((flags
& MSG_EOF
) &&
2485 (so
->so_proto
->pr_flags
& PR_IMPLOPCL
) &&
2486 (resid
<= 0)) ? PRUS_EOF
:
2487 /* If there is more to send set PRUS_MORETOCOME */
2488 (resid
> 0 && space
> 0) ? PRUS_MORETOCOME
: 0;
2490 if ((flags
& MSG_SKIPCFIL
) == 0) {
2492 * Socket filter processing
2494 error
= sflt_data_out(so
, addr
, &top
,
2495 &control
, (sendflags
& MSG_OOB
) ?
2496 sock_data_filt_flag_oob
: 0);
2498 if (error
== EJUSTRETURN
) {
2508 * Content filter processing
2510 error
= cfil_sock_data_out(so
, addr
, top
,
2511 control
, sendflags
);
2513 if (error
== EJUSTRETURN
) {
2521 #endif /* CONTENT_FILTER */
2523 if (so
->so_flags
& SOF_ENABLE_MSGS
) {
2525 * Make a copy of control mbuf,
2526 * so that msg priority can be
2527 * passed to subsequent mbufs.
2529 control_copy
= m_dup(control
, M_NOWAIT
);
2531 error
= (*so
->so_proto
->pr_usrreqs
->pru_send
)
2532 (so
, sendflags
, top
, addr
, control
, p
);
2534 if (flags
& MSG_SEND
) {
2539 so
->so_options
&= ~SO_DONTROUTE
;
2543 control
= control_copy
;
2544 control_copy
= NULL
;
2550 } while (resid
&& space
> 0);
2555 sbunlock(&so
->so_snd
, FALSE
); /* will unlock socket */
2557 socket_unlock(so
, 1);
2562 if (control
!= NULL
) {
2565 if (freelist
!= NULL
) {
2566 m_freem_list(freelist
);
2568 if (control_copy
!= NULL
) {
2569 m_freem(control_copy
);
2572 soclearfastopen(so
);
2575 /* resid passed here is the bytes left in uio */
2576 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite
, DBG_FUNC_END
,
2577 VM_KERNEL_ADDRPERM(so
),
2578 ((error
== EWOULDBLOCK
) ? kEnTrFlagNoWork
: 0),
2579 (int64_t)(orig_resid
- resid
));
2581 KERNEL_DEBUG(DBG_FNC_SOSEND
| DBG_FUNC_END
, so
, resid
,
2582 so
->so_snd
.sb_cc
, space
, error
);
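
/*
 * Illustrative sketch (not part of this file, never compiled): how the
 * allocation logic in sosend() above maps the number of bytes it wants to
 * buffer onto an mbuf cluster class.  The helper name and the simplified
 * signature are hypothetical; the thresholds mirror the MBIGCLBYTES,
 * MCLBYTES and MINCLSIZE checks in the loop above.
 */
#if 0
static size_t
sosend_cluster_class_sketch(size_t bytes_to_alloc, boolean_t jumbocl,
    boolean_t bigcl)
{
	if (jumbocl && bytes_to_alloc > MBIGCLBYTES) {
		return M16KCLBYTES;	/* jumbo clusters, when the pool exists */
	}
	if (bigcl && bytes_to_alloc > MCLBYTES) {
		return MBIGCLBYTES;	/* 4 KB clusters */
	}
	if (bytes_to_alloc > MINCLSIZE) {
		return MCLBYTES;	/* regular 2 KB clusters */
	}
	return 0;			/* plain mbufs are sufficient */
}
#endif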
sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
    struct mbuf *m0, *control_end;

    socket_lock_assert_owned(so);

    /*
     * top must point to the mbuf chain to be sent.
     * If control is not NULL, top must be a packet header
     */
    VERIFY(top != NULL &&
        (control == NULL || top->m_flags & M_PKTHDR));

    /*
     * If control is not passed in, see if we can get it
     */
    if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
        // Locate start of control if present and start of data
        for (m0 = top; m0 != NULL; m0 = m0->m_next) {
            if (m0->m_flags & M_PKTHDR) {
            } else if (m0->m_type == MT_CONTROL) {
                if (control == NULL) {
                    // Found start of control
                if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
                    // Found end of control
        if (control_end != NULL) {
            control_end->m_next = NULL;

    int error = (*so->so_proto->pr_usrreqs->pru_send)
        (so, sendflags, top, addr, control, current_proc());
 * Supports only connected sockets (no address) without ancillary data
 * (control mbuf) for atomic protocols
 */
sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
    struct mbuf *m, *freelist = NULL;
    user_ssize_t len, resid;
    int error, dontroute, mlen;
    int atomic = sosendallatonce(so);
    struct proc *p = current_proc();
    struct mbuf *top = NULL;
    uint16_t headroom = 0;

    KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
        so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

    if (so->so_type != SOCK_DGRAM) {
    if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
        error = EPROTONOSUPPORT;
    if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {

    resid = uio_array_resid(uioarray, uiocnt);

    /*
     * In theory resid should be unsigned.
     * However, space must be signed, as it might be less than 0
     * if we over-committed, and we must use a signed comparison
     * of space and resid. On the other hand, a negative resid
     * causes us to loop sending 0-length segments to the protocol.
     *
     * Note: We limit resid to be a positive int value as we use
     * imin() to set bytes_to_copy -- radr://14558484
     */
    if (resid < 0 || resid > INT_MAX) {

    so_update_last_owner_locked(so, p);
    so_update_policy(so);
    so_update_necp_policy(so, NULL, NULL);

    dontroute = (flags & MSG_DONTROUTE) &&
        (so->so_options & SO_DONTROUTE) == 0 &&
        (so->so_proto->pr_flags & PR_ATOMIC);
    OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

    error = sosendcheck(so, NULL, resid, 0, atomic, flags,

    /*
     * Use big 4 KB clusters when the outgoing interface does not prefer
     */
    bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;

    if (soreserveheadroom != 0) {
        headroom = so->so_pktheadroom;

        size_t maxpktlen = 0;

        if (sosendminchain > 0) {
            chainlength = sosendmaxchain;

        socket_unlock(so, 0);

        /*
         * Find a set of uio that fit in a reasonable number
         */
        for (i = uiofirst; i < uiocnt; i++) {
            struct uio *auio = uioarray[i];

            len = uio_resid(auio);

            /* Do nothing for empty messages */

            if (len > maxpktlen) {
            if (chainlength > sosendmaxchain) {

        /*
         * Nothing left to send
         */
        if (num_needed == 0) {

        /*
         * Allocate buffer large enough to include headroom space for
         * network and link header
         */
        bytes_to_alloc = maxpktlen + headroom;

        /*
         * Allocate a single contiguous buffer of the smallest available
         * size when possible
         */
        if (bytes_to_alloc > MCLBYTES &&
            bytes_to_alloc <= MBIGCLBYTES && bigcl) {
            freelist = m_getpackets_internal(
                (unsigned int *)&num_needed,
                num_needed, M_WAIT, 1,
        } else if (bytes_to_alloc > _MHLEN &&
            bytes_to_alloc <= MCLBYTES) {
            freelist = m_getpackets_internal(
                (unsigned int *)&num_needed,
                num_needed, M_WAIT, 1,
            freelist = m_allocpacket_internal(
                (unsigned int *)&num_needed,
                bytes_to_alloc, NULL, M_WAIT, 1, 0);

        if (freelist == NULL) {

        /*
         * Copy each uio of the set into its own mbuf packet
         */
        for (i = uiofirst, m = freelist;
            i < uiolast && m != NULL;
            struct uio *auio = uioarray[i];

            bytes_to_copy = uio_resid(auio);

            /* Do nothing for empty messages */
            if (bytes_to_copy == 0) {
            /*
             * Leave headroom for protocol headers
             * in the first mbuf of the chain
             */
            m->m_data += headroom;

            for (n = m; n != NULL; n = n->m_next) {
                if ((m->m_flags & M_EXT)) {
                    mlen = m->m_ext.ext_size -
                } else if ((m->m_flags & M_PKTHDR)) {
                        MHLEN - M_LEADINGSPACE(m);
                    mlen = MLEN - M_LEADINGSPACE(m);
                len = imin(mlen, bytes_to_copy);

                /*
                 * Note: uiomove() decrements the iovec
                 */
                error = uiomove(mtod(n, caddr_t),

                m->m_pkthdr.len += len;

                VERIFY(m->m_pkthdr.len <= maxpktlen);

                bytes_to_copy -= len;

            if (m->m_pkthdr.len == 0) {
                    "%s:%d so %llx pkt %llx type %u len null\n",
                    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                    (uint64_t)DEBUG_KERNEL_ADDRPERM(m),

            so->so_options |= SO_DONTROUTE;

        if ((flags & MSG_SKIPCFIL) == 0) {
            struct mbuf **prevnextp = NULL;

            for (i = uiofirst, m = top;
                i < uiolast && m != NULL;
                struct mbuf *nextpkt = m->m_nextpkt;

                /*
                 * Socket filter processing
                 */
                error = sflt_data_out(so, NULL, &m,
                if (error != 0 && error != EJUSTRETURN) {

                /*
                 * Content filter processing
                 */
                error = cfil_sock_data_out(so, NULL, m,
                if (error != 0 && error != EJUSTRETURN) {
#endif /* CONTENT_FILTER */

                /*
                 * Remove packet from the list when
                 * swallowed by a filter
                 */
                if (error == EJUSTRETURN) {
                    if (prevnextp != NULL) {
                        *prevnextp = nextpkt;

                    prevnextp = &m->m_nextpkt;

        error = (*so->so_proto->pr_usrreqs->pru_send_list)
            (so, 0, top, NULL, NULL, p);

            so->so_options &= ~SO_DONTROUTE;
    } while (resid > 0 && error == 0);

    sbunlock(&so->so_snd, FALSE);    /* will unlock socket */
    socket_unlock(so, 1);

    if (freelist != NULL) {
        m_freem_list(freelist);

    KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
        so->so_snd.sb_cc, 0, error);
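
/*
 * Illustrative userland analogue (not part of this file, never compiled):
 * sosend_list() is the batched datagram path, where each uio in uioarray
 * becomes exactly one packet.  A caller that only has the standard API gets
 * the same per-packet framing by issuing one sendmsg(2) per datagram on a
 * connected SOCK_DGRAM socket.  The helper name is hypothetical.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>

static int
send_datagrams_sketch(int s, struct iovec *iovs, int count)
{
	for (int i = 0; i < count; i++) {
		struct msghdr msg = { .msg_iov = &iovs[i], .msg_iovlen = 1 };
		/* Each call produces one packet, as each uio does above. */
		if (sendmsg(s, &msg, 0) == -1) {
			return -1;
		}
	}
	return 0;
}
#endif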
 * May return ERESTART when packet is dropped by MAC policy check
 */
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
    struct mbuf *m = *mp;
    struct mbuf *nextrecord = *nextrecordp;

    KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
    /*
     * Call the MAC framework for policy checking if we're in
     * the user process context and the socket isn't connected.
     */
    if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
        struct mbuf *m0 = m;
        /*
         * Dequeue this record (temporarily) from the receive
         * list since we're about to drop the socket's lock
         * where a new record may arrive and be appended to
         * the list. Upon MAC policy failure, the record
         * will be freed. Otherwise, we'll add it back to
         * the head of the list. We cannot rely on SB_LOCK
         * because append operation uses the socket's lock.
         */
            m->m_nextpkt = NULL;
            sbfree(&so->so_rcv, m);
        } while (m != NULL);

        so->so_rcv.sb_mb = nextrecord;
        SB_EMPTY_FIXUP(&so->so_rcv);
        SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
        socket_unlock(so, 0);

        if (mac_socket_check_received(proc_ucred(p), so,
            mtod(m, struct sockaddr *)) != 0) {
            /*
             * MAC policy failure; free this record and
             * process the next record (or block until
             * one is available). We have adjusted sb_cc
             * and sb_mbcnt above so there is no need to
             * call sbfree() again.
             */
            /*
             * Clear SB_LOCK but don't unlock the socket.
             * Process the next record or wait for one.
             */
            sbunlock(&so->so_rcv, TRUE);    /* stay locked */

        /*
         * If the socket has been defunct'd, drop it.
         */
        if (so->so_flags & SOF_DEFUNCT) {

        /*
         * Re-adjust the socket receive list and re-enqueue
         * the record in front of any packets which may have
         * been appended while we dropped the lock.
         */
        for (m = m0; m->m_next != NULL; m = m->m_next) {
            sballoc(&so->so_rcv, m);
        sballoc(&so->so_rcv, m);
        if (so->so_rcv.sb_mb == NULL) {
            so->so_rcv.sb_lastrecord = m0;
            so->so_rcv.sb_mbtail = m;
        nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
        so->so_rcv.sb_mb = m;
        SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
#endif /* CONFIG_MACF_SOCKET_SUBSET */

    *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
    if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
        error = EWOULDBLOCK;
    if (flags & MSG_PEEK) {
        sbfree(&so->so_rcv, m);
        if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
            panic("%s: about to create invalid socketbuf",
        MFREE(m, so->so_rcv.sb_mb);
        m = so->so_rcv.sb_mb;
            m->m_nextpkt = nextrecord;
            so->so_rcv.sb_mb = nextrecord;
            SB_EMPTY_FIXUP(&so->so_rcv);

    *nextrecordp = nextrecord;
 * Process one or more MT_CONTROL mbufs present before any data mbufs
 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 * just copy the data; if !MSG_PEEK, we call into the protocol to
 * perform externalization.
 */
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
    struct mbuf *cm = NULL, *cmn;
    struct mbuf **cme = &cm;
    struct sockbuf *sb_rcv = &so->so_rcv;
    struct mbuf **msgpcm = NULL;
    struct mbuf *m = *mp;
    struct mbuf *nextrecord = *nextrecordp;
    struct protosw *pr = so->so_proto;

    /*
     * Externalizing the control messages would require us to
     * drop the socket's lock below.  Once we re-acquire the
     * lock, the mbuf chain might change.  In order to preserve
     * consistency, we unlink all control messages from the
     * first mbuf chain in one shot and link them separately
     * onto a different chain.
     */
        if (flags & MSG_PEEK) {
            if (controlp != NULL) {
                if (*controlp == NULL) {
                *controlp = m_copy(m, 0, m->m_len);

                /*
                 * If we failed to allocate an mbuf,
                 * release any previously allocated
                 * mbufs for control data. Return
                 * an error. Keep the mbufs in the
                 * socket as this is using
                 */
                if (*controlp == NULL) {
                controlp = &(*controlp)->m_next;
            m->m_nextpkt = NULL;
            sb_rcv->sb_mb = m->m_next;
            cme = &(*cme)->m_next;
    } while (m != NULL && m->m_type == MT_CONTROL);

    if (!(flags & MSG_PEEK)) {
        if (sb_rcv->sb_mb != NULL) {
            sb_rcv->sb_mb->m_nextpkt = nextrecord;
            sb_rcv->sb_mb = nextrecord;
            SB_EMPTY_FIXUP(sb_rcv);
        if (nextrecord == NULL) {
            sb_rcv->sb_lastrecord = m;

    SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
    SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

    while (cm != NULL) {
        cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

        /*
         * Call the protocol to externalize SCM_RIGHTS message
         * and return the modified message to the caller upon
         * success.  Otherwise, all other control messages are
         * returned unmodified to the caller.  Note that we
         * only get into this loop if MSG_PEEK is not set.
         */
        if (pr->pr_domain->dom_externalize != NULL &&
            cmsg_type == SCM_RIGHTS) {
            /*
             * Release socket lock: see 3903171.  This
             * would also allow more records to be appended
             * to the socket buffer.  We still have SB_LOCK
             * set on it, so we can be sure that the head
             * of the mbuf chain won't change.
             */
            socket_unlock(so, 0);
            error = (*pr->pr_domain->dom_externalize)(cm);

        if (controlp != NULL && error == 0) {
            controlp = &(*controlp)->m_next;

    /*
     * Update the value of nextrecord in case we received new
     * records when the socket was unlocked above for
     * externalizing SCM_RIGHTS.
     */
        nextrecord = sb_rcv->sb_mb->m_nextpkt;
        nextrecord = sb_rcv->sb_mb;

    *nextrecordp = nextrecord;
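
/*
 * Illustrative userland view (not part of this file, never compiled): when
 * soreceive_ctl() externalizes an SCM_RIGHTS control message, the receiving
 * process sees it as a cmsghdr carrying file descriptors.  This is standard
 * recvmsg(2)/CMSG_* usage; the helper name is hypothetical.
 */
#if 0
#include <sys/socket.h>
#include <string.h>

static int
recv_fd_sketch(int s)
{
	char data[1];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	char cmsgbuf[CMSG_SPACE(sizeof(int))];
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cmsgbuf, .msg_controllen = sizeof(cmsgbuf),
	};
	if (recvmsg(s, &msg, 0) == -1) {
		return -1;
	}
	for (struct cmsghdr *cm = CMSG_FIRSTHDR(&msg); cm != NULL;
	    cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
			int fd;
			memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
			return fd;	/* descriptor externalized by the kernel */
		}
	}
	return -1;
}
#endif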
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 *
 * Returns:	0			Success
 *		sblock:EWOULDBLOCK
 *		sodelayed_copy:EFAULT
 *	<pru_rcvoob>:EINVAL[TCP]
 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
 *	<pr_domain->dom_externalize>:???
 *
 * Notes:	Additional return values from calls through <pru_rcvoob> and
 *		<pr_domain->dom_externalize> depend on protocols other than
 *		TCP or AF_UNIX, which are documented above.
 */
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
    struct mbuf *m, **mp, *ml = NULL;
    struct mbuf *nextrecord, *free_list;
    int flags, error, offset;
    struct protosw *pr = so->so_proto;
    user_ssize_t orig_resid = uio_resid(uio);
    user_ssize_t delayed_copy_len;
    struct proc *p = current_proc();
    boolean_t en_tracing = FALSE;

    /*
     * Sanity check on the length passed by caller as we are making 'int'
     */
    if (orig_resid < 0 || orig_resid > INT_MAX) {

    KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
        uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
        so->so_rcv.sb_hiwat);

    so_update_last_owner_locked(so, p);
    so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
    if (so->so_usecount == 1) {
        panic("%s: so=%x no other reference on socket\n", __func__, so);

    if (controlp != NULL) {
    if (flagsp != NULL) {
        flags = *flagsp & ~MSG_EOR;

    /*
     * If a recv attempt is made on a previously-accepted socket
     * that has been marked as inactive (disconnected), reject
     */
    if (so->so_flags & SOF_DEFUNCT) {
        struct sockbuf *sb = &so->so_rcv;

        SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
            __func__, proc_pid(p), proc_best_name(p),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so), error);
        /*
         * This socket should have been disconnected and flushed
         * prior to being returned from sodefunct(); there should
         * be no data on its receive list, so panic otherwise.
         */
        if (so->so_state & SS_DEFUNCT) {
            sb_empty_assert(sb, __func__);
        socket_unlock(so, 1);

    if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
        pr->pr_usrreqs->pru_preconnect) {
        /*
         * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
         * call write() right after this.  *If* the app calls a read
         * we do not want to block this read indefinitely.  Thus,
         * we trigger a connect so that the session gets initiated.
         */
        error = (*pr->pr_usrreqs->pru_preconnect)(so);
            socket_unlock(so, 1);

    if (ENTR_SHOULDTRACE &&
        (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
        /*
         * enable energy tracing for inet sockets that go over
         * non-loopback interfaces only.
         */
        struct inpcb *inp = sotoinpcb(so);
        if (inp->inp_last_outifp != NULL &&
            !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
            KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
                VM_KERNEL_ADDRPERM(so),
                ((so->so_state & SS_NBIO) ?
                kEnTrFlagNonBlocking : 0),
                (int64_t)orig_resid);

    /*
     * When SO_WANTOOBFLAG is set we try to get out-of-band data
     * regardless of the flags argument.  Here is the case where
     * out-of-band data is not inline.
     */
    if ((flags & MSG_OOB) ||
        ((so->so_options & SO_WANTOOBFLAG) != 0 &&
        (so->so_options & SO_OOBINLINE) == 0 &&
        (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
        m = m_get(M_WAIT, MT_DATA);
            socket_unlock(so, 1);
            KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
                ENOBUFS, 0, 0, 0, 0);
        error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);

            socket_unlock(so, 0);
            error = uiomove(mtod(m, caddr_t),
                imin(uio_resid(uio), m->m_len), uio);
        } while (uio_resid(uio) && error == 0 && m != NULL);

        if ((so->so_options & SO_WANTOOBFLAG) != 0) {
            if (error == EWOULDBLOCK || error == EINVAL) {
                /*
                 * Let's try to get normal data:
                 * EWOULDBLOCK: out-of-band data not
                 * received yet.  EINVAL: out-of-band data
                 */
            } else if (error == 0 && flagsp != NULL) {
        socket_unlock(so, 1);
            KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
                VM_KERNEL_ADDRPERM(so), 0,
                (int64_t)(orig_resid - uio_resid(uio)));
        KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,

    if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
        (*pr->pr_usrreqs->pru_rcvd)(so, 0);

    delayed_copy_len = 0;

#ifdef MORE_LOCKING_DEBUG
    if (so->so_usecount <= 1) {
        printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);

    /*
     * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
     * and if so just return to the caller.  This could happen when
     * soreceive() is called by a socket upcall function during the
     * time the socket is freed.  The socket buffer would have been
     * locked across the upcall, therefore we cannot put this thread
     * to sleep (else we will deadlock) or return EWOULDBLOCK (else
     * we may livelock), because the lock on the socket buffer will
     * only be released when the upcall routine returns to its caller.
     * Because the socket has been officially closed, there can be
     * no further read on it.
     *
     * A multipath subflow socket would have its SS_NOFDREF set by
     * default, so check for SOF_MP_SUBFLOW socket flag; when the
     * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
     */
    if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
        (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
        socket_unlock(so, 1);

    error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
        socket_unlock(so, 1);
        KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
            KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
                VM_KERNEL_ADDRPERM(so), 0,
                (int64_t)(orig_resid - uio_resid(uio)));

    m = so->so_rcv.sb_mb;
    /*
     * If we have less data than requested, block awaiting more
     * (subject to any timeout) if:
     *   1. the current count is less than the low water mark, or
     *   2. MSG_WAITALL is set, and it is possible to do the entire
     *	receive operation at once if we block (resid <= hiwat).
     *   3. MSG_DONTWAIT is not set
     * If MSG_WAITALL is set but resid is larger than the receive buffer,
     * we have to do the receive in sections, and thus risk returning
     * a short count if a timeout or signal occurs after we start.
     */
    if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
        so->so_rcv.sb_cc < uio_resid(uio)) &&
        (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
        ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
        m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
        /*
         * Panic if we notice inconsistencies in the socket's
         * receive list; both sb_mb and sb_cc should correctly
         * reflect the contents of the list, otherwise we may
         * end up with false positives during select() or poll()
         * which could put the application in a bad state.
         */
        SB_MB_CHECK(&so->so_rcv);

            error = so->so_error;
            if ((flags & MSG_PEEK) == 0) {
        if (so->so_state & SS_CANTRCVMORE) {
            /*
             * Deal with half closed connections
             */
            if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
                cfil_sock_data_pending(&so->so_rcv) != 0) {
                    "so %llx ignore SS_CANTRCVMORE",
                    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
#endif /* CONTENT_FILTER */
        for (; m != NULL; m = m->m_next) {
            if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
                m = so->so_rcv.sb_mb;
        if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
            (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
        if (uio_resid(uio) == 0) {

        if ((so->so_state & SS_NBIO) ||
            (flags & (MSG_DONTWAIT | MSG_NBIO))) {
            error = EWOULDBLOCK;
        SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
        sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
            printf("Waiting for socket data\n");

        error = sbwait(&so->so_rcv);
#if EVEN_MORE_LOCKING_DEBUG
            printf("SORECEIVE - sbwait returned %d\n", error);
        if (so->so_usecount < 1) {
            panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
                __func__, so, so->so_usecount);
            socket_unlock(so, 1);
            KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
                KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
                    VM_KERNEL_ADDRPERM(so), 0,
                    (int64_t)(orig_resid - uio_resid(uio)));

    OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
    SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
    SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
    nextrecord = m->m_nextpkt;

    if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
        error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
        if (error == ERESTART) {
        } else if (error != 0) {

    /*
     * Process one or more MT_CONTROL mbufs present before any data mbufs
     * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
     * just copy the data; if !MSG_PEEK, we call into the protocol to
     * perform externalization.
     */
    if (m != NULL && m->m_type == MT_CONTROL) {
        error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);

    /*
     * If the socket is a TCP socket with message delivery
     * enabled, then create a control msg to deliver the
     * relative TCP sequence number for this data.  Waiting
     * until this point will protect against failures to
     * allocate an mbuf for control msgs.
     */
    if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
        (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
        struct mbuf *seq_cm;

        seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
            sizeof(uint32_t), SCM_SEQNUM, SOL_SOCKET);
        if (seq_cm == NULL) {
            /* unable to allocate a control mbuf */
        controlp = &seq_cm->m_next;

        if (!(flags & MSG_PEEK)) {
            /*
             * We get here because m points to an mbuf following
             * any MT_SONAME or MT_CONTROL mbufs which have been
             * processed above.  In any case, m should be pointing
             * to the head of the mbuf chain, and the nextrecord
             * should be either NULL or equal to m->m_nextpkt.
             * See comments above about SB_LOCK.
             */
            if (m != so->so_rcv.sb_mb ||
                m->m_nextpkt != nextrecord) {
                panic("%s: post-control !sync so=%p m=%p "
                    "nextrecord=%p\n", __func__, so, m,
            if (nextrecord == NULL) {
                so->so_rcv.sb_lastrecord = m;

        if (type == MT_OOBDATA) {
        if (!(flags & MSG_PEEK)) {
            SB_EMPTY_FIXUP(&so->so_rcv);
    SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
    SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

    if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {

        (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
        if (m->m_type == MT_OOBDATA) {
            if (type != MT_OOBDATA) {
        } else if (type == MT_OOBDATA) {
        /*
         * Make sure to always set MSG_OOB event when getting
         * out of band data inline.
         */
        if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
            (so->so_options & SO_OOBINLINE) != 0 &&
            (so->so_state & SS_RCVATMARK) != 0) {
        so->so_state &= ~SS_RCVATMARK;
        len = uio_resid(uio) - delayed_copy_len;
        if (so->so_oobmark && len > so->so_oobmark - offset) {
            len = so->so_oobmark - offset;
        if (len > m->m_len - moff) {
            len = m->m_len - moff;
        /*
         * If mp is set, just pass back the mbufs.
         * Otherwise copy them out via the uio, then free.
         * Sockbuf must be consistent here (points to current mbuf,
         * it points to next record) when we drop priority;
         * we must note any additions to the sockbuf when we
         * block interrupts again.
         */
            SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
            SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
            if (can_delay && len == m->m_len) {
                /*
                 * only delay the copy if we're consuming the
                 * mbuf and we're NOT in MSG_PEEK mode
                 * and we have enough data to make it worthwhile
                 * to drop and retake the lock... can_delay
                 * reflects the state of the 2 latter
                 * constraints moff should always be zero
                 */
                delayed_copy_len += len;
                if (delayed_copy_len) {
                    error = sodelayed_copy(so, uio,
                        &free_list, &delayed_copy_len);
                    /*
                     * can only get here if MSG_PEEK is not
                     * set therefore, m should point at the
                     * head of the rcv queue; if it doesn't,
                     * it means something drastically
                     * changed while we were out from behind
                     * the lock in sodelayed_copy.  perhaps
                     * a RST on the stream.  in any event,
                     * the stream has been interrupted.  it's
                     * probably best just to return whatever
                     * data we've moved and let the caller
                     */
                    if (m != so->so_rcv.sb_mb) {
                socket_unlock(so, 0);
                error = uiomove(mtod(m, caddr_t) + moff,
            uio_setresid(uio, (uio_resid(uio) - len));
        if (len == m->m_len - moff) {
            if (m->m_flags & M_EOR) {
            if (flags & MSG_PEEK) {
                nextrecord = m->m_nextpkt;
                sbfree(&so->so_rcv, m);
                m->m_nextpkt = NULL;

                /*
                 * If this packet is an unordered packet
                 * (indicated by M_UNORDERED_DATA flag), remove
                 * the additional bytes added to the
                 * receive socket buffer size.
                 */
                if ((so->so_flags & SOF_ENABLE_MSGS) &&
                    (m->m_flags & M_UNORDERED_DATA) &&
                    sbreserve(&so->so_rcv,
                    so->so_rcv.sb_hiwat - m->m_len)) {
                    if (so->so_msg_state->msg_uno_bytes >
                        msg_uno_bytes -= m->m_len;
                    m->m_flags &= ~M_UNORDERED_DATA;

                    so->so_rcv.sb_mb = m = m->m_next;
                    if (free_list == NULL) {
                    so->so_rcv.sb_mb = m = m->m_next;
                    m->m_nextpkt = nextrecord;
                    if (nextrecord == NULL) {
                        so->so_rcv.sb_lastrecord = m;
                    so->so_rcv.sb_mb = nextrecord;
                    SB_EMPTY_FIXUP(&so->so_rcv);
                SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
                SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
            if (flags & MSG_PEEK) {
                if (flags & MSG_DONTWAIT) {
                    copy_flag = M_DONTWAIT;
                *mp = m_copym(m, 0, len, copy_flag);
                    /*
                     * Failed to allocate an mbuf?
                     * Adjust uio_resid back, it was
                     * adjusted down by len bytes which
                     * we didn't copy over.
                     */
                        (uio_resid(uio) + len));
                so->so_rcv.sb_cc -= len;
        if (so->so_oobmark) {
            if ((flags & MSG_PEEK) == 0) {
                so->so_oobmark -= len;
                if (so->so_oobmark == 0) {
                    so->so_state |= SS_RCVATMARK;
                    /*
                     * delay posting the actual event until
                     * after any delayed copy processing
                     */
                if (offset == so->so_oobmark) {
        if (flags & MSG_EOR) {
        /*
         * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
         * (for non-atomic socket), we must not quit until
         * "uio->uio_resid == 0" or an error termination.
         * If a signal/timeout occurs, return with a short
         * count but without error.  Keep sockbuf locked
         * against other readers.
         */
        while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
            (uio_resid(uio) - delayed_copy_len) > 0 &&
            !sosendallatonce(so) && !nextrecord) {
            if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
                && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */

            /*
             * Depending on the protocol (e.g. TCP), the following
             * might cause the socket lock to be dropped and later
             * be reacquired, and more data could have arrived and
             * have been appended to the receive socket buffer by
             * the time it returns.  Therefore, we only sleep in
             * sbwait() below if and only if the socket buffer is
             * empty, in order to avoid a false sleep.
             */
            if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
                (((struct inpcb *)so->so_pcb)->inp_state !=
                INPCB_STATE_DEAD)) {
                (*pr->pr_usrreqs->pru_rcvd)(so, flags);

            SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
            SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

            if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
            /*
             * have to wait until after we get back from the sbwait
             * to do the copy because we will drop the lock if we
             * have enough data that has been delayed... by dropping
             * the lock we open up a window allowing the netisr
             * thread to process the incoming packets and to change
             * the state of this socket... we're issuing the sbwait
             * because the socket is empty and we're expecting the
             * netisr thread to wake us up when more packets arrive;
             * if we allow that processing to happen and then sbwait
             * we could stall forever with packets sitting in the
             * socket if no further packets arrive from the remote
             *
             * we want to copy before we've collected all the data
             * to satisfy this request to allow the copy to overlap
             * the incoming packet processing on an MP system
             */
            if (delayed_copy_len > sorecvmincopy &&
                (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
                error = sodelayed_copy(so, uio,
                    &free_list, &delayed_copy_len);
            m = so->so_rcv.sb_mb;
                nextrecord = m->m_nextpkt;
            SB_MB_CHECK(&so->so_rcv);

#ifdef MORE_LOCKING_DEBUG
    if (so->so_usecount <= 1) {
        panic("%s: after big while so=%p ref=%d on socket\n",
            __func__, so, so->so_usecount);

    if (m != NULL && pr->pr_flags & PR_ATOMIC) {
        if (so->so_options & SO_DONTTRUNC) {
            flags |= MSG_RCVMORE;
        if ((flags & MSG_PEEK) == 0) {
            (void) sbdroprecord(&so->so_rcv);

    /*
     * pru_rcvd below (for TCP) may cause more data to be received
     * if the socket lock is dropped prior to sending the ACK; some
     * legacy OpenTransport applications don't handle this well
     * (if it receives less data than requested while MSG_HAVEMORE
     * is set), and so we set the flag now based on what we know
     * prior to calling pru_rcvd.
     */
    if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
        flags |= MSG_HAVEMORE;

    if ((flags & MSG_PEEK) == 0) {
            so->so_rcv.sb_mb = nextrecord;
            /*
             * First part is an inline SB_EMPTY_FIXUP().  Second
             * part makes sure sb_lastrecord is up-to-date if
             * there is still data in the socket buffer.
             */
            if (so->so_rcv.sb_mb == NULL) {
                so->so_rcv.sb_mbtail = NULL;
                so->so_rcv.sb_lastrecord = NULL;
            } else if (nextrecord->m_nextpkt == NULL) {
                so->so_rcv.sb_lastrecord = nextrecord;
            SB_MB_CHECK(&so->so_rcv);
        SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
        if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
            (*pr->pr_usrreqs->pru_rcvd)(so, flags);

    if (delayed_copy_len) {
        error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
    if (free_list != NULL) {
        m_freem_list(free_list);
        postevent(so, 0, EV_OOB);

    if (orig_resid == uio_resid(uio) && orig_resid &&
        (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
        sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */

    if (flagsp != NULL) {

#ifdef MORE_LOCKING_DEBUG
    if (so->so_usecount <= 1) {
        panic("%s: release so=%p ref=%d on socket\n", __func__,
            so, so->so_usecount);

    if (delayed_copy_len) {
        error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
    if (free_list != NULL) {
        m_freem_list(free_list);

    sbunlock(&so->so_rcv, FALSE);    /* will unlock socket */

        KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
            VM_KERNEL_ADDRPERM(so),
            ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
            (int64_t)(orig_resid - uio_resid(uio)));
    KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
        so->so_rcv.sb_cc, 0, error);
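
/*
 * Illustrative userland view (not part of this file, never compiled): the
 * MSG_WAITALL and MSG_PEEK handling in soreceive() above is what makes the
 * following two calls behave differently -- the first loops in the kernel
 * until the full count arrives (or an error/signal occurs), the second
 * copies data without consuming it from the receive buffer.
 */
#if 0
#include <sys/socket.h>

static void
recv_modes_sketch(int s, void *buf, size_t len)
{
	(void)recv(s, buf, len, MSG_WAITALL);	/* short count only on error/signal */
	(void)recv(s, buf, len, MSG_PEEK);	/* data stays queued for the next read */
}
#endif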
 * Returns:	0			Success
 */
sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
    user_ssize_t *resid)
    socket_unlock(so, 0);

    while (m != NULL && error == 0) {
        error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
    m_freem_list(*free_list);

sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
    u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
    struct mbuf *ml, *m;

    for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
        ml = ml->m_nextpkt, i++) {
        auio = msgarray[i].uio;
        for (m = ml; m != NULL; m = m->m_next) {
            error = uiomove(mtod(m, caddr_t), m->m_len, auio);
    m_freem_list(*free_list);
soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
    struct mbuf *nextrecord;
    struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
    user_ssize_t len, pktlen, delayed_copy_len = 0;
    struct protosw *pr = so->so_proto;
    struct proc *p = current_proc();
    struct uio *auio = NULL;
    struct sockaddr **psa = NULL;
    struct mbuf **controlp = NULL;
    struct mbuf *free_others = NULL;

    KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
        so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);

    /*
     * - Only supports don't wait flags
     * - Only supports datagram sockets (could be extended to raw)
     * - Protocol must support packet chains
     * - The uio array is NULL (should we panic?)
     */
    if (flagsp != NULL) {
    if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
        printf("%s invalid flags 0x%x\n", __func__, flags);
    if (so->so_type != SOCK_DGRAM) {
    if (sosendallatonce(so) == 0) {
    if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
        error = EPROTONOSUPPORT;
    if (msgarray == NULL) {
        printf("%s uioarray is NULL\n", __func__);
        printf("%s uiocnt is 0\n", __func__);

    /*
     * Sanity check on the length passed by caller as we are making 'int'
     */
    resid = recv_msg_array_resid(msgarray, uiocnt);
    if (resid < 0 || resid > INT_MAX) {

    if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {

    so_update_last_owner_locked(so, p);
    so_update_policy(so);
    so_update_necp_policy(so, NULL, NULL);

    /*
     * If a recv attempt is made on a previously-accepted socket
     * that has been marked as inactive (disconnected), reject
     */
    if (so->so_flags & SOF_DEFUNCT) {
        struct sockbuf *sb = &so->so_rcv;

        SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
            __func__, proc_pid(p), proc_best_name(p),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so), error);
        /*
         * This socket should have been disconnected and flushed
         * prior to being returned from sodefunct(); there should
         * be no data on its receive list, so panic otherwise.
         */
        if (so->so_state & SS_DEFUNCT) {
            sb_empty_assert(sb, __func__);

    /*
     * The uio may be empty
     */
    if (npkts >= uiocnt) {

    /*
     * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
     * and if so just return to the caller.  This could happen when
     * soreceive() is called by a socket upcall function during the
     * time the socket is freed.  The socket buffer would have been
     * locked across the upcall, therefore we cannot put this thread
     * to sleep (else we will deadlock) or return EWOULDBLOCK (else
     * we may livelock), because the lock on the socket buffer will
     * only be released when the upcall routine returns to its caller.
     * Because the socket has been officially closed, there can be
     * no further read on it.
     */
    if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
        (SS_NOFDREF | SS_CANTRCVMORE)) {

    error = sblock(&so->so_rcv, SBLOCKWAIT(flags));

    m = so->so_rcv.sb_mb;
    /*
     * Block awaiting more datagram if needed
     */
    if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
        (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
        ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
        /*
         * Panic if we notice inconsistencies in the socket's
         * receive list; both sb_mb and sb_cc should correctly
         * reflect the contents of the list, otherwise we may
         * end up with false positives during select() or poll()
         * which could put the application in a bad state.
         */
        SB_MB_CHECK(&so->so_rcv);

            error = so->so_error;
            if ((flags & MSG_PEEK) == 0) {
        if (so->so_state & SS_CANTRCVMORE) {
        if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
            (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
        if ((so->so_state & SS_NBIO) ||
            (flags & (MSG_DONTWAIT | MSG_NBIO))) {
            error = EWOULDBLOCK;
        /*
         * Do not block if we got some data
         */
        if (free_list != NULL) {

        SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
        SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

        sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */
        error = sbwait(&so->so_rcv);

    OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
    SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
    SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");

    /*
     * Consume the current uio index as we have a datagram
     */
    auio = msgarray[npkts].uio;
    resid = uio_resid(auio);
    msgarray[npkts].which |= SOCK_MSG_DATA;
    psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
        &msgarray[npkts].psa : NULL;
    controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
        &msgarray[npkts].controlp : NULL;

    nextrecord = m->m_nextpkt;

    if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
        error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
        if (error == ERESTART) {
        } else if (error != 0) {

    if (m != NULL && m->m_type == MT_CONTROL) {
        error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);

    if (m->m_pkthdr.len == 0) {
        printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(m),

    /*
     * Loop to copy the mbufs of the current record
     * Support zero length packets
     */
    while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
        if (m->m_len == 0) {
            panic("%p m_len zero", m);
        if (m->m_type == 0) {
            panic("%p m_type zero", m);
        /*
         * Clip to the residual length
         */
        if (len > m->m_len) {
        /*
         * Copy the mbufs via the uio or delay the copy
         * Sockbuf must be consistent here (points to current mbuf,
         * it points to next record) when we drop priority;
         * we must note any additions to the sockbuf when we
         * block interrupts again.
         */
        if (len > 0 && can_delay == 0) {
            socket_unlock(so, 0);
            error = uiomove(mtod(m, caddr_t), (int)len, auio);
            delayed_copy_len += len;

        if (len == m->m_len) {
            /*
             * m was entirely copied
             */
            sbfree(&so->so_rcv, m);
            nextrecord = m->m_nextpkt;
            m->m_nextpkt = NULL;

            /*
             * Set the first packet to the head of the free list
             */
            if (free_list == NULL) {
            /*
             * Link current packet to tail of free list
             */
                if (free_tail != NULL) {
                    free_tail->m_nextpkt = m;
            /*
             * Link current mbuf to last mbuf of current packet
             */
            /*
             * Move next buf to head of socket buffer
             */
            so->so_rcv.sb_mb = m = ml->m_next;

                m->m_nextpkt = nextrecord;
                if (nextrecord == NULL) {
                    so->so_rcv.sb_lastrecord = m;
                so->so_rcv.sb_mb = nextrecord;
                SB_EMPTY_FIXUP(&so->so_rcv);
            SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
            SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
            /*
             * Stop the loop on partial copy
             */

#ifdef MORE_LOCKING_DEBUG
    if (so->so_usecount <= 1) {
        panic("%s: after big while so=%llx ref=%d on socket\n",
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);

    /*
     * Tell the caller we made a partial copy
     */
        if (so->so_options & SO_DONTTRUNC) {
            /*
             * Copyout first the freelist then the partial mbuf
             */
            socket_unlock(so, 0);
            if (delayed_copy_len) {
                error = sodelayed_copy_list(so, msgarray,
                    uiocnt, &free_list, &delayed_copy_len);
                error = uiomove(mtod(m, caddr_t), (int)len,
            so->so_rcv.sb_cc -= len;
            flags |= MSG_RCVMORE;
            (void) sbdroprecord(&so->so_rcv);
            nextrecord = so->so_rcv.sb_mb;

        so->so_rcv.sb_mb = nextrecord;
        /*
         * First part is an inline SB_EMPTY_FIXUP().  Second
         * part makes sure sb_lastrecord is up-to-date if
         * there is still data in the socket buffer.
         */
        if (so->so_rcv.sb_mb == NULL) {
            so->so_rcv.sb_mbtail = NULL;
            so->so_rcv.sb_lastrecord = NULL;
        } else if (nextrecord->m_nextpkt == NULL) {
            so->so_rcv.sb_lastrecord = nextrecord;
        SB_MB_CHECK(&so->so_rcv);
    SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
    SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");

    /*
     * We can continue to the next packet as long as:
     * - We haven't exhausted the uio array
     * - There was no error
     * - A packet was not truncated
     * - We can still receive more data
     */
    if (npkts < uiocnt && error == 0 &&
        (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
        (so->so_state & SS_CANTRCVMORE) == 0) {
        sbunlock(&so->so_rcv, TRUE);    /* keep socket locked */

    if (flagsp != NULL) {

    /*
     * pru_rcvd may cause more data to be received if the socket lock
     * is dropped so we set MSG_HAVEMORE now based on what we know.
     * That way the caller won't be surprised if it receives less data
     */
    if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
        flags |= MSG_HAVEMORE;
    if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
        (*pr->pr_usrreqs->pru_rcvd)(so, flags);

    sbunlock(&so->so_rcv, FALSE);    /* will unlock socket */
    socket_unlock(so, 1);

    if (delayed_copy_len) {
        error = sodelayed_copy_list(so, msgarray, uiocnt,
            &free_list, &delayed_copy_len);
    /*
     * Amortize the cost of freeing the mbufs
     */
    if (free_list != NULL) {
        m_freem_list(free_list);
    if (free_others != NULL) {
        m_freem_list(free_others);

    KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
 * Returns:	0			Success
 *	<pru_shutdown>:EINVAL
 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:EHOSTUNREACH[TCP]
 *	<pru_shutdown>:ENETUNREACH[TCP]
 *	<pru_shutdown>:ENETDOWN[TCP]
 *	<pru_shutdown>:ENOMEM[TCP]
 *	<pru_shutdown>:EACCES[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_shutdown>:??? [other protocol families]
 */
soshutdown(struct socket *so, int how)
    KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);

        (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
            error = soshutdownlock(so, how);
        socket_unlock(so, 1);

    KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);

soshutdownlock_final(struct socket *so, int how)
    struct protosw *pr = so->so_proto;

    sflt_notify(so, sock_evt_shutdown, &how);

    if (how != SHUT_WR) {
        if ((so->so_state & SS_CANTRCVMORE) != 0) {
            /* read already shut down */
        postevent(so, 0, EV_RCLOSED);
    if (how != SHUT_RD) {
        if ((so->so_state & SS_CANTSENDMORE) != 0) {
            /* write already shut down */
        error = (*pr->pr_usrreqs->pru_shutdown)(so);
        postevent(so, 0, EV_WCLOSED);

    KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);

soshutdownlock(struct socket *so, int how)
    /*
     * A content filter may delay the actual shutdown until it
     * has processed the pending data
     */
    if (so->so_flags & SOF_CONTENT_FILTER) {
        error = cfil_sock_shutdown(so, &how);
        if (error == EJUSTRETURN) {
        } else if (error != 0) {
#endif /* CONTENT_FILTER */

    error = soshutdownlock_final(so, how);
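
/*
 * Illustrative userland view (not part of this file, never compiled):
 * soshutdownlock_final() is what runs under shutdown(2).  A typical
 * half-close stops the send side first and then drains whatever the peer
 * still has to say.  The helper name is hypothetical.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void
half_close_sketch(int s, char *buf, size_t len)
{
	shutdown(s, SHUT_WR);		/* sets SS_CANTSENDMORE; peer sees EOF */
	while (read(s, buf, len) > 0) {
		;			/* read side stays open until EOF */
	}
}
#endif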
sowflush(struct socket *so)
    struct sockbuf *sb = &so->so_snd;

    /*
     * Obtain lock on the socket buffer (SB_LOCK).  This is required
     * to prevent the socket buffer from being unexpectedly altered
     * while it is used by another thread in socket send/receive.
     *
     * sblock() must not fail here, hence the assertion.
     */
    (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
    VERIFY(sb->sb_flags & SB_LOCK);

    sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
    sb->sb_flags |= SB_DROP;
    sb->sb_upcall = NULL;
    sb->sb_upcallarg = NULL;

    sbunlock(sb, TRUE);    /* keep socket locked */

    selthreadclear(&sb->sb_sel);
sorflush(struct socket *so)
    struct sockbuf *sb = &so->so_rcv;
    struct protosw *pr = so->so_proto;
    lck_mtx_t *mutex_held;

    /*
     * XXX: This code is currently commented out, because we may get here
     * as part of sofreelastref(), and at that time, pr_getlock() may no
     * longer be able to return us the lock; this will be fixed in future.
     */
    if (so->so_proto->pr_getlock != NULL) {
        mutex_held = (*so->so_proto->pr_getlock)(so, 0);
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

    sflt_notify(so, sock_evt_flush_read, NULL);

    /*
     * Obtain lock on the socket buffer (SB_LOCK).  This is required
     * to prevent the socket buffer from being unexpectedly altered
     * while it is used by another thread in socket send/receive.
     *
     * sblock() must not fail here, hence the assertion.
     */
    (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
    VERIFY(sb->sb_flags & SB_LOCK);

    /*
     * Copy only the relevant fields from "sb" to "asb" which we
     * need for sbrelease() to function.  In particular, skip
     * sb_sel as it contains the wait queue linkage, which would
     * wreak havoc if we were to issue selthreadclear() on "asb".
     * Make sure to not carry over SB_LOCK in "asb", as we need
     * to acquire it later as part of sbrelease().
     */
    bzero(&asb, sizeof(asb));
    asb.sb_cc = sb->sb_cc;
    asb.sb_hiwat = sb->sb_hiwat;
    asb.sb_mbcnt = sb->sb_mbcnt;
    asb.sb_mbmax = sb->sb_mbmax;
    asb.sb_ctl = sb->sb_ctl;
    asb.sb_lowat = sb->sb_lowat;
    asb.sb_mb = sb->sb_mb;
    asb.sb_mbtail = sb->sb_mbtail;
    asb.sb_lastrecord = sb->sb_lastrecord;
    asb.sb_so = sb->sb_so;
    asb.sb_flags = sb->sb_flags;
    asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
    asb.sb_flags |= SB_DROP;

    /*
     * Ideally we'd bzero() these and preserve the ones we need;
     * but to do that we'd need to shuffle things around in the
     * sockbuf, and we can't do it now because there are KEXTS
     * that are directly referring to the socket structure.
     *
     * Setting SB_DROP acts as a barrier to prevent further appends.
     * Clearing SB_SEL is done for selthreadclear() below.
     */
    sb->sb_mbtail = NULL;
    sb->sb_lastrecord = NULL;
    sb->sb_timeo.tv_sec = 0;
    sb->sb_timeo.tv_usec = 0;
    sb->sb_upcall = NULL;
    sb->sb_upcallarg = NULL;
    sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
    sb->sb_flags |= SB_DROP;

    sbunlock(sb, TRUE);    /* keep socket locked */

    /*
     * Note that selthreadclear() is called on the original "sb" and
     * not the local "asb" because of the way wait queue linkage is
     * implemented.  Given that selwakeup() may be triggered, SB_SEL
     * should no longer be set (cleared above.)
     */
    selthreadclear(&sb->sb_sel);

    if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
        (*pr->pr_domain->dom_dispose)(asb.sb_mb);
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Returns:	0			Success
 */
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
    /*
     * If the user gives us more than we wanted, we ignore it,
     * but if we don't get the minimum length the caller
     * wants, we return EINVAL.  On success, sopt->sopt_valsize
     * is set to however much we actually retrieved.
     */
    if ((valsize = sopt->sopt_valsize) < minlen) {
    if (valsize > len) {
        sopt->sopt_valsize = valsize = len;

    if (sopt->sopt_p != kernproc) {
        return copyin(sopt->sopt_val, buf, valsize);

    bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
/*
 * sooptcopyin_timeval
 *	Copy in a timeval value into tv_p, and take into account whether
 *	the calling process is 64-bit or 32-bit.  Moved the sanity checking
 *	code here so that we can verify the 64-bit tv_sec value before we lose
 *	the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
 */
static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
{
	int error;

	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval tv64;

		if (sopt->sopt_valsize < sizeof(tv64)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
			if (error != 0) {
				return error;
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
			    sizeof(tv64));
		}
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
			return EDOM;
		}

		tv_p->tv_sec = tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		struct user32_timeval tv32;

		if (sopt->sopt_valsize < sizeof(tv32)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
			if (error != 0) {
				return error;
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
			    sizeof(tv32));
		}

		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
			return EDOM;
		}

		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
	return 0;
}
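
/*
 * User-space sketch (not part of this kernel file's build): setting a
 * receive timeout, which arrives here through sooptcopyin_timeval().
 * A negative tv_sec or a tv_usec outside [0, 1000000) is rejected by
 * the validation above with EDOM.  Assumes the usual user-level
 * <sys/socket.h>/<sys/time.h> headers.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>
#include <stdio.h>

static int
set_recv_timeout(int s, long sec, long usec)
{
	struct timeval tv = { .tv_sec = sec, .tv_usec = usec };

	if (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) == -1) {
		perror("setsockopt(SO_RCVTIMEO)");	/* EDOM for a bad timeval */
		return -1;
	}
	return 0;
}
#endif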
int
soopt_cred_check(struct socket *so, int priv, boolean_t allow_root)
{
	kauth_cred_t cred = NULL;
	proc_t ep = PROC_NULL;
	uid_t uid;
	int error = 0;

	if (so->so_flags & SOF_DELEGATED) {
		ep = proc_find(so->e_pid);
		if (ep) {
			cred = kauth_cred_proc_ref(ep);
		}
	}

	uid = kauth_cred_getuid(cred ? cred : so->so_cred);

	/* uid is 0 for root */
	if (uid != 0 || !allow_root) {
		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
	}

	if (cred) {
		kauth_cred_unref(&cred);
	}
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
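
/*
 * Illustrative sketch (not part of the build): the pattern used by
 * privileged socket options such as SO_AWDL_UNRESTRICTED below -- copy in
 * an int and, only when enabling, gate the change on soopt_cred_check()
 * so that delegated sockets are checked against the delegate's credential.
 * PRIV_NET_EXAMPLE_OPTION and example_enable()/example_disable() are
 * hypothetical names.
 */
#if 0
static int
example_privileged_setopt(struct socket *so, struct sockopt *sopt)
{
	int error, optval;

	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error != 0) {
		return error;
	}
	if (optval != 0) {
		/* enabling requires the privilege; root is not exempted */
		error = soopt_cred_check(so, PRIV_NET_EXAMPLE_OPTION, false);
		if (error == 0) {
			example_enable(so);
		}
	} else {
		example_disable(so);
	}
	return error;
}
#endif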
4940 * Returns: 0 Success
4945 * sooptcopyin:EINVAL
4946 * sooptcopyin:EFAULT
4947 * sooptcopyin_timeval:EINVAL
4948 * sooptcopyin_timeval:EFAULT
4949 * sooptcopyin_timeval:EDOM
4950 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 * <pr_ctloutput>:???
4952 * sflt_attach_private:??? [whatever a filter author chooses]
4953 * <sf_setoption>:??? [whatever a filter author chooses]
4955 * Notes: Other <pru_listen> returns depend on the protocol family; all
4956 * <sf_listen> returns depend on what the filter author causes
4957 * their filter to return.
4960 sosetoptlock(struct socket
*so
, struct sockopt
*sopt
, int dolock
)
4965 #if CONFIG_MACF_SOCKET
4967 #endif /* MAC_SOCKET */
4969 if (sopt
->sopt_dir
!= SOPT_SET
) {
4970 sopt
->sopt_dir
= SOPT_SET
;
4977 if ((so
->so_state
& (SS_CANTRCVMORE
| SS_CANTSENDMORE
)) ==
4978 (SS_CANTRCVMORE
| SS_CANTSENDMORE
) &&
4979 (so
->so_flags
& SOF_NPX_SETOPTSHUT
) == 0) {
4980 /* the socket has been shutdown, no more sockopt's */
4985 error
= sflt_setsockopt(so
, sopt
);
4987 if (error
== EJUSTRETURN
) {
4993 if (sopt
->sopt_level
!= SOL_SOCKET
) {
4994 if (so
->so_proto
!= NULL
&&
4995 so
->so_proto
->pr_ctloutput
!= NULL
) {
4996 error
= (*so
->so_proto
->pr_ctloutput
)(so
, sopt
);
4999 error
= ENOPROTOOPT
;
5002 * Allow socket-level (SOL_SOCKET) options to be filtered by
5003 * the protocol layer, if needed. A zero value returned from
5004 * the handler means use default socket-level processing as
5005 * done by the rest of this routine. Otherwise, any other
5006 * return value indicates that the option is unsupported.
5008 if (so
->so_proto
!= NULL
&& (error
= so
->so_proto
->pr_usrreqs
->
5009 pru_socheckopt(so
, sopt
)) != 0) {
5014 switch (sopt
->sopt_name
) {
5017 error
= sooptcopyin(sopt
, &l
, sizeof(l
), sizeof(l
));
5022 so
->so_linger
= (sopt
->sopt_name
== SO_LINGER
) ?
5023 l
.l_linger
: l
.l_linger
* hz
;
5024 if (l
.l_onoff
!= 0) {
5025 so
->so_options
|= SO_LINGER
;
5027 so
->so_options
&= ~SO_LINGER
;
5034 case SO_USELOOPBACK
:
5040 case SO_TIMESTAMP_MONOTONIC
:
5041 case SO_TIMESTAMP_CONTINUOUS
:
5044 case SO_WANTOOBFLAG
:
5045 case SO_NOWAKEFROMSLEEP
:
5046 case SO_NOAPNFALLBK
:
5047 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5053 so
->so_options
|= sopt
->sopt_name
;
5055 so
->so_options
&= ~sopt
->sopt_name
;
5063 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5070 * Values < 1 make no sense for any of these
5071 * options, so disallow them.
5078 switch (sopt
->sopt_name
) {
5081 struct sockbuf
*sb
=
5082 (sopt
->sopt_name
== SO_SNDBUF
) ?
5083 &so
->so_snd
: &so
->so_rcv
;
5084 if (sbreserve(sb
, (u_int32_t
)optval
) == 0) {
5088 sb
->sb_flags
|= SB_USRSIZE
;
5089 sb
->sb_flags
&= ~SB_AUTOSIZE
;
5090 sb
->sb_idealsize
= (u_int32_t
)optval
;
5094 * Make sure the low-water is never greater than
5098 int space
= sbspace(&so
->so_snd
);
5099 u_int32_t hiwat
= so
->so_snd
.sb_hiwat
;
5101 if (so
->so_snd
.sb_flags
& SB_UNIX
) {
5103 (struct unpcb
*)(so
->so_pcb
);
5105 unp
->unp_conn
!= NULL
) {
5106 hiwat
+= unp
->unp_conn
->unp_cc
;
5110 so
->so_snd
.sb_lowat
=
5114 if (space
>= so
->so_snd
.sb_lowat
) {
5121 so
->so_rcv
.sb_lowat
=
5122 (optval
> so
->so_rcv
.sb_hiwat
) ?
5123 so
->so_rcv
.sb_hiwat
: optval
;
5124 data_len
= so
->so_rcv
.sb_cc
5125 - so
->so_rcv
.sb_ctl
;
5126 if (data_len
>= so
->so_rcv
.sb_lowat
) {
5136 error
= sooptcopyin_timeval(sopt
, &tv
);
5141 switch (sopt
->sopt_name
) {
5143 so
->so_snd
.sb_timeo
= tv
;
5146 so
->so_rcv
.sb_timeo
= tv
;
5154 error
= sooptcopyin(sopt
, &nke
, sizeof(nke
),
5160 error
= sflt_attach_internal(so
, nke
.nke_handle
);
5165 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5171 so
->so_flags
|= SOF_NOSIGPIPE
;
5173 so
->so_flags
&= ~SOF_NOSIGPIPE
;
5178 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5184 so
->so_flags
|= SOF_NOADDRAVAIL
;
5186 so
->so_flags
&= ~SOF_NOADDRAVAIL
;
5190 case SO_REUSESHAREUID
:
5191 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5197 so
->so_flags
|= SOF_REUSESHAREUID
;
5199 so
->so_flags
&= ~SOF_REUSESHAREUID
;
5203 case SO_NOTIFYCONFLICT
:
5204 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5208 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5214 so
->so_flags
|= SOF_NOTIFYCONFLICT
;
5216 so
->so_flags
&= ~SOF_NOTIFYCONFLICT
;
5220 case SO_RESTRICTIONS
:
5221 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5227 error
= so_set_restrictions(so
, optval
);
5230 case SO_AWDL_UNRESTRICTED
:
5231 if (SOCK_DOM(so
) != PF_INET
&&
5232 SOCK_DOM(so
) != PF_INET6
) {
5236 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5242 error
= soopt_cred_check(so
,
5243 PRIV_NET_RESTRICTED_AWDL
, false);
5245 inp_set_awdl_unrestricted(
5249 inp_clear_awdl_unrestricted(sotoinpcb(so
));
5252 case SO_INTCOPROC_ALLOW
:
5253 if (SOCK_DOM(so
) != PF_INET6
) {
5257 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5263 inp_get_intcoproc_allowed(sotoinpcb(so
)) == FALSE
) {
5264 error
= soopt_cred_check(so
,
5265 PRIV_NET_RESTRICTED_INTCOPROC
, false);
5267 inp_set_intcoproc_allowed(
5270 } else if (optval
== 0) {
5271 inp_clear_intcoproc_allowed(sotoinpcb(so
));
5276 #if CONFIG_MACF_SOCKET
5277 if ((error
= sooptcopyin(sopt
, &extmac
, sizeof(extmac
),
5278 sizeof(extmac
))) != 0) {
5282 error
= mac_setsockopt_label(proc_ucred(sopt
->sopt_p
),
5286 #endif /* MAC_SOCKET */
5289 case SO_UPCALLCLOSEWAIT
:
5290 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5296 so
->so_flags
|= SOF_UPCALLCLOSEWAIT
;
5298 so
->so_flags
&= ~SOF_UPCALLCLOSEWAIT
;
5303 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5309 so
->so_flags
|= SOF_BINDRANDOMPORT
;
5311 so
->so_flags
&= ~SOF_BINDRANDOMPORT
;
5315 case SO_NP_EXTENSIONS
: {
5316 struct so_np_extensions sonpx
;
5318 error
= sooptcopyin(sopt
, &sonpx
, sizeof(sonpx
),
5323 if (sonpx
.npx_mask
& ~SONPX_MASK_VALID
) {
5328 * Only one bit defined for now
5330 if ((sonpx
.npx_mask
& SONPX_SETOPTSHUT
)) {
5331 if ((sonpx
.npx_flags
& SONPX_SETOPTSHUT
)) {
5332 so
->so_flags
|= SOF_NPX_SETOPTSHUT
;
5334 so
->so_flags
&= ~SOF_NPX_SETOPTSHUT
;
5340 case SO_TRAFFIC_CLASS
: {
5341 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5346 if (optval
>= SO_TC_NET_SERVICE_OFFSET
) {
5347 int netsvc
= optval
- SO_TC_NET_SERVICE_OFFSET
;
5348 error
= so_set_net_service_type(so
, netsvc
);
5351 error
= so_set_traffic_class(so
, optval
);
5355 so
->so_flags1
&= ~SOF1_TC_NET_SERV_TYPE
;
5356 so
->so_netsvctype
= _NET_SERVICE_TYPE_UNSPEC
;
5360 case SO_RECV_TRAFFIC_CLASS
: {
5361 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5367 so
->so_flags
&= ~SOF_RECV_TRAFFIC_CLASS
;
5369 so
->so_flags
|= SOF_RECV_TRAFFIC_CLASS
;
5374 #if (DEVELOPMENT || DEBUG)
5375 case SO_TRAFFIC_CLASS_DBG
: {
5376 struct so_tcdbg so_tcdbg
;
5378 error
= sooptcopyin(sopt
, &so_tcdbg
,
5379 sizeof(struct so_tcdbg
), sizeof(struct so_tcdbg
));
5383 error
= so_set_tcdbg(so
, &so_tcdbg
);
5389 #endif /* (DEVELOPMENT || DEBUG) */
5391 case SO_PRIVILEGED_TRAFFIC_CLASS
:
5392 error
= priv_check_cred(kauth_cred_get(),
5393 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS
, 0);
5397 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5403 so
->so_flags
&= ~SOF_PRIVILEGED_TRAFFIC_CLASS
;
5405 so
->so_flags
|= SOF_PRIVILEGED_TRAFFIC_CLASS
;
5409 #if (DEVELOPMENT || DEBUG)
5411 error
= sosetdefunct(current_proc(), so
, 0, FALSE
);
5413 error
= sodefunct(current_proc(), so
, 0);
5417 #endif /* (DEVELOPMENT || DEBUG) */
5420 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5422 if (error
!= 0 || (so
->so_flags
& SOF_DEFUNCT
)) {
5429 * Any process can set SO_DEFUNCTOK (clear
5430 * SOF_NODEFUNCT), but only root can clear
5431 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5434 kauth_cred_issuser(kauth_cred_get()) == 0) {
5439 so
->so_flags
&= ~SOF_NODEFUNCT
;
5441 so
->so_flags
|= SOF_NODEFUNCT
;
5444 if (SOCK_DOM(so
) == PF_INET
||
5445 SOCK_DOM(so
) == PF_INET6
) {
5446 char s
[MAX_IPv6_STR_LEN
];
5447 char d
[MAX_IPv6_STR_LEN
];
5448 struct inpcb
*inp
= sotoinpcb(so
);
5450 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5451 "[%s %s:%d -> %s:%d] is now marked "
5452 "as %seligible for "
5453 "defunct\n", __func__
, proc_selfpid(),
5454 proc_best_name(current_proc()),
5455 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
5456 (SOCK_TYPE(so
) == SOCK_STREAM
) ?
5457 "TCP" : "UDP", inet_ntop(SOCK_DOM(so
),
5458 ((SOCK_DOM(so
) == PF_INET
) ?
5459 (void *)&inp
->inp_laddr
.s_addr
:
5460 (void *)&inp
->in6p_laddr
), s
, sizeof(s
)),
5461 ntohs(inp
->in6p_lport
),
5462 inet_ntop(SOCK_DOM(so
),
5463 (SOCK_DOM(so
) == PF_INET
) ?
5464 (void *)&inp
->inp_faddr
.s_addr
:
5465 (void *)&inp
->in6p_faddr
, d
, sizeof(d
)),
5466 ntohs(inp
->in6p_fport
),
5467 (so
->so_flags
& SOF_NODEFUNCT
) ?
5470 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5471 "is now marked as %seligible for "
5473 __func__
, proc_selfpid(),
5474 proc_best_name(current_proc()),
5475 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
5476 SOCK_DOM(so
), SOCK_TYPE(so
),
5477 (so
->so_flags
& SOF_NODEFUNCT
) ?
5483 /* This option is not settable */
5487 case SO_OPPORTUNISTIC
:
5488 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5491 error
= so_set_opportunistic(so
, optval
);
5496 /* This option is handled by lower layer(s) */
5501 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5504 error
= so_set_recv_anyif(so
, optval
);
5508 case SO_TRAFFIC_MGT_BACKGROUND
: {
5509 /* This option is handled by lower layer(s) */
5515 case SO_FLOW_DIVERT_TOKEN
:
5516 error
= flow_divert_token_set(so
, sopt
);
5518 #endif /* FLOW_DIVERT */
5522 if ((error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5523 sizeof(optval
))) != 0) {
5527 error
= so_set_effective_pid(so
, optval
, sopt
->sopt_p
);
5530 case SO_DELEGATED_UUID
: {
5533 if ((error
= sooptcopyin(sopt
, &euuid
, sizeof(euuid
),
5534 sizeof(euuid
))) != 0) {
5538 error
= so_set_effective_uuid(so
, euuid
, sopt
->sopt_p
);
5543 case SO_NECP_ATTRIBUTES
:
5544 error
= necp_set_socket_attributes(so
, sopt
);
5547 case SO_NECP_CLIENTUUID
:
5548 if (SOCK_DOM(so
) == PF_MULTIPATH
) {
5549 /* Handled by MPTCP itself */
5553 if (SOCK_DOM(so
) != PF_INET
&& SOCK_DOM(so
) != PF_INET6
) {
5558 struct inpcb
*inp
= sotoinpcb(so
);
5559 if (!uuid_is_null(inp
->necp_client_uuid
)) {
5560 // Clear out the old client UUID if present
5561 necp_inpcb_remove_cb(inp
);
5564 error
= sooptcopyin(sopt
, &inp
->necp_client_uuid
,
5565 sizeof(uuid_t
), sizeof(uuid_t
));
5570 if (uuid_is_null(inp
->necp_client_uuid
)) {
5575 error
= necp_client_register_socket_flow(so
->last_pid
,
5576 inp
->necp_client_uuid
, inp
);
5578 uuid_clear(inp
->necp_client_uuid
);
5582 if (inp
->inp_lport
!= 0) {
5583 // There is bound local port, so this is not
5584 // a fresh socket. Assign to the client.
5585 necp_client_assign_from_socket(so
->last_pid
, inp
->necp_client_uuid
, inp
);
5591 case SO_EXTENDED_BK_IDLE
:
5592 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5595 error
= so_set_extended_bk_idle(so
, optval
);
5599 case SO_MARK_CELLFALLBACK
:
5600 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5610 so
->so_flags1
&= ~SOF1_CELLFALLBACK
;
5612 so
->so_flags1
|= SOF1_CELLFALLBACK
;
5616 case SO_NET_SERVICE_TYPE
: {
5617 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5622 error
= so_set_net_service_type(so
, optval
);
5626 case SO_QOSMARKING_POLICY_OVERRIDE
:
5627 error
= priv_check_cred(kauth_cred_get(),
5628 PRIV_NET_QOSMARKING_POLICY_OVERRIDE
, 0);
5632 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5638 so
->so_flags1
&= ~SOF1_QOSMARKING_POLICY_OVERRIDE
;
5640 so
->so_flags1
|= SOF1_QOSMARKING_POLICY_OVERRIDE
;
5645 error
= ENOPROTOOPT
;
5648 if (error
== 0 && so
->so_proto
!= NULL
&&
5649 so
->so_proto
->pr_ctloutput
!= NULL
) {
5650 (void) so
->so_proto
->pr_ctloutput(so
, sopt
);
5655 socket_unlock(so
, 1);
/* Helper routines for getsockopt */
int
sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
{
	int error = 0;
	size_t valsize;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer
	 * must be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc) {
			error = copyout(buf, sopt->sopt_val, valsize);
		} else {
			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
		}
	}
	return error;
}
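
/*
 * Illustrative sketch (not part of the build): the getsockopt side of the
 * hypothetical MYPROTO_OPT_FOO handler, returning an int through
 * sooptcopyout().  As documented above, the value is truncated if the
 * caller's buffer is smaller than sizeof(int), and sopt_valsize is set to
 * the number of bytes actually copied.
 */
#if 0
static int
myproto_ctloutput_get(struct socket *so, struct sockopt *sopt)
{
	int optval;

	switch (sopt->sopt_name) {
	case MYPROTO_OPT_FOO:
		optval = ((struct foo_pcb *)so->so_pcb)->foop_flag;
		return sooptcopyout(sopt, &optval, sizeof(optval));
	default:
		return ENOPROTOOPT;
	}
}
#endif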
static int
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
{
	int error = 0;
	size_t len;
	struct user64_timeval tv64 = {};
	struct user32_timeval tv32 = {};
	const void *val;
	size_t valsize;

	if (proc_is64bit(sopt->sopt_p)) {
		len = sizeof(tv64);
		tv64.tv_sec = tv_p->tv_sec;
		tv64.tv_usec = tv_p->tv_usec;
		val = &tv64;
	} else {
		len = sizeof(tv32);
		tv32.tv_sec = tv_p->tv_sec;
		tv32.tv_usec = tv_p->tv_usec;
		val = &tv32;
	}
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc) {
			error = copyout(val, sopt->sopt_val, valsize);
		} else {
			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
		}
	}
	return error;
}
5727 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5728 * <pr_ctloutput>:???
5729 * <sf_getoption>:???
5732 sogetoptlock(struct socket
*so
, struct sockopt
*sopt
, int dolock
)
5737 #if CONFIG_MACF_SOCKET
5739 #endif /* MAC_SOCKET */
5741 if (sopt
->sopt_dir
!= SOPT_GET
) {
5742 sopt
->sopt_dir
= SOPT_GET
;
5749 error
= sflt_getsockopt(so
, sopt
);
5751 if (error
== EJUSTRETURN
) {
5757 if (sopt
->sopt_level
!= SOL_SOCKET
) {
5758 if (so
->so_proto
!= NULL
&&
5759 so
->so_proto
->pr_ctloutput
!= NULL
) {
5760 error
= (*so
->so_proto
->pr_ctloutput
)(so
, sopt
);
5763 error
= ENOPROTOOPT
;
5766 * Allow socket-level (SOL_SOCKET) options to be filtered by
5767 * the protocol layer, if needed. A zero value returned from
5768 * the handler means use default socket-level processing as
5769 * done by the rest of this routine. Otherwise, any other
5770 * return value indicates that the option is unsupported.
5772 if (so
->so_proto
!= NULL
&& (error
= so
->so_proto
->pr_usrreqs
->
5773 pru_socheckopt(so
, sopt
)) != 0) {
5778 switch (sopt
->sopt_name
) {
5781 l
.l_onoff
= ((so
->so_options
& SO_LINGER
) ? 1 : 0);
5782 l
.l_linger
= (sopt
->sopt_name
== SO_LINGER
) ?
5783 so
->so_linger
: so
->so_linger
/ hz
;
5784 error
= sooptcopyout(sopt
, &l
, sizeof(l
));
5787 case SO_USELOOPBACK
:
5796 case SO_TIMESTAMP_MONOTONIC
:
5797 case SO_TIMESTAMP_CONTINUOUS
:
5800 case SO_WANTOOBFLAG
:
5801 case SO_NOWAKEFROMSLEEP
:
5802 case SO_NOAPNFALLBK
:
5803 optval
= so
->so_options
& sopt
->sopt_name
;
5805 error
= sooptcopyout(sopt
, &optval
, sizeof(optval
));
5809 optval
= so
->so_type
;
5813 if (so
->so_proto
->pr_flags
& PR_ATOMIC
) {
5818 m1
= so
->so_rcv
.sb_mb
;
5819 while (m1
!= NULL
) {
5820 if (m1
->m_type
== MT_DATA
||
5821 m1
->m_type
== MT_HEADER
||
5822 m1
->m_type
== MT_OOBDATA
) {
5823 pkt_total
+= m1
->m_len
;
5829 optval
= so
->so_rcv
.sb_cc
- so
->so_rcv
.sb_ctl
;
5834 if (so
->so_proto
->pr_flags
& PR_ATOMIC
) {
5838 m1
= so
->so_rcv
.sb_mb
;
5839 while (m1
!= NULL
) {
5840 if (m1
->m_type
== MT_DATA
||
5841 m1
->m_type
== MT_HEADER
||
5842 m1
->m_type
== MT_OOBDATA
) {
5855 optval
= so
->so_snd
.sb_cc
;
5859 optval
= so
->so_error
;
5864 u_int32_t hiwat
= so
->so_snd
.sb_hiwat
;
5866 if (so
->so_snd
.sb_flags
& SB_UNIX
) {
5868 (struct unpcb
*)(so
->so_pcb
);
5869 if (unp
!= NULL
&& unp
->unp_conn
!= NULL
) {
5870 hiwat
+= unp
->unp_conn
->unp_cc
;
5878 optval
= so
->so_rcv
.sb_hiwat
;
5882 optval
= so
->so_snd
.sb_lowat
;
5886 optval
= so
->so_rcv
.sb_lowat
;
5891 tv
= (sopt
->sopt_name
== SO_SNDTIMEO
?
5892 so
->so_snd
.sb_timeo
: so
->so_rcv
.sb_timeo
);
5894 error
= sooptcopyout_timeval(sopt
, &tv
);
5898 optval
= (so
->so_flags
& SOF_NOSIGPIPE
);
5902 optval
= (so
->so_flags
& SOF_NOADDRAVAIL
);
5905 case SO_REUSESHAREUID
:
5906 optval
= (so
->so_flags
& SOF_REUSESHAREUID
);
5910 case SO_NOTIFYCONFLICT
:
5911 optval
= (so
->so_flags
& SOF_NOTIFYCONFLICT
);
5914 case SO_RESTRICTIONS
:
5915 optval
= so_get_restrictions(so
);
5918 case SO_AWDL_UNRESTRICTED
:
5919 if (SOCK_DOM(so
) == PF_INET
||
5920 SOCK_DOM(so
) == PF_INET6
) {
5921 optval
= inp_get_awdl_unrestricted(
5929 case SO_INTCOPROC_ALLOW
:
5930 if (SOCK_DOM(so
) == PF_INET6
) {
5931 optval
= inp_get_intcoproc_allowed(
5940 #if CONFIG_MACF_SOCKET
5941 if ((error
= sooptcopyin(sopt
, &extmac
, sizeof(extmac
),
5942 sizeof(extmac
))) != 0 ||
5943 (error
= mac_socket_label_get(proc_ucred(
5944 sopt
->sopt_p
), so
, &extmac
)) != 0) {
5948 error
= sooptcopyout(sopt
, &extmac
, sizeof(extmac
));
5951 #endif /* MAC_SOCKET */
5955 #if CONFIG_MACF_SOCKET
5956 if ((error
= sooptcopyin(sopt
, &extmac
, sizeof(extmac
),
5957 sizeof(extmac
))) != 0 ||
5958 (error
= mac_socketpeer_label_get(proc_ucred(
5959 sopt
->sopt_p
), so
, &extmac
)) != 0) {
5963 error
= sooptcopyout(sopt
, &extmac
, sizeof(extmac
));
5966 #endif /* MAC_SOCKET */
5969 #ifdef __APPLE_API_PRIVATE
5970 case SO_UPCALLCLOSEWAIT
:
5971 optval
= (so
->so_flags
& SOF_UPCALLCLOSEWAIT
);
5975 optval
= (so
->so_flags
& SOF_BINDRANDOMPORT
);
5978 case SO_NP_EXTENSIONS
: {
5979 struct so_np_extensions sonpx
= {};
5981 sonpx
.npx_flags
= (so
->so_flags
& SOF_NPX_SETOPTSHUT
) ?
5982 SONPX_SETOPTSHUT
: 0;
5983 sonpx
.npx_mask
= SONPX_MASK_VALID
;
5985 error
= sooptcopyout(sopt
, &sonpx
,
5986 sizeof(struct so_np_extensions
));
5990 case SO_TRAFFIC_CLASS
:
5991 optval
= so
->so_traffic_class
;
5994 case SO_RECV_TRAFFIC_CLASS
:
5995 optval
= (so
->so_flags
& SOF_RECV_TRAFFIC_CLASS
);
5998 case SO_TRAFFIC_CLASS_STATS
:
5999 error
= sooptcopyout(sopt
, &so
->so_tc_stats
,
6000 sizeof(so
->so_tc_stats
));
6003 #if (DEVELOPMENT || DEBUG)
6004 case SO_TRAFFIC_CLASS_DBG
:
6005 error
= sogetopt_tcdbg(so
, sopt
);
6007 #endif /* (DEVELOPMENT || DEBUG) */
6009 case SO_PRIVILEGED_TRAFFIC_CLASS
:
6010 optval
= (so
->so_flags
& SOF_PRIVILEGED_TRAFFIC_CLASS
);
6014 optval
= !(so
->so_flags
& SOF_NODEFUNCT
);
6018 optval
= (so
->so_flags
& SOF_DEFUNCT
);
6021 case SO_OPPORTUNISTIC
:
6022 optval
= so_get_opportunistic(so
);
6026 /* This option is not gettable */
6031 optval
= so_get_recv_anyif(so
);
6034 case SO_TRAFFIC_MGT_BACKGROUND
:
6035 /* This option is handled by lower layer(s) */
6036 if (so
->so_proto
!= NULL
&&
6037 so
->so_proto
->pr_ctloutput
!= NULL
) {
6038 (void) so
->so_proto
->pr_ctloutput(so
, sopt
);
6043 case SO_FLOW_DIVERT_TOKEN
:
6044 error
= flow_divert_token_get(so
, sopt
);
6046 #endif /* FLOW_DIVERT */
6049 case SO_NECP_ATTRIBUTES
:
6050 error
= necp_get_socket_attributes(so
, sopt
);
6053 case SO_NECP_CLIENTUUID
:
6057 if (SOCK_DOM(so
) == PF_MULTIPATH
) {
6058 ncu
= &mpsotomppcb(so
)->necp_client_uuid
;
6059 } else if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
) {
6060 ncu
= &sotoinpcb(so
)->necp_client_uuid
;
6066 error
= sooptcopyout(sopt
, ncu
, sizeof(uuid_t
));
6072 case SO_CFIL_SOCK_ID
: {
6073 cfil_sock_id_t sock_id
;
6075 sock_id
= cfil_sock_id_from_socket(so
);
6077 error
= sooptcopyout(sopt
, &sock_id
,
6078 sizeof(cfil_sock_id_t
));
6081 #endif /* CONTENT_FILTER */
6083 case SO_EXTENDED_BK_IDLE
:
6084 optval
= (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_WANTED
);
6086 case SO_MARK_CELLFALLBACK
:
6087 optval
= ((so
->so_flags1
& SOF1_CELLFALLBACK
) > 0)
6090 case SO_NET_SERVICE_TYPE
: {
6091 if ((so
->so_flags1
& SOF1_TC_NET_SERV_TYPE
)) {
6092 optval
= so
->so_netsvctype
;
6094 optval
= NET_SERVICE_TYPE_BE
;
6098 case SO_NETSVC_MARKING_LEVEL
:
6099 optval
= so_get_netsvc_marking_level(so
);
6103 error
= ENOPROTOOPT
;
6109 socket_unlock(so
, 1);
 * The size limits on our soopt_getm are different from those on FreeBSD.
 * We limit the size of options to MCLBYTES. This will have to change
 * if we need to define options that need more space than MCLBYTES.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;
	int how;

	if (sopt_size <= 0 || sopt_size > MCLBYTES) {
		return EMSGSIZE;
	}

	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (m == NULL) {
		return ENOBUFS;
	}
	if (sopt_size > MLEN) {
		MCLGET(m, how);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return ENOBUFS;
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return ENOBUFS;
		}
		if (sopt_size > MLEN) {
			MCLGET(m, how);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(*mp);
				m_freem(m);
				return ENOBUFS;
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return 0;
}
/* copyin sopt data into mbuf chain */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return 0;
	}
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return error;
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
			    mtod(m, char *), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		m = m->m_next;
	}
	/* enough space should have been allocated at ip6_sooptmcopyin() */
	if (m != NULL) {
		panic("soopt_mcopyin");
		/* NOTREACHED */
	}
	return 0;
}
/* copyout mbuf chain data into soopt */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return 0;
	}
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return error;
			}
		} else {
			bcopy(mtod(m, char *),
			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* a large enough soopt buffer should be given from user-land */
		m_freem(m0);
		return EINVAL;
	}
	sopt->sopt_valsize = valsize;
	return 0;
}
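
/*
 * Illustrative sketch (not part of the build): how the mbuf-based helpers
 * above are typically combined for variable-length options (the IPv6
 * option code uses this pattern).  myproto_process_opt() is hypothetical;
 * the chain is sized from sopt->sopt_valsize by soopt_getm() and filled
 * from user (or kernel) space by soopt_mcopyin(), which frees the chain
 * itself if the copyin fails.
 */
#if 0
static int
example_setopt_mbuf(struct socket *so, struct sockopt *sopt)
{
	struct mbuf *m = NULL;
	int error;

	error = soopt_getm(sopt, &m);		/* allocate the chain */
	if (error != 0) {
		return error;
	}
	error = soopt_mcopyin(sopt, m);		/* fill it; frees m on error */
	if (error != 0) {
		return error;
	}
	error = myproto_process_opt(so, m);	/* hypothetical consumer */
	m_freem(m);
	return error;
}
#endif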
void
sohasoutofband(struct socket *so)
{
	if (so->so_pgid < 0) {
		gsignal(-so->so_pgid, SIGURG);
	} else if (so->so_pgid > 0) {
		proc_signal(so->so_pgid, SIGURG);
	}
	selwakeup(&so->so_rcv.sb_sel);
	if (so->so_rcv.sb_flags & SB_KNOTE) {
		KNOTE(&so->so_rcv.sb_sel.si_note,
		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
	}
}
int
sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
{
#pragma unused(cred)
	struct proc *p = current_proc();
	int revents = 0;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

	if (events & (POLLIN | POLLRDNORM)) {
		if (soreadable(so)) {
			revents |= events & (POLLIN | POLLRDNORM);
		}
	}

	if (events & (POLLOUT | POLLWRNORM)) {
		if (sowriteable(so)) {
			revents |= events & (POLLOUT | POLLWRNORM);
		}
	}

	if (events & (POLLPRI | POLLRDBAND)) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			revents |= events & (POLLPRI | POLLRDBAND);
		}
	}

	if (revents == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_rcv.sb_flags |= SB_SEL;
			selrecord(p, &so->so_rcv.sb_sel, wql);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_snd.sb_flags |= SB_SEL;
			selrecord(p, &so->so_snd.sb_sel, wql);
		}
	}

	socket_unlock(so, 1);
	return revents;
}
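
/*
 * User-space sketch (not part of this kernel file's build): what the
 * sopoll() logic above looks like from poll(2).  POLLIN/POLLRDNORM map to
 * soreadable(), POLLOUT/POLLWRNORM to sowriteable(), and
 * POLLPRI/POLLRDBAND to the out-of-band mark test.
 */
#if 0
#include <poll.h>
#include <stdio.h>

static void
wait_for_socket(int s)
{
	struct pollfd pfd = {
		.fd = s,
		.events = POLLIN | POLLOUT | POLLPRI,
	};

	if (poll(&pfd, 1, 5000 /* ms */) > 0) {
		if (pfd.revents & POLLIN)
			printf("readable\n");
		if (pfd.revents & POLLOUT)
			printf("writable\n");
		if (pfd.revents & POLLPRI)
			printf("out-of-band data at mark\n");
	}
}
#endif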
6315 soo_kqfilter(struct fileproc
*fp
, struct knote
*kn
,
6316 struct kevent_internal_s
*kev
, vfs_context_t ctx
)
6319 #if !CONFIG_MACF_SOCKET
6321 #endif /* MAC_SOCKET */
6322 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6326 so_update_last_owner_locked(so
, PROC_NULL
);
6327 so_update_policy(so
);
6329 #if CONFIG_MACF_SOCKET
6330 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx
)),
6332 socket_unlock(so
, 1);
6333 kn
->kn_flags
= EV_ERROR
;
6334 kn
->kn_data
= EPERM
;
6337 #endif /* MAC_SOCKET */
6339 switch (kn
->kn_filter
) {
6341 kn
->kn_filtid
= EVFILTID_SOREAD
;
6344 kn
->kn_filtid
= EVFILTID_SOWRITE
;
6347 kn
->kn_filtid
= EVFILTID_SCK
;
6350 kn
->kn_filtid
= EVFILTID_SOEXCEPT
;
6353 socket_unlock(so
, 1);
6354 kn
->kn_flags
= EV_ERROR
;
6355 kn
->kn_data
= EINVAL
;
6360 * call the appropriate sub-filter attach
6361 * with the socket still locked
6363 result
= knote_fops(kn
)->f_attach(kn
, kev
);
6365 socket_unlock(so
, 1);
static int
filt_soread_common(struct knote *kn, struct socket *so)
{
	if (so->so_options & SO_ACCEPTCONN) {
		int is_not_empty;

		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */
		kn->kn_data = so->so_qlen;
		is_not_empty = !TAILQ_EMPTY(&so->so_comp);

		return is_not_empty;
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			return 1;
		}
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return 1;
	}

	if (so->so_error) {	/* temporary udp error */
		return 1;
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
			lowwat = so->so_rcv.sb_hiwat;
		} else if (kn->kn_sdata > lowwat) {
			lowwat = kn->kn_sdata;
		}
	}

	/*
	 * The order below is important. Since NOTE_LOWAT
	 * overrides sb_lowat, check for NOTE_LOWAT case
	 * first.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		return kn->kn_data >= lowwat;
	}

	return so->so_rcv.sb_cc >= lowwat;
}
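
/*
 * User-space sketch (not part of this kernel file's build): registering
 * EVFILT_READ with NOTE_LOWAT.  Per filt_soread_common() above, the
 * requested low-water mark is clipped to the receive buffer's high-water
 * mark and, when present, takes precedence over sb_lowat.  For a listening
 * socket the filter instead fires on the completed-connection queue length.
 */
#if 0
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>
#include <stdint.h>

static int
wait_for_bytes(int kq, int s, int64_t lowat)
{
	struct kevent kev;

	/* fire only once at least `lowat' bytes of data are queued */
	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return -1;

	/* block until the knote fires; kev.data is the readable byte count */
	return kevent(kq, NULL, 0, &kev, 1, NULL);
}
#endif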
6445 filt_sorattach(struct knote
*kn
, __unused
struct kevent_internal_s
*kev
)
6447 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6452 * If the caller explicitly asked for OOB results (e.g. poll())
6453 * from EVFILT_READ, then save that off in the hookid field
6454 * and reserve the kn_flags EV_OOBAND bit for output only.
6456 if (kn
->kn_filter
== EVFILT_READ
&&
6457 kn
->kn_flags
& EV_OOBAND
) {
6458 kn
->kn_flags
&= ~EV_OOBAND
;
6459 kn
->kn_hookid
= EV_OOBAND
;
6463 if (KNOTE_ATTACH(&so
->so_rcv
.sb_sel
.si_note
, kn
)) {
6464 so
->so_rcv
.sb_flags
|= SB_KNOTE
;
6467 /* indicate if event is already fired */
6468 return filt_soread_common(kn
, so
);
6472 filt_sordetach(struct knote
*kn
)
6474 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6477 if (so
->so_rcv
.sb_flags
& SB_KNOTE
) {
6478 if (KNOTE_DETACH(&so
->so_rcv
.sb_sel
.si_note
, kn
)) {
6479 so
->so_rcv
.sb_flags
&= ~SB_KNOTE
;
6482 socket_unlock(so
, 1);
6487 filt_soread(struct knote
*kn
, long hint
)
6489 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6492 if ((hint
& SO_FILT_HINT_LOCKED
) == 0) {
6496 retval
= filt_soread_common(kn
, so
);
6498 if ((hint
& SO_FILT_HINT_LOCKED
) == 0) {
6499 socket_unlock(so
, 1);
6506 filt_sortouch(struct knote
*kn
, struct kevent_internal_s
*kev
)
6508 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6513 /* save off the new input fflags and data */
6514 kn
->kn_sfflags
= kev
->fflags
;
6515 kn
->kn_sdata
= kev
->data
;
6517 /* determine if changes result in fired events */
6518 retval
= filt_soread_common(kn
, so
);
6520 socket_unlock(so
, 1);
6526 filt_sorprocess(struct knote
*kn
, struct filt_process_s
*data
, struct kevent_internal_s
*kev
)
6528 #pragma unused(data)
6529 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6533 retval
= filt_soread_common(kn
, so
);
6535 *kev
= kn
->kn_kevent
;
6536 if (kn
->kn_flags
& EV_CLEAR
) {
6541 socket_unlock(so
, 1);
6547 so_wait_for_if_feedback(struct socket
*so
)
6549 if ((SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
) &&
6550 (so
->so_state
& SS_ISCONNECTED
)) {
6551 struct inpcb
*inp
= sotoinpcb(so
);
6552 if (INP_WAIT_FOR_IF_FEEDBACK(inp
)) {
6560 filt_sowrite_common(struct knote
*kn
, struct socket
*so
)
6564 kn
->kn_data
= sbspace(&so
->so_snd
);
6565 if (so
->so_state
& SS_CANTSENDMORE
) {
6566 kn
->kn_flags
|= EV_EOF
;
6567 kn
->kn_fflags
= so
->so_error
;
6570 if (so
->so_error
) { /* temporary udp error */
6573 if (!socanwrite(so
)) {
6576 if (so
->so_flags1
& SOF1_PRECONNECT_DATA
) {
6579 int64_t lowwat
= so
->so_snd
.sb_lowat
;
6580 if (kn
->kn_sfflags
& NOTE_LOWAT
) {
6581 if (kn
->kn_sdata
> so
->so_snd
.sb_hiwat
) {
6582 lowwat
= so
->so_snd
.sb_hiwat
;
6583 } else if (kn
->kn_sdata
> lowwat
) {
6584 lowwat
= kn
->kn_sdata
;
6587 if (kn
->kn_data
>= lowwat
) {
6588 if ((so
->so_flags
& SOF_NOTSENT_LOWAT
)
6589 #if (DEBUG || DEVELOPMENT)
6590 && so_notsent_lowat_check
== 1
6591 #endif /* DEBUG || DEVELOPMENT */
6593 if ((SOCK_DOM(so
) == PF_INET
||
6594 SOCK_DOM(so
) == PF_INET6
) &&
6595 so
->so_type
== SOCK_STREAM
) {
6596 ret
= tcp_notsent_lowat_check(so
);
6599 else if ((SOCK_DOM(so
) == PF_MULTIPATH
) &&
6600 (SOCK_PROTO(so
) == IPPROTO_TCP
)) {
6601 ret
= mptcp_notsent_lowat_check(so
);
6611 if (so_wait_for_if_feedback(so
)) {
6618 filt_sowattach(struct knote
*kn
, __unused
struct kevent_internal_s
*kev
)
6620 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6623 if (KNOTE_ATTACH(&so
->so_snd
.sb_sel
.si_note
, kn
)) {
6624 so
->so_snd
.sb_flags
|= SB_KNOTE
;
6627 /* determine if its already fired */
6628 return filt_sowrite_common(kn
, so
);
6632 filt_sowdetach(struct knote
*kn
)
6634 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6637 if (so
->so_snd
.sb_flags
& SB_KNOTE
) {
6638 if (KNOTE_DETACH(&so
->so_snd
.sb_sel
.si_note
, kn
)) {
6639 so
->so_snd
.sb_flags
&= ~SB_KNOTE
;
6642 socket_unlock(so
, 1);
6647 filt_sowrite(struct knote
*kn
, long hint
)
6649 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6652 if ((hint
& SO_FILT_HINT_LOCKED
) == 0) {
6656 ret
= filt_sowrite_common(kn
, so
);
6658 if ((hint
& SO_FILT_HINT_LOCKED
) == 0) {
6659 socket_unlock(so
, 1);
6666 filt_sowtouch(struct knote
*kn
, struct kevent_internal_s
*kev
)
6668 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6673 /*save off the new input fflags and data */
6674 kn
->kn_sfflags
= kev
->fflags
;
6675 kn
->kn_sdata
= kev
->data
;
6677 /* determine if these changes result in a triggered event */
6678 ret
= filt_sowrite_common(kn
, so
);
6680 socket_unlock(so
, 1);
6686 filt_sowprocess(struct knote
*kn
, struct filt_process_s
*data
, struct kevent_internal_s
*kev
)
6688 #pragma unused(data)
6689 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6693 ret
= filt_sowrite_common(kn
, so
);
6695 *kev
= kn
->kn_kevent
;
6696 if (kn
->kn_flags
& EV_CLEAR
) {
6701 socket_unlock(so
, 1);
6706 filt_sockev_common(struct knote
*kn
, struct socket
*so
, long ev_hint
)
6709 uint32_t level_trigger
= 0;
6711 if (ev_hint
& SO_FILT_HINT_CONNRESET
) {
6712 kn
->kn_fflags
|= NOTE_CONNRESET
;
6714 if (ev_hint
& SO_FILT_HINT_TIMEOUT
) {
6715 kn
->kn_fflags
|= NOTE_TIMEOUT
;
6717 if (ev_hint
& SO_FILT_HINT_NOSRCADDR
) {
6718 kn
->kn_fflags
|= NOTE_NOSRCADDR
;
6720 if (ev_hint
& SO_FILT_HINT_IFDENIED
) {
6721 kn
->kn_fflags
|= NOTE_IFDENIED
;
6723 if (ev_hint
& SO_FILT_HINT_KEEPALIVE
) {
6724 kn
->kn_fflags
|= NOTE_KEEPALIVE
;
6726 if (ev_hint
& SO_FILT_HINT_ADAPTIVE_WTIMO
) {
6727 kn
->kn_fflags
|= NOTE_ADAPTIVE_WTIMO
;
6729 if (ev_hint
& SO_FILT_HINT_ADAPTIVE_RTIMO
) {
6730 kn
->kn_fflags
|= NOTE_ADAPTIVE_RTIMO
;
6732 if ((ev_hint
& SO_FILT_HINT_CONNECTED
) ||
6733 (so
->so_state
& SS_ISCONNECTED
)) {
6734 kn
->kn_fflags
|= NOTE_CONNECTED
;
6735 level_trigger
|= NOTE_CONNECTED
;
6737 if ((ev_hint
& SO_FILT_HINT_DISCONNECTED
) ||
6738 (so
->so_state
& SS_ISDISCONNECTED
)) {
6739 kn
->kn_fflags
|= NOTE_DISCONNECTED
;
6740 level_trigger
|= NOTE_DISCONNECTED
;
6742 if (ev_hint
& SO_FILT_HINT_CONNINFO_UPDATED
) {
6743 if (so
->so_proto
!= NULL
&&
6744 (so
->so_proto
->pr_flags
& PR_EVCONNINFO
)) {
6745 kn
->kn_fflags
|= NOTE_CONNINFO_UPDATED
;
6749 if ((ev_hint
& SO_FILT_HINT_NOTIFY_ACK
) ||
6750 tcp_notify_ack_active(so
)) {
6751 kn
->kn_fflags
|= NOTE_NOTIFY_ACK
;
6754 if ((so
->so_state
& SS_CANTRCVMORE
)
6756 && cfil_sock_data_pending(&so
->so_rcv
) == 0
6757 #endif /* CONTENT_FILTER */
6759 kn
->kn_fflags
|= NOTE_READCLOSED
;
6760 level_trigger
|= NOTE_READCLOSED
;
6763 if (so
->so_state
& SS_CANTSENDMORE
) {
6764 kn
->kn_fflags
|= NOTE_WRITECLOSED
;
6765 level_trigger
|= NOTE_WRITECLOSED
;
6768 if ((ev_hint
& SO_FILT_HINT_SUSPEND
) ||
6769 (so
->so_flags
& SOF_SUSPENDED
)) {
6770 kn
->kn_fflags
&= ~(NOTE_SUSPEND
| NOTE_RESUME
);
6772 /* If resume event was delivered before, reset it */
6773 kn
->kn_hookid
&= ~NOTE_RESUME
;
6775 kn
->kn_fflags
|= NOTE_SUSPEND
;
6776 level_trigger
|= NOTE_SUSPEND
;
6779 if ((ev_hint
& SO_FILT_HINT_RESUME
) ||
6780 (so
->so_flags
& SOF_SUSPENDED
) == 0) {
6781 kn
->kn_fflags
&= ~(NOTE_SUSPEND
| NOTE_RESUME
);
6783 /* If suspend event was delivered before, reset it */
6784 kn
->kn_hookid
&= ~NOTE_SUSPEND
;
6786 kn
->kn_fflags
|= NOTE_RESUME
;
6787 level_trigger
|= NOTE_RESUME
;
6790 if (so
->so_error
!= 0) {
6792 kn
->kn_data
= so
->so_error
;
6793 kn
->kn_flags
|= EV_EOF
;
6795 get_sockev_state(so
, (u_int32_t
*)&(kn
->kn_data
));
6798 /* Reset any events that are not requested on this knote */
6799 kn
->kn_fflags
&= (kn
->kn_sfflags
& EVFILT_SOCK_ALL_MASK
);
6800 level_trigger
&= (kn
->kn_sfflags
& EVFILT_SOCK_ALL_MASK
);
	/* Find the level-triggered events that are already delivered */
6803 level_trigger
&= kn
->kn_hookid
;
6804 level_trigger
&= EVFILT_SOCK_LEVEL_TRIGGER_MASK
;
	/* Do not deliver level-triggered events more than once */
6807 if ((kn
->kn_fflags
& ~level_trigger
) != 0) {
6815 filt_sockattach(struct knote
*kn
, __unused
struct kevent_internal_s
*kev
)
6817 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6821 if (KNOTE_ATTACH(&so
->so_klist
, kn
)) {
6822 so
->so_flags
|= SOF_KNOTE
;
6825 /* determine if event already fired */
6826 return filt_sockev_common(kn
, so
, 0);
6830 filt_sockdetach(struct knote
*kn
)
6832 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6835 if ((so
->so_flags
& SOF_KNOTE
) != 0) {
6836 if (KNOTE_DETACH(&so
->so_klist
, kn
)) {
6837 so
->so_flags
&= ~SOF_KNOTE
;
6840 socket_unlock(so
, 1);
6844 filt_sockev(struct knote
*kn
, long hint
)
6846 int ret
= 0, locked
= 0;
6847 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6848 long ev_hint
= (hint
& SO_FILT_HINT_EV
);
6850 if ((hint
& SO_FILT_HINT_LOCKED
) == 0) {
6855 ret
= filt_sockev_common(kn
, so
, ev_hint
);
6858 socket_unlock(so
, 1);
6867 * filt_socktouch - update event state
6872 struct kevent_internal_s
*kev
)
6874 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6875 uint32_t changed_flags
;
6880 /* save off the [result] data and fflags */
6881 changed_flags
= (kn
->kn_sfflags
^ kn
->kn_hookid
);
6883 /* save off the new input fflags and data */
6884 kn
->kn_sfflags
= kev
->fflags
;
6885 kn
->kn_sdata
= kev
->data
;
6887 /* restrict the current results to the (smaller?) set of new interest */
6889 * For compatibility with previous implementations, we leave kn_fflags
6890 * as they were before.
6892 //kn->kn_fflags &= kev->fflags;
6895 * Since we keep track of events that are already
6896 * delivered, if any of those events are not requested
6897 * anymore the state related to them can be reset
6900 ~(changed_flags
& EVFILT_SOCK_LEVEL_TRIGGER_MASK
);
6902 /* determine if we have events to deliver */
6903 ret
= filt_sockev_common(kn
, so
, 0);
6905 socket_unlock(so
, 1);
6911 * filt_sockprocess - query event fired state and return data
6916 struct filt_process_s
*data
,
6917 struct kevent_internal_s
*kev
)
6919 #pragma unused(data)
6921 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6926 ret
= filt_sockev_common(kn
, so
, 0);
6928 *kev
= kn
->kn_kevent
;
	/*
	 * Store the state of the events being delivered. This
	 * state can be used to deliver level-triggered events
	 * at least once and still avoid waking up the application
	 * multiple times as long as the event is active.
	 */
6936 if (kn
->kn_fflags
!= 0) {
6937 kn
->kn_hookid
|= (kn
->kn_fflags
&
6938 EVFILT_SOCK_LEVEL_TRIGGER_MASK
);
6942 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6943 * only one of them and remember the last one that was
6946 if (kn
->kn_fflags
& NOTE_SUSPEND
) {
6947 kn
->kn_hookid
&= ~NOTE_RESUME
;
6949 if (kn
->kn_fflags
& NOTE_RESUME
) {
6950 kn
->kn_hookid
&= ~NOTE_SUSPEND
;
6953 if (kn
->kn_flags
& EV_CLEAR
) {
6959 socket_unlock(so
, 1);
6965 get_sockev_state(struct socket
*so
, u_int32_t
*statep
)
6967 u_int32_t state
= *(statep
);
6970 * If the state variable is already used by a previous event,
6977 if (so
->so_state
& SS_ISCONNECTED
) {
6978 state
|= SOCKEV_CONNECTED
;
6980 state
&= ~(SOCKEV_CONNECTED
);
6982 state
|= ((so
->so_state
& SS_ISDISCONNECTED
) ? SOCKEV_DISCONNECTED
: 0);
#define SO_LOCK_HISTORY_STR_LEN \
	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof(lock_history_str));
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += snprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return lock_history_str;
}

void
socket_lock(struct socket *so, int refcount)
{
	void *lr_saved;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif /* MORE_LOCKING_DEBUG */
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount) {
			so->so_usecount++;
		}
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}
}
void
socket_lock_assert_owned(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
}

int
socket_try_lock(struct socket *so)
{
	lck_mtx_t *mtx;

	if (so->so_proto->pr_getlock != NULL) {
		mtx = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mtx = so->so_proto->pr_domain->dom_mtx;
	}

	return lck_mtx_try_lock(mtx);
}
void
socket_unlock(struct socket *so, int refcount)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto == NULL) {
		panic("%s: null so_proto so=%p\n", __func__, so);
		/* NOTREACHED */
	}

	if (so && so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* MORE_LOCKING_DEBUG */
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0) {
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
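
/*
 * Illustrative sketch (not part of the build): the usual lock/refcount
 * discipline around the routines above.  Taking the lock with
 * refcount != 0 also takes a use-count reference; dropping it with
 * refcount != 0 releases that reference, and the last release frees the
 * socket via sofreelastref().  example_with_socket() is hypothetical.
 */
#if 0
static void
example_with_socket(struct socket *so)
{
	socket_lock(so, 1);		/* lock and hold a reference */
	socket_lock_assert_owned(so);	/* we may now touch so_* fields */

	/* ... examine or modify protected socket state here ... */

	socket_unlock(so, 1);		/* drop the reference and unlock */
}
#endif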
/* Called with socket locked, will unlock socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}

void
soreference(struct socket *so)
{
	socket_lock(so, 1);	/* locks & take one reference on socket */
	socket_unlock(so, 0);	/* unlock only */
}

void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters.  Caller must ensure to hold
 * the socket lock.
 */
void
somultipages(struct socket *so, boolean_t set)
{
	if (set) {
		so->so_flags |= SOF_MULTIPAGES;
	} else {
		so->so_flags &= ~SOF_MULTIPAGES;
	}
}

void
soif2kcl(struct socket *so, boolean_t set)
{
	if (set) {
		so->so_flags1 |= SOF1_IF_2KCL;
	} else {
		so->so_flags1 &= ~SOF1_IF_2KCL;
	}
}

int
so_isdstlocal(struct socket *so)
{
	struct inpcb *inp = (struct inpcb *)so->so_pcb;

	if (SOCK_DOM(so) == PF_INET) {
		return inaddr_local(inp->inp_faddr);
	} else if (SOCK_DOM(so) == PF_INET6) {
		return in6addr_local(&inp->in6p_faddr);
	}

	return 0;
}
7168 sosetdefunct(struct proc
*p
, struct socket
*so
, int level
, boolean_t noforce
)
7170 struct sockbuf
*rcv
, *snd
;
7171 int err
= 0, defunct
;
7176 defunct
= (so
->so_flags
& SOF_DEFUNCT
);
7178 if (!(snd
->sb_flags
& rcv
->sb_flags
& SB_DROP
)) {
7179 panic("%s: SB_DROP not set", __func__
);
7185 if (so
->so_flags
& SOF_NODEFUNCT
) {
7188 if (p
!= PROC_NULL
) {
7189 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7190 "name %s level %d) so 0x%llx [%d,%d] "
7191 "is not eligible for defunct "
7192 "(%d)\n", __func__
, proc_selfpid(),
7193 proc_best_name(current_proc()), proc_pid(p
),
7194 proc_best_name(p
), level
,
7195 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
7196 SOCK_DOM(so
), SOCK_TYPE(so
), err
);
7200 so
->so_flags
&= ~SOF_NODEFUNCT
;
7201 if (p
!= PROC_NULL
) {
7202 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7203 "name %s level %d) so 0x%llx [%d,%d] "
7205 "(%d)\n", __func__
, proc_selfpid(),
7206 proc_best_name(current_proc()), proc_pid(p
),
7207 proc_best_name(p
), level
,
7208 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
7209 SOCK_DOM(so
), SOCK_TYPE(so
), err
);
7211 } else if (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_WANTED
) {
7212 struct inpcb
*inp
= (struct inpcb
*)so
->so_pcb
;
7213 struct ifnet
*ifp
= inp
->inp_last_outifp
;
7215 if (ifp
&& IFNET_IS_CELLULAR(ifp
)) {
7216 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_nocell
);
7217 } else if (so
->so_flags
& SOF_DELEGATED
) {
7218 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_nodlgtd
);
7219 } else if (soextbkidlestat
.so_xbkidle_time
== 0) {
7220 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_notime
);
7221 } else if (noforce
&& p
!= PROC_NULL
) {
7222 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_active
);
7224 so
->so_flags1
|= SOF1_EXTEND_BK_IDLE_INPROG
;
7225 so
->so_extended_bk_start
= net_uptime();
7226 OSBitOrAtomic(P_LXBKIDLEINPROG
, &p
->p_ladvflag
);
7228 inpcb_timer_sched(inp
->inp_pcbinfo
, INPCB_TIMER_LAZY
);
7231 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7232 "name %s level %d) so 0x%llx [%d,%d] "
7234 "(%d)\n", __func__
, proc_selfpid(),
7235 proc_best_name(current_proc()), proc_pid(p
),
7236 proc_best_name(p
), level
,
7237 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
7238 SOCK_DOM(so
), SOCK_TYPE(so
), err
);
7241 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_forced
);
7245 so
->so_flags
|= SOF_DEFUNCT
;
7247 /* Prevent further data from being appended to the socket buffers */
7248 snd
->sb_flags
|= SB_DROP
;
7249 rcv
->sb_flags
|= SB_DROP
;
7251 /* Flush any existing data in the socket buffers */
7252 if (rcv
->sb_cc
!= 0) {
7253 rcv
->sb_flags
&= ~SB_SEL
;
7254 selthreadclear(&rcv
->sb_sel
);
7257 if (snd
->sb_cc
!= 0) {
7258 snd
->sb_flags
&= ~SB_SEL
;
7259 selthreadclear(&snd
->sb_sel
);
7264 if (p
!= PROC_NULL
) {
7265 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7266 "so 0x%llx [%d,%d] %s defunct%s\n", __func__
,
7267 proc_selfpid(), proc_best_name(current_proc()),
7268 proc_pid(p
), proc_best_name(p
), level
,
7269 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
), SOCK_DOM(so
),
7270 SOCK_TYPE(so
), defunct
? "is already" : "marked as",
7271 (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_WANTED
) ?
7278 sodefunct(struct proc
*p
, struct socket
*so
, int level
)
7280 struct sockbuf
*rcv
, *snd
;
7282 if (!(so
->so_flags
& SOF_DEFUNCT
)) {
7283 panic("%s improperly called", __func__
);
7286 if (so
->so_state
& SS_DEFUNCT
) {
7293 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
) {
7294 char s
[MAX_IPv6_STR_LEN
];
7295 char d
[MAX_IPv6_STR_LEN
];
7296 struct inpcb
*inp
= sotoinpcb(so
);
7298 if (p
!= PROC_NULL
) {
7300 "%s[%d, %s]: (target pid %d name %s level %d) "
7301 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7302 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7303 " snd_fl 0x%x]\n", __func__
,
7304 proc_selfpid(), proc_best_name(current_proc()),
7305 proc_pid(p
), proc_best_name(p
), level
,
7306 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
7307 (SOCK_TYPE(so
) == SOCK_STREAM
) ? "TCP" : "UDP",
7308 inet_ntop(SOCK_DOM(so
), ((SOCK_DOM(so
) == PF_INET
) ?
7309 (void *)&inp
->inp_laddr
.s_addr
:
7310 (void *)&inp
->in6p_laddr
),
7311 s
, sizeof(s
)), ntohs(inp
->in6p_lport
),
7312 inet_ntop(SOCK_DOM(so
), (SOCK_DOM(so
) == PF_INET
) ?
7313 (void *)&inp
->inp_faddr
.s_addr
:
7314 (void *)&inp
->in6p_faddr
,
7315 d
, sizeof(d
)), ntohs(inp
->in6p_fport
),
7316 (uint32_t)rcv
->sb_sel
.si_flags
,
7317 (uint32_t)snd
->sb_sel
.si_flags
,
7318 rcv
->sb_flags
, snd
->sb_flags
);
7320 } else if (p
!= PROC_NULL
) {
7321 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7322 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7323 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__
,
7324 proc_selfpid(), proc_best_name(current_proc()),
7325 proc_pid(p
), proc_best_name(p
), level
,
7326 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
7327 SOCK_DOM(so
), SOCK_TYPE(so
),
7328 (uint32_t)rcv
->sb_sel
.si_flags
,
7329 (uint32_t)snd
->sb_sel
.si_flags
, rcv
->sb_flags
,
7334 * Unwedge threads blocked on sbwait() and sb_lock().
7339 so
->so_flags1
|= SOF1_DEFUNCTINPROG
;
7340 if (rcv
->sb_flags
& SB_LOCK
) {
7341 sbunlock(rcv
, TRUE
); /* keep socket locked */
7343 if (snd
->sb_flags
& SB_LOCK
) {
7344 sbunlock(snd
, TRUE
); /* keep socket locked */
7347 * Flush the buffers and disconnect. We explicitly call shutdown
7348 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7349 * states are set for the socket. This would also flush out data
7350 * hanging off the receive list of this socket.
7352 (void) soshutdownlock_final(so
, SHUT_RD
);
7353 (void) soshutdownlock_final(so
, SHUT_WR
);
7354 (void) sodisconnectlocked(so
);
7357 * Explicitly handle connectionless-protocol disconnection
7358 * and release any remaining data in the socket buffers.
7360 if (!(so
->so_state
& SS_ISDISCONNECTED
)) {
7361 (void) soisdisconnected(so
);
7364 if (so
->so_error
== 0) {
7365 so
->so_error
= EBADF
;
7368 if (rcv
->sb_cc
!= 0) {
7369 rcv
->sb_flags
&= ~SB_SEL
;
7370 selthreadclear(&rcv
->sb_sel
);
7373 if (snd
->sb_cc
!= 0) {
7374 snd
->sb_flags
&= ~SB_SEL
;
7375 selthreadclear(&snd
->sb_sel
);
7378 so
->so_state
|= SS_DEFUNCT
;
7379 OSIncrementAtomicLong((volatile long *)&sodefunct_calls
);
7386 soresume(struct proc
*p
, struct socket
*so
, int locked
)
7392 if (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_INPROG
) {
7393 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7394 "[%d,%d] resumed from bk idle\n",
7395 __func__
, proc_selfpid(), proc_best_name(current_proc()),
7396 proc_pid(p
), proc_best_name(p
),
7397 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
7398 SOCK_DOM(so
), SOCK_TYPE(so
));
7400 so
->so_flags1
&= ~SOF1_EXTEND_BK_IDLE_INPROG
;
7401 so
->so_extended_bk_start
= 0;
7402 OSBitAndAtomic(~P_LXBKIDLEINPROG
, &p
->p_ladvflag
);
7404 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_resumed
);
7405 OSDecrementAtomic(&soextbkidlestat
.so_xbkidle_active
);
7406 VERIFY(soextbkidlestat
.so_xbkidle_active
>= 0);
7409 socket_unlock(so
, 1);
7416 * Does not attempt to account for sockets that are delegated from
7417 * the current process
7420 so_set_extended_bk_idle(struct socket
*so
, int optval
)
7424 if ((SOCK_DOM(so
) != PF_INET
&& SOCK_DOM(so
) != PF_INET6
) ||
7425 SOCK_PROTO(so
) != IPPROTO_TCP
) {
7426 OSDecrementAtomic(&soextbkidlestat
.so_xbkidle_notsupp
);
7428 } else if (optval
== 0) {
7429 so
->so_flags1
&= ~SOF1_EXTEND_BK_IDLE_WANTED
;
7431 soresume(current_proc(), so
, 1);
7433 struct proc
*p
= current_proc();
7435 struct filedesc
*fdp
;
7439 * Unlock socket to avoid lock ordering issue with
7440 * the proc fd table lock
7442 socket_unlock(so
, 0);
7447 for (i
= 0; i
< fdp
->fd_nfiles
; i
++) {
7448 struct fileproc
*fp
= fdp
->fd_ofiles
[i
];
7452 (fdp
->fd_ofileflags
[i
] & UF_RESERVED
) != 0 ||
7453 FILEGLOB_DTYPE(fp
->f_fglob
) != DTYPE_SOCKET
) {
7457 so2
= (struct socket
*)fp
->f_fglob
->fg_data
;
7459 so2
->so_flags1
& SOF1_EXTEND_BK_IDLE_WANTED
) {
7462 if (count
>= soextbkidlestat
.so_xbkidle_maxperproc
) {
7470 if (count
>= soextbkidlestat
.so_xbkidle_maxperproc
) {
7471 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_toomany
);
7473 } else if (so
->so_flags
& SOF_DELEGATED
) {
7474 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_nodlgtd
);
7477 so
->so_flags1
|= SOF1_EXTEND_BK_IDLE_WANTED
;
7478 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_wantok
);
7480 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7481 "%s marked for extended bk idle\n",
7482 __func__
, proc_selfpid(), proc_best_name(current_proc()),
7483 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
7484 SOCK_DOM(so
), SOCK_TYPE(so
),
7485 (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_WANTED
) ?
7493 so_stop_extended_bk_idle(struct socket
*so
)
7495 so
->so_flags1
&= ~SOF1_EXTEND_BK_IDLE_INPROG
;
7496 so
->so_extended_bk_start
= 0;
7498 OSDecrementAtomic(&soextbkidlestat
.so_xbkidle_active
);
7499 VERIFY(soextbkidlestat
.so_xbkidle_active
>= 0);
7503 sosetdefunct(current_proc(), so
,
7504 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL
, FALSE
);
7505 if (so
->so_flags
& SOF_DEFUNCT
) {
7506 sodefunct(current_proc(), so
,
7507 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL
);
7512 so_drain_extended_bk_idle(struct socket
*so
)
7514 if (so
&& (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_INPROG
)) {
7516 * Only penalize sockets that have outstanding data
7518 if (so
->so_rcv
.sb_cc
|| so
->so_snd
.sb_cc
) {
7519 so_stop_extended_bk_idle(so
);
7521 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_drained
);
7527 * Return values tells if socket is still in extended background idle
7530 so_check_extended_bk_idle_time(struct socket
*so
)
7534 if ((so
->so_flags1
& SOF1_EXTEND_BK_IDLE_INPROG
)) {
7535 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7536 __func__
, proc_selfpid(), proc_best_name(current_proc()),
7537 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
7538 SOCK_DOM(so
), SOCK_TYPE(so
));
7539 if (net_uptime() - so
->so_extended_bk_start
>
7540 soextbkidlestat
.so_xbkidle_time
) {
7541 so_stop_extended_bk_idle(so
);
7543 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_expired
);
7547 struct inpcb
*inp
= (struct inpcb
*)so
->so_pcb
;
7549 inpcb_timer_sched(inp
->inp_pcbinfo
, INPCB_TIMER_LAZY
);
7550 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_resched
);
7558 resume_proc_sockets(proc_t p
)
7560 if (p
->p_ladvflag
& P_LXBKIDLEINPROG
) {
7561 struct filedesc
*fdp
;
7566 for (i
= 0; i
< fdp
->fd_nfiles
; i
++) {
7567 struct fileproc
*fp
;
7570 fp
= fdp
->fd_ofiles
[i
];
7572 (fdp
->fd_ofileflags
[i
] & UF_RESERVED
) != 0 ||
7573 FILEGLOB_DTYPE(fp
->f_fglob
) != DTYPE_SOCKET
) {
7577 so
= (struct socket
*)fp
->f_fglob
->fg_data
;
7578 (void) soresume(p
, so
, 0);
7582 OSBitAndAtomic(~P_LXBKIDLEINPROG
, &p
->p_ladvflag
);
__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
{
	int ret = 0;

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		if (optval) {
			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
		} else {
			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
		}
	}

	return ret;
}

__private_extern__ int
so_get_recv_anyif(struct socket *so)
{
	int ret = 0;

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
	}

	return ret;
}
7624 so_set_restrictions(struct socket
*so
, uint32_t vals
)
7626 int nocell_old
, nocell_new
;
7627 int noexpensive_old
, noexpensive_new
;
	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
7642 nocell_old
= (so
->so_restrictions
& SO_RESTRICT_DENY_CELLULAR
);
7643 noexpensive_old
= (so
->so_restrictions
& SO_RESTRICT_DENY_EXPENSIVE
);
7644 so
->so_restrictions
|= (vals
& (SO_RESTRICT_DENY_IN
|
7645 SO_RESTRICT_DENY_OUT
| SO_RESTRICT_DENY_CELLULAR
|
7646 SO_RESTRICT_DENY_EXPENSIVE
));
7647 nocell_new
= (so
->so_restrictions
& SO_RESTRICT_DENY_CELLULAR
);
7648 noexpensive_new
= (so
->so_restrictions
& SO_RESTRICT_DENY_EXPENSIVE
);
7650 /* we can only set, not clear restrictions */
7651 if ((nocell_new
- nocell_old
) == 0 &&
7652 (noexpensive_new
- noexpensive_old
) == 0) {
7656 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
) {
7658 if (SOCK_DOM(so
) == PF_INET
) {
7660 if (nocell_new
- nocell_old
!= 0) {
7662 * if deny cellular is now set, do what's needed
7665 inp_set_nocellular(sotoinpcb(so
));
7667 if (noexpensive_new
- noexpensive_old
!= 0) {
7668 inp_set_noexpensive(sotoinpcb(so
));
7672 if (SOCK_DOM(so
) == PF_MULTIPATH
) {
7673 mptcp_set_restrictions(so
);
7680 so_get_restrictions(struct socket
*so
)
7682 return so
->so_restrictions
& (SO_RESTRICT_DENY_IN
|
7683 SO_RESTRICT_DENY_OUT
|
7684 SO_RESTRICT_DENY_CELLULAR
| SO_RESTRICT_DENY_EXPENSIVE
);
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (epid != so->last_pid || epid != proc_pid(p)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
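/*
 * Illustrative sketch (not part of the original source): the path above is
 * normally reached through the private SO_DELEGATED socket option, which
 * hands a pid to so_set_effective_pid().  A privileged proxy that opens
 * sockets on behalf of another process (and holds the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege) might, hypothetically, do:
 *
 *	pid_t epid = client_pid;	// hypothetical pid of the real owner
 *	if (setsockopt(s, SOL_SOCKET, SO_DELEGATED,
 *	    &epid, sizeof(epid)) != 0)
 *		perror("SO_DELEGATED");
 *
 * A process passing its own pid simply clears the delegate association,
 * as handled in the self-delegation case above.
 */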
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}

done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}
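/*
 * Illustrative sketch (not part of the original source): the UUID flavour is
 * reached through the private SO_DELEGATED_UUID socket option.  It is useful
 * when the delegating process knows the executable UUID of the real owner but
 * not its pid; the socket's real {pid,upid} are inherited as noted above.
 * A hypothetical caller:
 *
 *	uuid_t euuid;	// hypothetical UUID of the real owner's executable
 *	uuid_parse("6ba7b810-9dad-11d1-80b4-00c04fd430c8", euuid);
 *	(void) setsockopt(s, SOL_SOCKET, SO_DELEGATED_UUID,
 *	    &euuid, sizeof(euuid));
 */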
void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
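/*
 * Illustrative sketch (not part of the original source): a caller posting a
 * "longer event structure" as described above embeds a netpolicy_event_data
 * as the leading member of its event and passes the size of the whole thing.
 * Assuming an event structure along the lines of kev_netpolicy_ifdenied and
 * the KEV_NETPOLICY_IFDENIED event code:
 *
 *	struct kev_netpolicy_ifdenied ev_ifdenied;
 *
 *	bzero(&ev_ifdenied, sizeof(ev_ifdenied));
 *	// ... fill in the common header and the event-specific fields ...
 *	netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
 *	    sizeof(ev_ifdenied));
 */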
void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev;
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	bzero(&ev, sizeof(ev));
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	if (socksa != NULL) {
		FREE(socksa, M_SONAME);
	}
	if (peersa != NULL) {
		FREE(peersa, M_SONAME);
	}
}