/*
 * Copyright (c) 1998-2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio_internal.h>
#include <sys/kdebug.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* CONFIG_MACF */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */
#define	ROUNDUP(a, b)	(((a) + ((b) - 1)) & (~((b) - 1)))
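/*
 * Example: with b a power of two, ROUNDUP(a, b) rounds a up to the next
 * multiple of b, e.g. ROUNDUP(5, 8) == (5 + 7) & ~7 == 8 and
 * ROUNDUP(16, 8) == 16.
 */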
#if DEBUG || DEVELOPMENT
#define	DEBUG_KERNEL_ADDRPERM(_v)	(_v)
#else
#define	DEBUG_KERNEL_ADDRPERM(_v)	VM_KERNEL_ADDRPERM(_v)
#endif
/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);
extern char *proc_best_name(proc_t);

static u_int32_t	so_cache_hw;		/* High water mark for socache */
static u_int32_t	so_cache_timeouts;	/* number of timeouts */
static u_int32_t	so_cache_max_freed;	/* max freed per timeout */
static u_int32_t	cached_sock_count = 0;
STAILQ_HEAD(, socket)	so_cache_head;
int	max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t	so_cache_time;
static int		socketinit_done;
static struct zone	*so_cache_zone;

static lck_grp_t	*so_cache_mtx_grp;
static lck_attr_t	*so_cache_mtx_attr;
static lck_grp_attr_t	*so_cache_mtx_grp_attr;
static lck_mtx_t	*so_cache_mtx;
#include <machine/limits.h>

static int	filt_sorattach(struct knote *kn, struct kevent_internal_s *kev);
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static int	filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sorprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev);

static int	filt_sowattach(struct knote *kn, struct kevent_internal_s *kev);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sowprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev);

static int	filt_sockattach(struct knote *kn, struct kevent_internal_s *kev);
static void	filt_sockdetach(struct knote *kn);
static int	filt_sockev(struct knote *kn, long hint);
static int	filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sockprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev);

static int	sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int	sooptcopyout_timeval(struct sockopt *, const struct timeval *);
SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};
SYSCTL_DECL(_kern_ipc);

#define	EVEN_MORE_LOCKING_DEBUG	0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

static int socket_zone = M_SOCKET;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define	DBG_FNC_SOSEND_LIST	NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define	DBG_FNC_SORECEIVE_LIST	NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
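/*
 * Illustrative userland usage (testing/debugging only, as cautioned above).
 * The MIB name follows from the SYSCTL_INT() declaration; the snippet is a
 * hypothetical sketch that assumes a caller privileged to write sysctls:
 *
 *	#include <sys/sysctl.h>
 *
 *	int one = 1;
 *	if (sysctlbyname("kern.ipc.sosendjcl_ignore_capab",
 *	    NULL, NULL, &one, sizeof (one)) == -1)
 *		perror("sysctlbyname");
 */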
/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger than 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */
int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);
/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define	SO_IDLE_BK_IDLE_MAX_PER_PROC	1
#define	SO_IDLE_BK_IDLE_TIME		600
#define	SO_IDLE_BK_IDLE_RCV_HIWAT	131072

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");
void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) ==
	    sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) ==
	    offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) ==
	    offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) ==
	    offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) ==
	    offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) ==
	    offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) ==
	    sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) ==
	    offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) ==
	    offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) ==
	    offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) ==
	    offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) ==
	    offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof (socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
	if (so_cache_mtx == NULL) {
		panic("%s: unable to allocate so_cache_mtx\n", __func__);
		/* NOTREACHED */
	}
	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());
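	/*
	 * Layout note: each cached element holds the socket followed by an
	 * inpcb area and a tcpcb area; the two 4-byte pads above leave room
	 * for cached_sock_alloc() to ALIGN() each trailing area on a
	 * longword boundary (see the offset arithmetic in cached_sock_alloc).
	 */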
	so_cache_zone = zinit(so_cache_zone_element_size,
	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
	sflt_init();
	socket_tclass_init();
#if MULTIPATH
	mp_pcbinit();
#endif /* MULTIPATH */
}
455 cached_sock_alloc(struct socket
**so
, int waitok
)
460 lck_mtx_lock(so_cache_mtx
);
462 if (!STAILQ_EMPTY(&so_cache_head
)) {
463 VERIFY(cached_sock_count
> 0);
465 *so
= STAILQ_FIRST(&so_cache_head
);
466 STAILQ_REMOVE_HEAD(&so_cache_head
, so_cache_ent
);
467 STAILQ_NEXT((*so
), so_cache_ent
) = NULL
;
470 lck_mtx_unlock(so_cache_mtx
);
472 temp
= (*so
)->so_saved_pcb
;
473 bzero((caddr_t
)*so
, sizeof (struct socket
));
475 (*so
)->so_saved_pcb
= temp
;
478 lck_mtx_unlock(so_cache_mtx
);
481 *so
= (struct socket
*)zalloc(so_cache_zone
);
483 *so
= (struct socket
*)zalloc_noblock(so_cache_zone
);
488 bzero((caddr_t
)*so
, sizeof (struct socket
));
491 * Define offsets for extra structures into our
492 * single block of memory. Align extra structures
493 * on longword boundaries.
496 offset
= (uintptr_t)*so
;
497 offset
+= sizeof (struct socket
);
499 offset
= ALIGN(offset
);
501 (*so
)->so_saved_pcb
= (caddr_t
)offset
;
502 offset
+= get_inpcb_str_size();
504 offset
= ALIGN(offset
);
506 ((struct inpcb
*)(void *)(*so
)->so_saved_pcb
)->inp_saved_ppcb
=
510 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER
, &(*so
)->so_flags1
);
514 cached_sock_free(struct socket
*so
)
517 lck_mtx_lock(so_cache_mtx
);
519 so_cache_time
= net_uptime();
520 if (++cached_sock_count
> max_cached_sock_count
) {
522 lck_mtx_unlock(so_cache_mtx
);
523 zfree(so_cache_zone
, so
);
525 if (so_cache_hw
< cached_sock_count
)
526 so_cache_hw
= cached_sock_count
;
528 STAILQ_INSERT_TAIL(&so_cache_head
, so
, so_cache_ent
);
530 so
->cache_timestamp
= so_cache_time
;
531 lck_mtx_unlock(so_cache_mtx
);
536 so_update_last_owner_locked(struct socket
*so
, proc_t self
)
538 if (so
->last_pid
!= 0) {
540 * last_pid and last_upid should remain zero for sockets
541 * created using sock_socket. The check above achieves that
543 if (self
== PROC_NULL
)
544 self
= current_proc();
546 if (so
->last_upid
!= proc_uniqueid(self
) ||
547 so
->last_pid
!= proc_pid(self
)) {
548 so
->last_upid
= proc_uniqueid(self
);
549 so
->last_pid
= proc_pid(self
);
550 proc_getexecutableuuid(self
, so
->last_uuid
,
551 sizeof (so
->last_uuid
));
553 proc_pidoriginatoruuid(so
->so_vuuid
, sizeof(so
->so_vuuid
));
558 so_update_policy(struct socket
*so
)
560 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
)
561 (void) inp_update_policy(sotoinpcb(so
));
566 so_update_necp_policy(struct socket
*so
, struct sockaddr
*override_local_addr
,
567 struct sockaddr
*override_remote_addr
)
569 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
)
570 inp_update_necp_policy(sotoinpcb(so
), override_local_addr
,
571 override_remote_addr
, 0);
580 boolean_t rc
= FALSE
;
582 lck_mtx_lock(so_cache_mtx
);
584 so_cache_time
= net_uptime();
586 while (!STAILQ_EMPTY(&so_cache_head
)) {
587 VERIFY(cached_sock_count
> 0);
588 p
= STAILQ_FIRST(&so_cache_head
);
589 if ((so_cache_time
- p
->cache_timestamp
) <
593 STAILQ_REMOVE_HEAD(&so_cache_head
, so_cache_ent
);
596 zfree(so_cache_zone
, p
);
598 if (++n_freed
>= SO_CACHE_MAX_FREE_BATCH
) {
599 so_cache_max_freed
++;
604 /* Schedule again if there is more to cleanup */
605 if (!STAILQ_EMPTY(&so_cache_head
))
608 lck_mtx_unlock(so_cache_mtx
);
613 * Get a socket structure from our zone, and initialize it.
614 * We don't implement `waitok' yet (see comments in uipc_domain.c).
615 * Note that it would probably be better to allocate socket
616 * and PCB at the same time, but I'm not convinced that all
617 * the protocols can be easily modified to do this.
620 soalloc(int waitok
, int dom
, int type
)
624 if ((dom
== PF_INET
) && (type
== SOCK_STREAM
)) {
625 cached_sock_alloc(&so
, waitok
);
627 MALLOC_ZONE(so
, struct socket
*, sizeof (*so
), socket_zone
,
630 bzero(so
, sizeof (*so
));
633 so
->so_gencnt
= OSIncrementAtomic64((SInt64
*)&so_gencnt
);
634 so
->so_zone
= socket_zone
;
637 * Increment the socket allocation statistics
639 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_alloc_total
);
641 #if CONFIG_MACF_SOCKET
642 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
643 if (mac_socket_label_init(so
, !waitok
) != 0) {
647 #endif /* MAC_SOCKET */
654 socreate_internal(int dom
, struct socket
**aso
, int type
, int proto
,
655 struct proc
*p
, uint32_t flags
, struct proc
*ep
)
662 extern int tcpconsdebug
;
669 prp
= pffindproto(dom
, proto
, type
);
671 prp
= pffindtype(dom
, type
);
673 if (prp
== NULL
|| prp
->pr_usrreqs
->pru_attach
== NULL
) {
674 if (pffinddomain(dom
) == NULL
)
675 return (EAFNOSUPPORT
);
677 if (pffindprotonotype(dom
, proto
) != NULL
)
680 return (EPROTONOSUPPORT
);
682 if (prp
->pr_type
!= type
)
684 so
= soalloc(1, dom
, type
);
690 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_local_total
);
693 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_inet_total
);
694 if (type
== SOCK_STREAM
) {
695 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_inet_stream_total
);
697 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_inet_dgram_total
);
701 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_route_total
);
704 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_ndrv_total
);
707 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_key_total
);
710 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_inet6_total
);
711 if (type
== SOCK_STREAM
) {
712 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_inet6_stream_total
);
714 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_inet6_dgram_total
);
718 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_system_total
);
721 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_multipath_total
);
724 INC_ATOMIC_INT64_LIM(net_api_stats
.nas_socket_domain_other_total
);
728 if (flags
& SOCF_ASYNC
)
729 so
->so_state
|= SS_NBIO
;
731 TAILQ_INIT(&so
->so_incomp
);
732 TAILQ_INIT(&so
->so_comp
);
734 so
->last_upid
= proc_uniqueid(p
);
735 so
->last_pid
= proc_pid(p
);
736 proc_getexecutableuuid(p
, so
->last_uuid
, sizeof (so
->last_uuid
));
737 proc_pidoriginatoruuid(so
->so_vuuid
, sizeof(so
->so_vuuid
));
739 if (ep
!= PROC_NULL
&& ep
!= p
) {
740 so
->e_upid
= proc_uniqueid(ep
);
741 so
->e_pid
= proc_pid(ep
);
742 proc_getexecutableuuid(ep
, so
->e_uuid
, sizeof (so
->e_uuid
));
743 so
->so_flags
|= SOF_DELEGATED
;
746 so
->so_cred
= kauth_cred_proc_ref(p
);
747 if (!suser(kauth_cred_get(), NULL
))
748 so
->so_state
|= SS_PRIV
;
751 so
->so_rcv
.sb_flags
|= SB_RECV
;
752 so
->so_rcv
.sb_so
= so
->so_snd
.sb_so
= so
;
753 so
->next_lock_lr
= 0;
754 so
->next_unlock_lr
= 0;
756 #if CONFIG_MACF_SOCKET
757 mac_socket_label_associate(kauth_cred_get(), so
);
758 #endif /* MAC_SOCKET */
761 * Attachment will create the per pcb lock if necessary and
762 * increase refcount for creation, make sure it's done before
763 * socket is inserted in lists.
767 error
= (*prp
->pr_usrreqs
->pru_attach
)(so
, proto
, p
);
		/*
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully
		 */
774 so
->so_state
|= SS_NOFDREF
;
775 VERIFY(so
->so_usecount
> 0);
777 sofreelastref(so
, 1); /* will deallocate the socket */
781 atomic_add_32(&prp
->pr_domain
->dom_refs
, 1);
782 TAILQ_INIT(&so
->so_evlist
);
784 /* Attach socket filters for this protocol */
787 if (tcpconsdebug
== 2)
788 so
->so_options
|= SO_DEBUG
;
790 so_set_default_traffic_class(so
);
793 * If this thread or task is marked to create backgrounded sockets,
794 * mark the socket as background.
796 if (proc_get_effective_thread_policy(current_thread(),
797 TASK_POLICY_NEW_SOCKETS_BG
)) {
798 socket_set_traffic_mgt_flags(so
, TRAFFIC_MGT_SO_BACKGROUND
);
799 so
->so_background_thread
= current_thread();
804 * Don't mark Unix domain, system or multipath sockets as
805 * eligible for defunct by default.
810 so
->so_flags
|= SOF_NODEFUNCT
;
817 * Entitlements can't be checked at socket creation time except if the
818 * application requested a feature guarded by a privilege (c.f., socket
820 * The priv(9) and the Sandboxing APIs are designed with the idea that
821 * a privilege check should only be triggered by a userland request.
822 * A privilege check at socket creation time is time consuming and
823 * could trigger many authorisation error messages from the security
838 * <pru_attach>:ENOBUFS[AF_UNIX]
839 * <pru_attach>:ENOBUFS[TCP]
840 * <pru_attach>:ENOMEM[TCP]
841 * <pru_attach>:??? [other protocol families, IPSEC]
844 socreate(int dom
, struct socket
**aso
, int type
, int proto
)
846 return (socreate_internal(dom
, aso
, type
, proto
, current_proc(), 0,
851 socreate_delegate(int dom
, struct socket
**aso
, int type
, int proto
, pid_t epid
)
854 struct proc
*ep
= PROC_NULL
;
856 if ((proc_selfpid() != epid
) && ((ep
= proc_find(epid
)) == PROC_NULL
)) {
861 error
= socreate_internal(dom
, aso
, type
, proto
, current_proc(), 0, ep
);
864 * It might not be wise to hold the proc reference when calling
865 * socreate_internal since it calls soalloc with M_WAITOK
876 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
877 * <pru_bind>:EAFNOSUPPORT Address family not supported
878 * <pru_bind>:EADDRNOTAVAIL Address not available.
879 * <pru_bind>:EINVAL Invalid argument
880 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
881 * <pru_bind>:EACCES Permission denied
882 * <pru_bind>:EADDRINUSE Address in use
883 * <pru_bind>:EAGAIN Resource unavailable, try again
884 * <pru_bind>:EPERM Operation not permitted
888 * Notes: It's not possible to fully enumerate the return codes above,
889 * since socket filter authors and protocol family authors may
890 * not choose to limit their error returns to those listed, even
891 * though this may result in some software operating incorrectly.
893 * The error codes which are enumerated above are those known to
894 * be returned by the tcp_usr_bind function supplied.
897 sobindlock(struct socket
*so
, struct sockaddr
*nam
, int dolock
)
899 struct proc
*p
= current_proc();
905 so_update_last_owner_locked(so
, p
);
906 so_update_policy(so
);
909 so_update_necp_policy(so
, nam
, NULL
);
913 * If this is a bind request on a socket that has been marked
914 * as inactive, reject it now before we go any further.
916 if (so
->so_flags
& SOF_DEFUNCT
) {
918 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
919 __func__
, proc_pid(p
), proc_best_name(p
),
920 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
921 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
926 error
= sflt_bind(so
, nam
);
929 error
= (*so
->so_proto
->pr_usrreqs
->pru_bind
)(so
, nam
, p
);
932 socket_unlock(so
, 1);
934 if (error
== EJUSTRETURN
)
941 sodealloc(struct socket
*so
)
943 kauth_cred_unref(&so
->so_cred
);
945 /* Remove any filters */
949 cfil_sock_detach(so
);
950 #endif /* CONTENT_FILTER */
952 /* Delete the state allocated for msg queues on a socket */
953 if (so
->so_flags
& SOF_ENABLE_MSGS
) {
954 FREE(so
->so_msg_state
, M_TEMP
);
955 so
->so_msg_state
= NULL
;
957 VERIFY(so
->so_msg_state
== NULL
);
959 so
->so_gencnt
= OSIncrementAtomic64((SInt64
*)&so_gencnt
);
961 #if CONFIG_MACF_SOCKET
962 mac_socket_label_destroy(so
);
963 #endif /* MAC_SOCKET */
965 if (so
->so_flags1
& SOF1_CACHED_IN_SOCK_LAYER
) {
966 cached_sock_free(so
);
968 FREE_ZONE(so
, sizeof (*so
), so
->so_zone
);
976 * <pru_listen>:EINVAL[AF_UNIX]
977 * <pru_listen>:EINVAL[TCP]
978 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
979 * <pru_listen>:EINVAL[TCP] Invalid argument
980 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
981 * <pru_listen>:EACCES[TCP] Permission denied
982 * <pru_listen>:EADDRINUSE[TCP] Address in use
983 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
984 * <pru_listen>:EPERM[TCP] Operation not permitted
987 * Notes: Other <pru_listen> returns depend on the protocol family; all
988 * <sf_listen> returns depend on what the filter author causes
989 * their filter to return.
992 solisten(struct socket
*so
, int backlog
)
994 struct proc
*p
= current_proc();
999 so_update_last_owner_locked(so
, p
);
1000 so_update_policy(so
);
1003 so_update_necp_policy(so
, NULL
, NULL
);
1006 if (so
->so_proto
== NULL
) {
1010 if ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) == 0) {
1016 * If the listen request is made on a socket that is not fully
1017 * disconnected, or on a socket that has been marked as inactive,
1018 * reject the request now.
1021 (SS_ISCONNECTED
|SS_ISCONNECTING
|SS_ISDISCONNECTING
)) ||
1022 (so
->so_flags
& SOF_DEFUNCT
)) {
1024 if (so
->so_flags
& SOF_DEFUNCT
) {
1025 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1026 "(%d)\n", __func__
, proc_pid(p
),
1028 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
1029 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
1034 if ((so
->so_restrictions
& SO_RESTRICT_DENY_IN
) != 0) {
1039 error
= sflt_listen(so
);
1041 error
= (*so
->so_proto
->pr_usrreqs
->pru_listen
)(so
, p
);
1044 if (error
== EJUSTRETURN
)
1049 if (TAILQ_EMPTY(&so
->so_comp
))
1050 so
->so_options
|= SO_ACCEPTCONN
;
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue - either global or per accepting socket.  If
	 * backlog exceeds this limit, the length of the listen queue is set
	 * to the limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
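	/*
	 * Worked example (assuming the default somaxconn of SOMAXCONN, 128):
	 * the clamp below maps listen(s, -1), listen(s, 0) and listen(s, 4096)
	 * all to a queue limit of 128, while listen(s, 5) keeps a limit of 5.
	 */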
	if (backlog <= 0 || backlog > somaxconn)
		backlog = somaxconn;

	so->so_qlimit = backlog;

	socket_unlock(so, 1);
/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets. The lock ordering is first to
 * acquire the client socket before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp and so_incomp
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 */
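/*
 * Illustrative preflight/commit sketch (assumes the caller already holds the
 * listener's socket lock; the helper names are the ones defined below):
 *
 *	if (!TAILQ_EMPTY(&head->so_comp)) {          <- preflight, no list lock
 *		so_acquire_accept_list(head, NULL);  <- commit: take the lock
 *		if (!TAILQ_EMPTY(&head->so_comp)) {  <- re-check under the lock
 *			... act on the queue ...
 *		}
 *		so_release_accept_list(head);
 *	}
 */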
1099 so_acquire_accept_list(struct socket
*head
, struct socket
*so
)
1101 lck_mtx_t
*mutex_held
;
1103 if (head
->so_proto
->pr_getlock
== NULL
) {
1106 mutex_held
= (*head
->so_proto
->pr_getlock
)(head
, PR_F_WILLUNLOCK
);
1107 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1109 if (!(head
->so_flags1
& SOF1_ACCEPT_LIST_HELD
)) {
1110 head
->so_flags1
|= SOF1_ACCEPT_LIST_HELD
;
1114 socket_unlock(so
, 0);
1116 while (head
->so_flags1
& SOF1_ACCEPT_LIST_HELD
) {
1117 so_accept_list_waits
+= 1;
1118 msleep((caddr_t
)&head
->so_incomp
, mutex_held
,
1119 PSOCK
| PCATCH
, __func__
, NULL
);
1121 head
->so_flags1
|= SOF1_ACCEPT_LIST_HELD
;
1123 socket_unlock(head
, 0);
1125 socket_lock(head
, 0);
1130 so_release_accept_list(struct socket
*head
)
1132 if (head
->so_proto
->pr_getlock
!= NULL
) {
1133 lck_mtx_t
*mutex_held
;
1135 mutex_held
= (*head
->so_proto
->pr_getlock
)(head
, 0);
1136 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1138 head
->so_flags1
&= ~SOF1_ACCEPT_LIST_HELD
;
1139 wakeup((caddr_t
)&head
->so_incomp
);
1144 sofreelastref(struct socket
*so
, int dealloc
)
1146 struct socket
*head
= so
->so_head
;
1148 /* Assume socket is locked */
1150 if (!(so
->so_flags
& SOF_PCBCLEARING
) || !(so
->so_state
& SS_NOFDREF
)) {
1151 selthreadclear(&so
->so_snd
.sb_sel
);
1152 selthreadclear(&so
->so_rcv
.sb_sel
);
1153 so
->so_rcv
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
1154 so
->so_snd
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
1155 so
->so_event
= sonullevent
;
1160 * Need to lock the listener when the protocol has
1163 if (head
->so_proto
->pr_getlock
!= NULL
) {
1164 socket_lock(head
, 1);
1165 so_acquire_accept_list(head
, so
);
1167 if (so
->so_state
& SS_INCOMP
) {
1168 so
->so_state
&= ~SS_INCOMP
;
1169 TAILQ_REMOVE(&head
->so_incomp
, so
, so_list
);
1174 if (head
->so_proto
->pr_getlock
!= NULL
) {
1175 so_release_accept_list(head
);
1176 socket_unlock(head
, 1);
1178 } else if (so
->so_state
& SS_COMP
) {
1179 if (head
->so_proto
->pr_getlock
!= NULL
) {
1180 so_release_accept_list(head
);
1181 socket_unlock(head
, 1);
1184 * We must not decommission a socket that's
1185 * on the accept(2) queue. If we do, then
1186 * accept(2) may hang after select(2) indicated
1187 * that the listening socket was ready.
1189 selthreadclear(&so
->so_snd
.sb_sel
);
1190 selthreadclear(&so
->so_rcv
.sb_sel
);
1191 so
->so_rcv
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
1192 so
->so_snd
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
1193 so
->so_event
= sonullevent
;
1196 if (head
->so_proto
->pr_getlock
!= NULL
) {
1197 so_release_accept_list(head
);
1198 socket_unlock(head
, 1);
1200 printf("sofree: not queued\n");
1207 if (so
->so_flags
& SOF_FLOW_DIVERT
) {
1208 flow_divert_detach(so
);
1210 #endif /* FLOW_DIVERT */
1212 /* 3932268: disable upcall */
1213 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
1214 so
->so_snd
.sb_flags
&= ~(SB_UPCALL
|SB_SNDBYTE_CNT
);
1215 so
->so_event
= sonullevent
;
1222 soclose_wait_locked(struct socket
*so
)
1224 lck_mtx_t
*mutex_held
;
1226 if (so
->so_proto
->pr_getlock
!= NULL
)
1227 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, PR_F_WILLUNLOCK
);
1229 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1230 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1233 * Double check here and return if there's no outstanding upcall;
1234 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1236 if (!so
->so_upcallusecount
|| !(so
->so_flags
& SOF_UPCALLCLOSEWAIT
))
1238 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
1239 so
->so_snd
.sb_flags
&= ~SB_UPCALL
;
1240 so
->so_flags
|= SOF_CLOSEWAIT
;
1242 (void) msleep((caddr_t
)&so
->so_upcallusecount
, mutex_held
, (PZERO
- 1),
1243 "soclose_wait_locked", NULL
);
1244 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1245 so
->so_flags
&= ~SOF_CLOSEWAIT
;
1249 * Close a socket on last file table reference removal.
1250 * Initiate disconnect if connected.
1251 * Free socket when disconnect complete.
1254 soclose_locked(struct socket
*so
)
1259 if (so
->so_usecount
== 0) {
1260 panic("soclose: so=%p refcount=0\n", so
);
1264 sflt_notify(so
, sock_evt_closing
, NULL
);
1266 if (so
->so_upcallusecount
)
1267 soclose_wait_locked(so
);
1271 * We have to wait until the content filters are done
1273 if ((so
->so_flags
& SOF_CONTENT_FILTER
) != 0) {
1274 cfil_sock_close_wait(so
);
1275 cfil_sock_is_closed(so
);
1276 cfil_sock_detach(so
);
1278 #endif /* CONTENT_FILTER */
1280 if (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_INPROG
) {
1281 soresume(current_proc(), so
, 1);
1282 so
->so_flags1
&= ~SOF1_EXTEND_BK_IDLE_WANTED
;
1285 if ((so
->so_options
& SO_ACCEPTCONN
)) {
1286 struct socket
*sp
, *sonext
;
1287 int persocklock
= 0;
1288 int incomp_overflow_only
;
1291 * We do not want new connection to be added
1292 * to the connection queues
1294 so
->so_options
&= ~SO_ACCEPTCONN
;
1297 * We can drop the lock on the listener once
1298 * we've acquired the incoming list
1300 if (so
->so_proto
->pr_getlock
!= NULL
) {
1302 so_acquire_accept_list(so
, NULL
);
1303 socket_unlock(so
, 0);
1306 incomp_overflow_only
= 1;
1308 TAILQ_FOREACH_SAFE(sp
, &so
->so_incomp
, so_list
, sonext
) {
1311 * skip sockets thrown away by tcpdropdropblreq
1312 * they will get cleanup by the garbage collection.
1313 * otherwise, remove the incomp socket from the queue
1314 * and let soabort trigger the appropriate cleanup.
1316 if (sp
->so_flags
& SOF_OVERFLOW
)
1319 if (persocklock
!= 0)
			/*
			 * The extra reference for the list ensures the
			 * validity of the socket pointer when we perform the
			 * unlock of the head above
			 */
1328 if (sp
->so_state
& SS_INCOMP
) {
1329 sp
->so_state
&= ~SS_INCOMP
;
1331 TAILQ_REMOVE(&so
->so_incomp
, sp
, so_list
);
1337 panic("%s sp %p in so_incomp but !SS_INCOMP",
1341 if (persocklock
!= 0)
1342 socket_unlock(sp
, 1);
1345 TAILQ_FOREACH_SAFE(sp
, &so
->so_comp
, so_list
, sonext
) {
1346 /* Dequeue from so_comp since sofree() won't do it */
1347 if (persocklock
!= 0)
1350 if (sp
->so_state
& SS_COMP
) {
1351 sp
->so_state
&= ~SS_COMP
;
1353 TAILQ_REMOVE(&so
->so_comp
, sp
, so_list
);
1358 panic("%s sp %p in so_comp but !SS_COMP",
1363 socket_unlock(sp
, 1);
1366 if (incomp_overflow_only
== 0 && !TAILQ_EMPTY(&so
->so_incomp
)) {
1367 #if (DEBUG|DEVELOPMENT)
1368 panic("%s head %p so_comp not empty\n", __func__
, so
);
1369 #endif /* (DEVELOPMENT || DEBUG) */
1374 if (!TAILQ_EMPTY(&so
->so_comp
)) {
1375 #if (DEBUG|DEVELOPMENT)
1376 panic("%s head %p so_comp not empty\n", __func__
, so
);
1377 #endif /* (DEVELOPMENT || DEBUG) */
1384 so_release_accept_list(so
);
1387 if (so
->so_pcb
== NULL
) {
1388 /* 3915887: mark the socket as ready for dealloc */
1389 so
->so_flags
|= SOF_PCBCLEARING
;
1392 if (so
->so_state
& SS_ISCONNECTED
) {
1393 if ((so
->so_state
& SS_ISDISCONNECTING
) == 0) {
1394 error
= sodisconnectlocked(so
);
1398 if (so
->so_options
& SO_LINGER
) {
1399 lck_mtx_t
*mutex_held
;
1401 if ((so
->so_state
& SS_ISDISCONNECTING
) &&
1402 (so
->so_state
& SS_NBIO
))
1404 if (so
->so_proto
->pr_getlock
!= NULL
)
1405 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, PR_F_WILLUNLOCK
);
1407 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1408 while (so
->so_state
& SS_ISCONNECTED
) {
1409 ts
.tv_sec
= (so
->so_linger
/100);
1410 ts
.tv_nsec
= (so
->so_linger
% 100) *
1411 NSEC_PER_USEC
* 1000 * 10;
1412 error
= msleep((caddr_t
)&so
->so_timeo
,
1413 mutex_held
, PSOCK
| PCATCH
, "soclose", &ts
);
1416 * It's OK when the time fires,
1417 * don't report an error
1419 if (error
== EWOULDBLOCK
)
1427 if (so
->so_usecount
== 0) {
1428 panic("soclose: usecount is zero so=%p\n", so
);
1431 if (so
->so_pcb
!= NULL
&& !(so
->so_flags
& SOF_PCBCLEARING
)) {
1432 int error2
= (*so
->so_proto
->pr_usrreqs
->pru_detach
)(so
);
1436 if (so
->so_usecount
<= 0) {
1437 panic("soclose: usecount is zero so=%p\n", so
);
1441 if (so
->so_pcb
!= NULL
&& !(so
->so_flags
& SOF_MP_SUBFLOW
) &&
1442 (so
->so_state
& SS_NOFDREF
)) {
1443 panic("soclose: NOFDREF");
1446 so
->so_state
|= SS_NOFDREF
;
1448 if ((so
->so_flags
& SOF_KNOTE
) != 0)
1449 KNOTE(&so
->so_klist
, SO_FILT_HINT_LOCKED
);
1451 atomic_add_32(&so
->so_proto
->pr_domain
->dom_refs
, -1);
1454 VERIFY(so
->so_usecount
> 0);
1461 soclose(struct socket
*so
)
1466 if (so
->so_retaincnt
== 0) {
1467 error
= soclose_locked(so
);
1470 * if the FD is going away, but socket is
1471 * retained in kernel remove its reference
1474 if (so
->so_usecount
< 2)
1475 panic("soclose: retaincnt non null and so=%p "
1476 "usecount=%d\n", so
, so
->so_usecount
);
1478 socket_unlock(so
, 1);
1483 * Must be called at splnet...
1485 /* Should already be locked */
1487 soabort(struct socket
*so
)
1491 #ifdef MORE_LOCKING_DEBUG
1492 lck_mtx_t
*mutex_held
;
1494 if (so
->so_proto
->pr_getlock
!= NULL
)
1495 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
1497 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1498 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1501 if ((so
->so_flags
& SOF_ABORTED
) == 0) {
1502 so
->so_flags
|= SOF_ABORTED
;
1503 error
= (*so
->so_proto
->pr_usrreqs
->pru_abort
)(so
);
1513 soacceptlock(struct socket
*so
, struct sockaddr
**nam
, int dolock
)
1520 so_update_last_owner_locked(so
, PROC_NULL
);
1521 so_update_policy(so
);
1523 so_update_necp_policy(so
, NULL
, NULL
);
1526 if ((so
->so_state
& SS_NOFDREF
) == 0)
1527 panic("soaccept: !NOFDREF");
1528 so
->so_state
&= ~SS_NOFDREF
;
1529 error
= (*so
->so_proto
->pr_usrreqs
->pru_accept
)(so
, nam
);
1532 socket_unlock(so
, 1);
1537 soaccept(struct socket
*so
, struct sockaddr
**nam
)
1539 return (soacceptlock(so
, nam
, 1));
1543 soacceptfilter(struct socket
*so
, struct socket
*head
)
1545 struct sockaddr
*local
= NULL
, *remote
= NULL
;
1549 * Hold the lock even if this socket has not been made visible
1550 * to the filter(s). For sockets with global locks, this protects
1551 * against the head or peer going away
1554 if (sogetaddr_locked(so
, &remote
, 1) != 0 ||
1555 sogetaddr_locked(so
, &local
, 0) != 0) {
1556 so
->so_state
&= ~SS_NOFDREF
;
1557 socket_unlock(so
, 1);
1559 /* Out of resources; try it again next time */
1560 error
= ECONNABORTED
;
1564 error
= sflt_accept(head
, so
, local
, remote
);
1567 * If we get EJUSTRETURN from one of the filters, mark this socket
1568 * as inactive and return it anyway. This newly accepted socket
1569 * will be disconnected later before we hand it off to the caller.
1571 if (error
== EJUSTRETURN
) {
1573 (void) sosetdefunct(current_proc(), so
,
1574 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL
, FALSE
);
1579 * This may seem like a duplication to the above error
1580 * handling part when we return ECONNABORTED, except
1581 * the following is done while holding the lock since
1582 * the socket has been exposed to the filter(s) earlier.
1584 so
->so_state
&= ~SS_NOFDREF
;
1585 socket_unlock(so
, 1);
1587 /* Propagate socket filter's error code to the caller */
1589 socket_unlock(so
, 1);
1592 /* Callee checks for NULL pointer */
1593 sock_freeaddr(remote
);
1594 sock_freeaddr(local
);
1599 * Returns: 0 Success
1600 * EOPNOTSUPP Operation not supported on socket
1601 * EISCONN Socket is connected
1602 * <pru_connect>:EADDRNOTAVAIL Address not available.
1603 * <pru_connect>:EINVAL Invalid argument
1604 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1605 * <pru_connect>:EACCES Permission denied
1606 * <pru_connect>:EADDRINUSE Address in use
1607 * <pru_connect>:EAGAIN Resource unavailable, try again
1608 * <pru_connect>:EPERM Operation not permitted
1609 * <sf_connect_out>:??? [anything a filter writer might set]
1612 soconnectlock(struct socket
*so
, struct sockaddr
*nam
, int dolock
)
1615 struct proc
*p
= current_proc();
1620 so_update_last_owner_locked(so
, p
);
1621 so_update_policy(so
);
1624 so_update_necp_policy(so
, NULL
, nam
);
1628 * If this is a listening socket or if this is a previously-accepted
1629 * socket that has been marked as inactive, reject the connect request.
1631 if ((so
->so_options
& SO_ACCEPTCONN
) || (so
->so_flags
& SOF_DEFUNCT
)) {
1633 if (so
->so_flags
& SOF_DEFUNCT
) {
1634 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1635 "(%d)\n", __func__
, proc_pid(p
),
1637 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
1638 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
1641 socket_unlock(so
, 1);
1645 if ((so
->so_restrictions
& SO_RESTRICT_DENY_OUT
) != 0) {
1647 socket_unlock(so
, 1);
1652 * If protocol is connection-based, can only connect once.
1653 * Otherwise, if connected, try to disconnect first.
1654 * This allows user to disconnect by connecting to, e.g.,
1657 if (so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
) &&
1658 ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
1659 (error
= sodisconnectlocked(so
)))) {
1663 * Run connect filter before calling protocol:
1664 * - non-blocking connect returns before completion;
1666 error
= sflt_connectout(so
, nam
);
1668 if (error
== EJUSTRETURN
)
1671 error
= (*so
->so_proto
->pr_usrreqs
->pru_connect
)
1676 socket_unlock(so
, 1);
1681 soconnect(struct socket
*so
, struct sockaddr
*nam
)
1683 return (soconnectlock(so
, nam
, 1));
1687 * Returns: 0 Success
1688 * <pru_connect2>:EINVAL[AF_UNIX]
1689 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1690 * <pru_connect2>:??? [other protocol families]
1692 * Notes: <pru_connect2> is not supported by [TCP].
1695 soconnect2(struct socket
*so1
, struct socket
*so2
)
1699 socket_lock(so1
, 1);
1700 if (so2
->so_proto
->pr_lock
)
1701 socket_lock(so2
, 1);
1703 error
= (*so1
->so_proto
->pr_usrreqs
->pru_connect2
)(so1
, so2
);
1705 socket_unlock(so1
, 1);
1706 if (so2
->so_proto
->pr_lock
)
1707 socket_unlock(so2
, 1);
1712 soconnectxlocked(struct socket
*so
, struct sockaddr
*src
,
1713 struct sockaddr
*dst
, struct proc
*p
, uint32_t ifscope
,
1714 sae_associd_t aid
, sae_connid_t
*pcid
, uint32_t flags
, void *arg
,
1715 uint32_t arglen
, uio_t auio
, user_ssize_t
*bytes_written
)
1719 so_update_last_owner_locked(so
, p
);
1720 so_update_policy(so
);
1723 * If this is a listening socket or if this is a previously-accepted
1724 * socket that has been marked as inactive, reject the connect request.
1726 if ((so
->so_options
& SO_ACCEPTCONN
) || (so
->so_flags
& SOF_DEFUNCT
)) {
1728 if (so
->so_flags
& SOF_DEFUNCT
) {
1729 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1730 "(%d)\n", __func__
, proc_pid(p
),
1732 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
1733 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
1738 if ((so
->so_restrictions
& SO_RESTRICT_DENY_OUT
) != 0)
1742 * If protocol is connection-based, can only connect once
1743 * unless PR_MULTICONN is set. Otherwise, if connected,
1744 * try to disconnect first. This allows user to disconnect
1745 * by connecting to, e.g., a null address.
1747 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) &&
1748 !(so
->so_proto
->pr_flags
& PR_MULTICONN
) &&
1749 ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
1750 (error
= sodisconnectlocked(so
)) != 0)) {
1754 * Run connect filter before calling protocol:
1755 * - non-blocking connect returns before completion;
1757 error
= sflt_connectout(so
, dst
);
1759 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1760 so
->so_flags1
&= ~SOF1_PRECONNECT_DATA
;
1761 if (error
== EJUSTRETURN
)
1764 error
= (*so
->so_proto
->pr_usrreqs
->pru_connectx
)
1765 (so
, src
, dst
, p
, ifscope
, aid
, pcid
,
1766 flags
, arg
, arglen
, auio
, bytes_written
);
1774 sodisconnectlocked(struct socket
*so
)
1778 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
1782 if (so
->so_state
& SS_ISDISCONNECTING
) {
1787 error
= (*so
->so_proto
->pr_usrreqs
->pru_disconnect
)(so
);
1789 sflt_notify(so
, sock_evt_disconnected
, NULL
);
1795 /* Locking version */
1797 sodisconnect(struct socket
*so
)
1802 error
= sodisconnectlocked(so
);
1803 socket_unlock(so
, 1);
1808 sodisconnectxlocked(struct socket
*so
, sae_associd_t aid
, sae_connid_t cid
)
1813 * Call the protocol disconnectx handler; let it handle all
1814 * matters related to the connection state of this session.
1816 error
= (*so
->so_proto
->pr_usrreqs
->pru_disconnectx
)(so
, aid
, cid
);
1819 * The event applies only for the session, not for
1820 * the disconnection of individual subflows.
1822 if (so
->so_state
& (SS_ISDISCONNECTING
|SS_ISDISCONNECTED
))
1823 sflt_notify(so
, sock_evt_disconnected
, NULL
);
1829 sodisconnectx(struct socket
*so
, sae_associd_t aid
, sae_connid_t cid
)
1834 error
= sodisconnectxlocked(so
, aid
, cid
);
1835 socket_unlock(so
, 1);
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
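/*
 * Example: a caller that passed MSG_DONTWAIT gets SBLOCKWAIT(flags) == 0, so
 * sblock() will not sleep for the send buffer lock and can fail with
 * EWOULDBLOCK (see sosendcheck() below); any other caller gets SBL_WAIT and
 * may block until the lock becomes available.
 */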
1842 * sosendcheck will lock the socket buffer if it isn't locked and
1843 * verify that there is space for the data being inserted.
1845 * Returns: 0 Success
1847 * sblock:EWOULDBLOCK
1854 sosendcheck(struct socket
*so
, struct sockaddr
*addr
, user_ssize_t resid
,
1855 int32_t clen
, int32_t atomic
, int flags
, int *sblocked
,
1856 struct mbuf
*control
)
1863 if (*sblocked
== 0) {
1864 if ((so
->so_snd
.sb_flags
& SB_LOCK
) != 0 &&
1865 so
->so_send_filt_thread
!= 0 &&
1866 so
->so_send_filt_thread
== current_thread()) {
1868 * We're being called recursively from a filter,
1869 * allow this to continue. Radar 4150520.
1870 * Don't set sblocked because we don't want
1871 * to perform an unlock later.
1875 error
= sblock(&so
->so_snd
, SBLOCKWAIT(flags
));
1877 if (so
->so_flags
& SOF_DEFUNCT
)
1886 * If a send attempt is made on a socket that has been marked
1887 * as inactive (disconnected), reject the request.
1889 if (so
->so_flags
& SOF_DEFUNCT
) {
1892 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1893 __func__
, proc_selfpid(), proc_best_name(current_proc()),
1894 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
1895 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
1899 if (so
->so_state
& SS_CANTSENDMORE
) {
1902 * Can re-inject data of half closed connections
1904 if ((so
->so_state
& SS_ISDISCONNECTED
) == 0 &&
1905 so
->so_snd
.sb_cfil_thread
== current_thread() &&
1906 cfil_sock_data_pending(&so
->so_snd
) != 0)
1908 "so %llx ignore SS_CANTSENDMORE",
1909 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
));
1911 #endif /* CONTENT_FILTER */
1915 error
= so
->so_error
;
1920 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
1921 if ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) != 0) {
1922 if (((so
->so_state
& SS_ISCONFIRMING
) == 0) &&
1923 (resid
!= 0 || clen
== 0) &&
1924 !(so
->so_flags1
& SOF1_PRECONNECT_DATA
))
1927 } else if (addr
== 0 && !(flags
&MSG_HOLD
)) {
1928 return ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ?
1929 ENOTCONN
: EDESTADDRREQ
);
1933 if (so
->so_flags
& SOF_ENABLE_MSGS
)
1934 space
= msgq_sbspace(so
, control
);
1936 space
= sbspace(&so
->so_snd
);
1938 if (flags
& MSG_OOB
)
1940 if ((atomic
&& resid
> so
->so_snd
.sb_hiwat
) ||
1941 clen
> so
->so_snd
.sb_hiwat
)
1944 if ((space
< resid
+ clen
&&
1945 (atomic
|| (space
< (int32_t)so
->so_snd
.sb_lowat
) ||
1947 (so
->so_type
== SOCK_STREAM
&& so_wait_for_if_feedback(so
))) {
1949 * don't block the connectx call when there's more data
1950 * than can be copied.
1952 if (so
->so_flags1
& SOF1_PRECONNECT_DATA
) {
1954 return (EWOULDBLOCK
);
1956 if (space
< (int32_t)so
->so_snd
.sb_lowat
) {
1960 if ((so
->so_state
& SS_NBIO
) || (flags
& MSG_NBIO
) ||
1962 return (EWOULDBLOCK
);
1964 sbunlock(&so
->so_snd
, TRUE
); /* keep socket locked */
1966 error
= sbwait(&so
->so_snd
);
1968 if (so
->so_flags
& SOF_DEFUNCT
)
1979 * If send must go all at once and message is larger than
1980 * send buffering, then hard error.
1981 * Lock against other senders.
1982 * If must go all at once and not enough room now, then
1983 * inform user that this would block and do nothing.
1984 * Otherwise, if nonblocking, send as much as possible.
1985 * The data to be sent is described by "uio" if nonzero,
1986 * otherwise by the mbuf chain "top" (which must be null
1987 * if uio is not). Data provided in mbuf chain must be small
1988 * enough to send all at once.
1990 * Returns nonzero on error, timeout or signal; callers
1991 * must check for short counts if EINTR/ERESTART are returned.
1992 * Data and control buffers are freed on return.
1994 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1995 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1996 * point at the mbuf chain being constructed and go from there.
1998 * Returns: 0 Success
2004 * sosendcheck:EWOULDBLOCK
2008 * sosendcheck:??? [value from so_error]
2009 * <pru_send>:ECONNRESET[TCP]
2010 * <pru_send>:EINVAL[TCP]
2011 * <pru_send>:ENOBUFS[TCP]
2012 * <pru_send>:EADDRINUSE[TCP]
2013 * <pru_send>:EADDRNOTAVAIL[TCP]
2014 * <pru_send>:EAFNOSUPPORT[TCP]
2015 * <pru_send>:EACCES[TCP]
2016 * <pru_send>:EAGAIN[TCP]
2017 * <pru_send>:EPERM[TCP]
2018 * <pru_send>:EMSGSIZE[TCP]
2019 * <pru_send>:EHOSTUNREACH[TCP]
2020 * <pru_send>:ENETUNREACH[TCP]
2021 * <pru_send>:ENETDOWN[TCP]
2022 * <pru_send>:ENOMEM[TCP]
2023 * <pru_send>:ENOBUFS[TCP]
2024 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2025 * <pru_send>:EINVAL[AF_UNIX]
2026 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2027 * <pru_send>:EPIPE[AF_UNIX]
2028 * <pru_send>:ENOTCONN[AF_UNIX]
2029 * <pru_send>:EISCONN[AF_UNIX]
2030 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2031 * <sf_data_out>:??? [whatever a filter author chooses]
2033 * Notes: Other <pru_send> returns depend on the protocol family; all
2034 * <sf_data_out> returns depend on what the filter author causes
2035 * their filter to return.
2038 sosend(struct socket
*so
, struct sockaddr
*addr
, struct uio
*uio
,
2039 struct mbuf
*top
, struct mbuf
*control
, int flags
)
2042 struct mbuf
*m
, *freelist
= NULL
;
2043 user_ssize_t space
, len
, resid
, orig_resid
;
2044 int clen
= 0, error
, dontroute
, mlen
, sendflags
;
2045 int atomic
= sosendallatonce(so
) || top
;
2047 struct proc
*p
= current_proc();
2048 struct mbuf
*control_copy
= NULL
;
2049 uint16_t headroom
= 0;
2050 boolean_t en_tracing
= FALSE
;
2053 resid
= uio_resid(uio
);
2055 resid
= top
->m_pkthdr
.len
;
2057 KERNEL_DEBUG((DBG_FNC_SOSEND
| DBG_FUNC_START
), so
, resid
,
2058 so
->so_snd
.sb_cc
, so
->so_snd
.sb_lowat
, so
->so_snd
.sb_hiwat
);
2063 * trace if tracing & network (vs. unix) sockets & and
2066 if (ENTR_SHOULDTRACE
&&
2067 (SOCK_CHECK_DOM(so
, AF_INET
) || SOCK_CHECK_DOM(so
, AF_INET6
))) {
2068 struct inpcb
*inp
= sotoinpcb(so
);
2069 if (inp
->inp_last_outifp
!= NULL
&&
2070 !(inp
->inp_last_outifp
->if_flags
& IFF_LOOPBACK
)) {
2072 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite
, DBG_FUNC_START
,
2073 VM_KERNEL_ADDRPERM(so
),
2074 ((so
->so_state
& SS_NBIO
) ? kEnTrFlagNonBlocking
: 0),
2081 * Re-injection should not affect process accounting
2083 if ((flags
& MSG_SKIPCFIL
) == 0) {
2084 so_update_last_owner_locked(so
, p
);
2085 so_update_policy(so
);
2088 so_update_necp_policy(so
, NULL
, addr
);
2092 if (so
->so_type
!= SOCK_STREAM
&& (flags
& MSG_OOB
) != 0) {
2098 * In theory resid should be unsigned.
2099 * However, space must be signed, as it might be less than 0
2100 * if we over-committed, and we must use a signed comparison
2101 * of space and resid. On the other hand, a negative resid
2102 * causes us to loop sending 0-length segments to the protocol.
2104 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2105 * But it will be used by sockets doing message delivery.
2107 * Note: We limit resid to be a positive int value as we use
2108 * imin() to set bytes_to_copy -- radr://14558484
2110 if (resid
< 0 || resid
> INT_MAX
|| (so
->so_type
== SOCK_STREAM
&&
2111 !(so
->so_flags
& SOF_ENABLE_MSGS
) && (flags
& MSG_EOR
))) {
2116 dontroute
= (flags
& MSG_DONTROUTE
) &&
2117 (so
->so_options
& SO_DONTROUTE
) == 0 &&
2118 (so
->so_proto
->pr_flags
& PR_ATOMIC
);
2119 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgsnd
);
2121 if (control
!= NULL
)
2122 clen
= control
->m_len
;
2124 if (soreserveheadroom
!= 0)
2125 headroom
= so
->so_pktheadroom
;
2128 error
= sosendcheck(so
, addr
, resid
, clen
, atomic
, flags
,
2129 &sblocked
, control
);
2134 if (so
->so_flags
& SOF_ENABLE_MSGS
)
2135 space
= msgq_sbspace(so
, control
);
2137 space
= sbspace(&so
->so_snd
) - clen
;
2138 space
+= ((flags
& MSG_OOB
) ? 1024 : 0);
            /* Data is prepackaged in "top". */
            if (flags & MSG_EOR)
                top->m_flags |= M_EOR;

            bytes_to_copy = imin(resid, space);

            bytes_to_alloc = bytes_to_copy;
                bytes_to_alloc += headroom;

            if (sosendminchain > 0)
                chainlength = sosendmaxchain;

            /*
             * Use big 4 KB cluster when the outgoing interface
             * does not prefer 2 KB clusters
             */
            bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
                sosendbigcl_ignore_capab;

            /*
             * Attempt to use larger than system page-size
             * clusters for large writes only if there is
             * a jumbo cluster pool and if the socket is
             * marked accordingly.
             */
            jumbocl = sosendjcl && njcl > 0 &&
                ((so->so_flags & SOF_MULTIPAGES) ||
                sosendjcl_ignore_capab) &&

            socket_unlock(so, 0);

                int hdrs_needed = (top == NULL) ? 1 : 0;
                /*
                 * Try to maintain a local cache of mbuf
                 * clusters needed to complete this write.
                 * The list is further limited to the number
                 * currently needed to fill the socket; this
                 * mechanism allows a large number of mbufs/
                 * clusters to be grabbed under a single mbuf
                 * lock.  If we can't get any clusters, then
                 * fall back to trying for mbufs.  If we fail
                 * early (or miscalculate the number needed),
                 * make sure to release any clusters we
                 * haven't yet consumed.  (An illustrative
                 * sketch of the size ladder follows sosend()
                 * below.)
                 */
                if (freelist == NULL &&
                    bytes_to_alloc > MBIGCLBYTES &&
                        bytes_to_alloc / M16KCLBYTES;
                    if ((bytes_to_alloc -
                        (num_needed * M16KCLBYTES))
                        m_getpackets_internal(
                        (unsigned int *)&num_needed,
                        hdrs_needed, M_WAIT, 0,
                    /*
                     * Fall back to 4K cluster size
                     * if allocation failed
                     */

                if (freelist == NULL &&
                    bytes_to_alloc > MCLBYTES &&
                        bytes_to_alloc / MBIGCLBYTES;
                    if ((bytes_to_alloc -
                        (num_needed * MBIGCLBYTES)) >=
                        m_getpackets_internal(
                        (unsigned int *)&num_needed,
                        hdrs_needed, M_WAIT, 0,
                    /*
                     * Fall back to cluster size
                     * if allocation failed
                     */

                /*
                 * Allocate a cluster as we want to avoid
                 * splitting the data across more than one
                 * segment; using MINCLSIZE would lead us
                 * to allocate two mbufs.
                 */
                if (soreserveheadroom != 0 &&
                    bytes_to_alloc > _MHLEN) ||
                    bytes_to_alloc > _MLEN)) {
                    num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
                        m_getpackets_internal(
                        (unsigned int *)&num_needed,
                        hdrs_needed, M_WAIT, 0,
                    /*
                     * Fall back to a single mbuf
                     * if allocation failed
                     */
                } else if (freelist == NULL &&
                    bytes_to_alloc > MINCLSIZE) {
                        bytes_to_alloc / MCLBYTES;
                    if ((bytes_to_alloc -
                        (num_needed * MCLBYTES)) >=
                        m_getpackets_internal(
                        (unsigned int *)&num_needed,
                        hdrs_needed, M_WAIT, 0,
                    /*
                     * Fall back to a single mbuf
                     * if allocation failed
                     */

                /*
                 * For datagram protocols, leave
                 * headroom for protocol headers
                 * in the first cluster of the chain
                 */
                if (freelist != NULL && atomic &&
                    top == NULL && headroom > 0) {
                    freelist->m_data += headroom;

                /*
                 * Fall back to regular mbufs without
                 * reserving the socket headroom
                 */
                if (freelist == NULL) {
                    if (freelist == NULL) {
                        /*
                         * For datagram protocols,
                         * leave room for protocol
                         * headers in first mbuf.
                         */
                        if (atomic && top == NULL &&
                            bytes_to_copy < MHLEN) {

                freelist = m->m_next;

                if ((m->m_flags & M_EXT))
                    mlen = m->m_ext.ext_size -
                else if ((m->m_flags & M_PKTHDR))
                        MHLEN - m_leadingspace(m);
                    mlen = MLEN - m_leadingspace(m);
                len = imin(mlen, bytes_to_copy);

                    error = uiomove(mtod(m, caddr_t),
                    resid = uio_resid(uio);
                    top->m_pkthdr.len += len;

                if (flags & MSG_EOR)
                    top->m_flags |= M_EOR;

                bytes_to_copy = min(resid, space);
            } while (space > 0 &&
                (chainlength < sosendmaxchain || atomic ||
                resid < MINCLSIZE));
        if (flags & (MSG_HOLD|MSG_SEND)) {
            /* Enqueue for later, go away if HOLD */
            if (so->so_temp && (flags & MSG_FLUSH)) {
                m_freem(so->so_temp);
                so->so_tail->m_next = top;
            if (flags & MSG_HOLD) {

            so->so_options |= SO_DONTROUTE;

        /*
         * Compute flags here, for pru_send and NKEs.
         *
         * If the user set MSG_EOF, the protocol understands this
         * flag, and there is nothing left to send, then use
         * PRU_SEND_EOF instead of PRU_SEND.
         */
        sendflags = (flags & MSG_OOB) ? PRUS_OOB :
            ((flags & MSG_EOF) &&
            (so->so_proto->pr_flags & PR_IMPLOPCL) &&
            (resid <= 0)) ? PRUS_EOF :
            /* If there is more to send set PRUS_MORETOCOME */
            (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
        if ((flags & MSG_SKIPCFIL) == 0) {
            /* Socket filter processing */
            error = sflt_data_out(so, addr, &top,
                &control, (sendflags & MSG_OOB) ?
                sock_data_filt_flag_oob : 0);
            if (error == EJUSTRETURN) {

            /* Content filter processing */
            error = cfil_sock_data_out(so, addr, top,
                control, (sendflags & MSG_OOB) ?
                sock_data_filt_flag_oob : 0);
            if (error == EJUSTRETURN) {
#endif /* CONTENT_FILTER */

        if (so->so_flags & SOF_ENABLE_MSGS) {
            /*
             * Make a copy of control mbuf,
             * so that msg priority can be
             * passed to subsequent mbufs.
             */
            control_copy = m_dup(control, M_NOWAIT);

        error = (*so->so_proto->pr_usrreqs->pru_send)
            (so, sendflags, top, addr, control, p);

        if (flags & MSG_SEND)

            so->so_options &= ~SO_DONTROUTE;

            control = control_copy;
            control_copy = NULL;
    } while (resid && space > 0);

        sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
        socket_unlock(so, 1);

    if (control != NULL)
    if (freelist != NULL)
        m_freem_list(freelist);
    if (control_copy != NULL)
        m_freem(control_copy);

        soclearfastopen(so);

        /* resid passed here is the bytes left in uio */
        KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
            VM_KERNEL_ADDRPERM(so),
            ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
            (int64_t)(orig_resid - resid));

    KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
        so->so_snd.sb_cc, space, error);
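/*
 * A minimal sketch (not built) of the cluster-size ladder walked by the
 * allocation code inside sosend()'s copy loop: 16 KB jumbo clusters first
 * when the socket and pool allow it, then 4 KB clusters, then 2 KB clusters,
 * then a plain mbuf.  The headroom handling and allocation-failure fallbacks
 * of the real code are deliberately omitted; the helper name is hypothetical.
 */
#if 0
static unsigned int
example_cluster_size(unsigned int bytes_to_alloc, int jumbocl, int bigcl)
{
    if (jumbocl && bytes_to_alloc > MBIGCLBYTES)
        return (M16KCLBYTES);   /* jumbo cluster pool */
    if (bigcl && bytes_to_alloc > MCLBYTES)
        return (MBIGCLBYTES);   /* big (4 KB) cluster */
    if (bytes_to_alloc > MHLEN)
        return (MCLBYTES);      /* regular (2 KB) cluster */
    return (MHLEN);             /* fits in a single mbuf */
}
#endif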
/*
 * Only supported for connected sockets (no address) without ancillary
 * data (control mbuf), and only for atomic protocols.
 */
sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
    struct mbuf *m, *freelist = NULL;
    user_ssize_t len, resid;
    int error, dontroute, mlen;
    int atomic = sosendallatonce(so);
    struct proc *p = current_proc();
    struct mbuf *top = NULL;
    uint16_t headroom = 0;
2522 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST
| DBG_FUNC_START
), so
, uiocnt
,
2523 so
->so_snd
.sb_cc
, so
->so_snd
.sb_lowat
, so
->so_snd
.sb_hiwat
);
2525 if (so
->so_type
!= SOCK_DGRAM
) {
2533 if (so
->so_proto
->pr_usrreqs
->pru_send_list
== NULL
) {
2534 error
= EPROTONOSUPPORT
;
2537 if (flags
& ~(MSG_DONTWAIT
| MSG_NBIO
)) {
2541 resid
= uio_array_resid(uioarray
, uiocnt
);
2544 * In theory resid should be unsigned.
2545 * However, space must be signed, as it might be less than 0
2546 * if we over-committed, and we must use a signed comparison
2547 * of space and resid. On the other hand, a negative resid
2548 * causes us to loop sending 0-length segments to the protocol.
2550 * Note: We limit resid to be a positive int value as we use
2551 * imin() to set bytes_to_copy -- radr://14558484
2553 if (resid
< 0 || resid
> INT_MAX
) {
2559 so_update_last_owner_locked(so
, p
);
2560 so_update_policy(so
);
2563 so_update_necp_policy(so
, NULL
, NULL
);
2566 dontroute
= (flags
& MSG_DONTROUTE
) &&
2567 (so
->so_options
& SO_DONTROUTE
) == 0 &&
2568 (so
->so_proto
->pr_flags
& PR_ATOMIC
);
2569 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgsnd
);
2571 error
= sosendcheck(so
, NULL
, resid
, 0, atomic
, flags
,
2577 * Use big 4 KB clusters when the outgoing interface does not prefer
2580 bigcl
= !(so
->so_flags1
& SOF1_IF_2KCL
) || sosendbigcl_ignore_capab
;
2582 if (soreserveheadroom
!= 0)
2583 headroom
= so
->so_pktheadroom
;
2589 size_t maxpktlen
= 0;
2592 if (sosendminchain
> 0)
2595 chainlength
= sosendmaxchain
;
2597 socket_unlock(so
, 0);
2600 * Find a set of uio that fit in a reasonable number
2603 for (i
= uiofirst
; i
< uiocnt
; i
++) {
2604 struct uio
*auio
= uioarray
[i
];
2606 len
= uio_resid(auio
);
2608 /* Do nothing for empty messages */
2615 if (len
> maxpktlen
)
2619 if (chainlength
> sosendmaxchain
)
2623 * Nothing left to send
2625 if (num_needed
== 0) {
2630 * Allocate buffer large enough to include headroom space for
2631 * network and link header
2634 bytes_to_alloc
= maxpktlen
+ headroom
;
2637 * Allocate a single contiguous buffer of the smallest available
2638 * size when possible
2640 if (bytes_to_alloc
> MCLBYTES
&&
2641 bytes_to_alloc
<= MBIGCLBYTES
&& bigcl
) {
2642 freelist
= m_getpackets_internal(
2643 (unsigned int *)&num_needed
,
2644 num_needed
, M_WAIT
, 1,
2646 } else if (bytes_to_alloc
> _MHLEN
&&
2647 bytes_to_alloc
<= MCLBYTES
) {
2648 freelist
= m_getpackets_internal(
2649 (unsigned int *)&num_needed
,
2650 num_needed
, M_WAIT
, 1,
2653 freelist
= m_allocpacket_internal(
2654 (unsigned int *)&num_needed
,
2655 bytes_to_alloc
, NULL
, M_WAIT
, 1, 0);
2658 if (freelist
== NULL
) {
2664 * Copy each uio of the set into its own mbuf packet
2666 for (i
= uiofirst
, m
= freelist
;
2667 i
< uiolast
&& m
!= NULL
;
2671 struct uio
*auio
= uioarray
[i
];
2673 bytes_to_copy
= uio_resid(auio
);
2675 /* Do nothing for empty messages */
2676 if (bytes_to_copy
== 0)
2679 * Leave headroom for protocol headers
2680 * in the first mbuf of the chain
2682 m
->m_data
+= headroom
;
2684 for (n
= m
; n
!= NULL
; n
= n
->m_next
) {
2685 if ((m
->m_flags
& M_EXT
))
2686 mlen
= m
->m_ext
.ext_size
-
2688 else if ((m
->m_flags
& M_PKTHDR
))
2690 MHLEN
- m_leadingspace(m
);
2692 mlen
= MLEN
- m_leadingspace(m
);
2693 len
= imin(mlen
, bytes_to_copy
);
2696 * Note: uiomove() decrements the iovec
2699 error
= uiomove(mtod(n
, caddr_t
),
2704 m
->m_pkthdr
.len
+= len
;
2706 VERIFY(m
->m_pkthdr
.len
<= maxpktlen
);
2708 bytes_to_copy
-= len
;
2711 if (m
->m_pkthdr
.len
== 0) {
2713 "%s:%d so %llx pkt %llx type %u len null\n",
2715 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
2716 (uint64_t)DEBUG_KERNEL_ADDRPERM(m
),
2732 so
->so_options
|= SO_DONTROUTE
;
2734 if ((flags
& MSG_SKIPCFIL
) == 0) {
2735 struct mbuf
**prevnextp
= NULL
;
2737 for (i
= uiofirst
, m
= top
;
2738 i
< uiolast
&& m
!= NULL
;
2740 struct mbuf
*nextpkt
= m
->m_nextpkt
;
2743 * Socket filter processing
2745 error
= sflt_data_out(so
, NULL
, &m
,
2747 if (error
!= 0 && error
!= EJUSTRETURN
)
2753 * Content filter processing
2755 error
= cfil_sock_data_out(so
, NULL
, m
,
2757 if (error
!= 0 && error
!= EJUSTRETURN
)
2760 #endif /* CONTENT_FILTER */
2762 * Remove packet from the list when
2763 * swallowed by a filter
2765 if (error
== EJUSTRETURN
) {
2767 if (prevnextp
!= NULL
)
2768 *prevnextp
= nextpkt
;
2775 prevnextp
= &m
->m_nextpkt
;
2779 error
= (*so
->so_proto
->pr_usrreqs
->pru_send_list
)
2780 (so
, 0, top
, NULL
, NULL
, p
);
2783 so
->so_options
&= ~SO_DONTROUTE
;
2787 } while (resid
> 0 && error
== 0);
2790 sbunlock(&so
->so_snd
, FALSE
); /* will unlock socket */
2792 socket_unlock(so
, 1);
2796 if (freelist
!= NULL
)
2797 m_freem_list(freelist
);
2799 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST
| DBG_FUNC_END
, so
, resid
,
2800 so
->so_snd
.sb_cc
, 0, error
);
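/*
 * Illustrative userspace sketch (not built): the per-datagram loop that
 * sosend_list() effectively batches.  Each sendmsg(2) call below takes and
 * drops the socket locks on its own; the batched path above queues the whole
 * array under a single send-buffer lock and hands it to the protocol with one
 * pru_send_list call.  "fd" is a hypothetical connected datagram socket and
 * the helper name is made up.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int
example_send_array(int fd, struct iovec *iov, unsigned int cnt)
{
    unsigned int i;

    for (i = 0; i < cnt; i++) {
        struct msghdr msg;

        memset(&msg, 0, sizeof (msg));
        msg.msg_iov = &iov[i];
        msg.msg_iovlen = 1;
        if (sendmsg(fd, &msg, 0) == -1)
            return (-1);
    }
    return (0);
}
#endif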
/*
 * May return ERESTART when packet is dropped by MAC policy check
 */
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
    struct mbuf *m = *mp;
    struct mbuf *nextrecord = *nextrecordp;

    KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
    /*
     * Call the MAC framework for policy checking if we're in
     * the user process context and the socket isn't connected.
     */
    if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
        struct mbuf *m0 = m;
        /*
         * Dequeue this record (temporarily) from the receive
         * list since we're about to drop the socket's lock
         * where a new record may arrive and be appended to
         * the list.  Upon MAC policy failure, the record
         * will be freed.  Otherwise, we'll add it back to
         * the head of the list.  We cannot rely on SB_LOCK
         * because append operation uses the socket's lock.
         */
2834 m
->m_nextpkt
= NULL
;
2835 sbfree(&so
->so_rcv
, m
);
2837 } while (m
!= NULL
);
2839 so
->so_rcv
.sb_mb
= nextrecord
;
2840 SB_EMPTY_FIXUP(&so
->so_rcv
);
2841 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1a");
2842 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1a");
2843 socket_unlock(so
, 0);
2845 if (mac_socket_check_received(proc_ucred(p
), so
,
2846 mtod(m
, struct sockaddr
*)) != 0) {
2848 * MAC policy failure; free this record and
2849 * process the next record (or block until
2850 * one is available). We have adjusted sb_cc
2851 * and sb_mbcnt above so there is no need to
2852 * call sbfree() again.
2856 * Clear SB_LOCK but don't unlock the socket.
2857 * Process the next record or wait for one.
2860 sbunlock(&so
->so_rcv
, TRUE
); /* stay locked */
2866 * If the socket has been defunct'd, drop it.
2868 if (so
->so_flags
& SOF_DEFUNCT
) {
2874 * Re-adjust the socket receive list and re-enqueue
2875 * the record in front of any packets which may have
2876 * been appended while we dropped the lock.
2878 for (m
= m0
; m
->m_next
!= NULL
; m
= m
->m_next
)
2879 sballoc(&so
->so_rcv
, m
);
2880 sballoc(&so
->so_rcv
, m
);
2881 if (so
->so_rcv
.sb_mb
== NULL
) {
2882 so
->so_rcv
.sb_lastrecord
= m0
;
2883 so
->so_rcv
.sb_mbtail
= m
;
2886 nextrecord
= m
->m_nextpkt
= so
->so_rcv
.sb_mb
;
2887 so
->so_rcv
.sb_mb
= m
;
2888 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1b");
2889 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1b");
2891 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2893 *psa
= dup_sockaddr(mtod(m
, struct sockaddr
*), canwait
);
2894 if ((*psa
== NULL
) && (flags
& MSG_NEEDSA
)) {
2895 error
= EWOULDBLOCK
;
2899 if (flags
& MSG_PEEK
) {
2902 sbfree(&so
->so_rcv
, m
);
2903 if (m
->m_next
== NULL
&& so
->so_rcv
.sb_cc
!= 0) {
2904 panic("%s: about to create invalid socketbuf",
2908 MFREE(m
, so
->so_rcv
.sb_mb
);
2909 m
= so
->so_rcv
.sb_mb
;
2911 m
->m_nextpkt
= nextrecord
;
2913 so
->so_rcv
.sb_mb
= nextrecord
;
2914 SB_EMPTY_FIXUP(&so
->so_rcv
);
2919 *nextrecordp
= nextrecord
;
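/*
 * Illustrative userspace sketch (not built): the MT_SONAME record consumed by
 * soreceive_addr() above is what recvmsg(2) hands back through msg_name.
 * "fd" is a hypothetical unconnected datagram socket; the helper name is made
 * up, and only standard POSIX calls are assumed.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t
example_recv_with_addr(int fd, void *buf, size_t len,
    struct sockaddr_storage *from)
{
    struct iovec iov;
    struct msghdr msg;

    iov.iov_base = buf;
    iov.iov_len = len;
    memset(&msg, 0, sizeof (msg));
    msg.msg_name = from;            /* filled in from the record's address */
    msg.msg_namelen = sizeof (*from);
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;

    return (recvmsg(fd, &msg, 0));
}
#endif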
/*
 * Process one or more MT_CONTROL mbufs present before any data mbufs
 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 * just copy the data; if !MSG_PEEK, we call into the protocol to
 * perform externalization.
 */
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
    struct mbuf *cm = NULL, *cmn;
    struct mbuf **cme = &cm;
    struct sockbuf *sb_rcv = &so->so_rcv;
    struct mbuf **msgpcm = NULL;
    struct mbuf *m = *mp;
    struct mbuf *nextrecord = *nextrecordp;
    struct protosw *pr = so->so_proto;

    /*
     * Externalizing the control messages would require us to
     * drop the socket's lock below.  Once we re-acquire the
     * lock, the mbuf chain might change.  In order to preserve
     * consistency, we unlink all control messages from the
     * first mbuf chain in one shot and link them separately
     * onto a different chain.
     */
    if (flags & MSG_PEEK) {
2953 if (controlp
!= NULL
) {
2954 if (*controlp
== NULL
) {
2957 *controlp
= m_copy(m
, 0, m
->m_len
);
2960 * If we failed to allocate an mbuf,
2961 * release any previously allocated
2962 * mbufs for control data. Return
2963 * an error. Keep the mbufs in the
2964 * socket as this is using
2967 if (*controlp
== NULL
) {
2972 controlp
= &(*controlp
)->m_next
;
2976 m
->m_nextpkt
= NULL
;
2978 sb_rcv
->sb_mb
= m
->m_next
;
2981 cme
= &(*cme
)->m_next
;
2984 } while (m
!= NULL
&& m
->m_type
== MT_CONTROL
);
2986 if (!(flags
& MSG_PEEK
)) {
2987 if (sb_rcv
->sb_mb
!= NULL
) {
2988 sb_rcv
->sb_mb
->m_nextpkt
= nextrecord
;
2990 sb_rcv
->sb_mb
= nextrecord
;
2991 SB_EMPTY_FIXUP(sb_rcv
);
2993 if (nextrecord
== NULL
)
2994 sb_rcv
->sb_lastrecord
= m
;
2997 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive ctl");
2998 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive ctl");
3000 while (cm
!= NULL
) {
3005 cmsg_type
= mtod(cm
, struct cmsghdr
*)->cmsg_type
;
3008 * Call the protocol to externalize SCM_RIGHTS message
3009 * and return the modified message to the caller upon
3010 * success. Otherwise, all other control messages are
3011 * returned unmodified to the caller. Note that we
3012 * only get into this loop if MSG_PEEK is not set.
3014 if (pr
->pr_domain
->dom_externalize
!= NULL
&&
3015 cmsg_type
== SCM_RIGHTS
) {
3017 * Release socket lock: see 3903171. This
3018 * would also allow more records to be appended
3019 * to the socket buffer. We still have SB_LOCK
3020 * set on it, so we can be sure that the head
3021 * of the mbuf chain won't change.
3023 socket_unlock(so
, 0);
3024 error
= (*pr
->pr_domain
->dom_externalize
)(cm
);
3030 if (controlp
!= NULL
&& error
== 0) {
3032 controlp
= &(*controlp
)->m_next
;
3039 * Update the value of nextrecord in case we received new
3040 * records when the socket was unlocked above for
3041 * externalizing SCM_RIGHTS.
3044 nextrecord
= sb_rcv
->sb_mb
->m_nextpkt
;
3046 nextrecord
= sb_rcv
->sb_mb
;
3050 *nextrecordp
= nextrecord
;
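/*
 * Illustrative userspace sketch (not built): receiving a descriptor passed
 * over an AF_UNIX socket.  The SCM_RIGHTS control data that soreceive_ctl()
 * externalizes through dom_externalize reaches the caller as a cmsghdr.
 * "fd" is a hypothetical AF_UNIX socket and the helper name is made up.
 */
#if 0
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int
example_recv_fd(int fd)
{
    char data;
    char cbuf[CMSG_SPACE(sizeof (int))];
    struct iovec iov = { .iov_base = &data, .iov_len = sizeof (data) };
    struct msghdr msg;
    struct cmsghdr *cmsg;

    memset(&msg, 0, sizeof (msg));
    msg.msg_iov = &iov;
    msg.msg_iovlen = 1;
    msg.msg_control = cbuf;
    msg.msg_controllen = sizeof (cbuf);

    if (recvmsg(fd, &msg, 0) == -1)
        return (-1);

    for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
        cmsg = CMSG_NXTHDR(&msg, cmsg)) {
        if (cmsg->cmsg_level == SOL_SOCKET &&
            cmsg->cmsg_type == SCM_RIGHTS) {
            int newfd;
            /* The passed descriptor lives in the cmsg payload. */
            memcpy(&newfd, CMSG_DATA(cmsg), sizeof (newfd));
            return (newfd);
        }
    }
    return (-1);
}
#endif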
/*
 * Implement receive operations on a socket.
 *
 * We depend on the way that records are added to the sockbuf by sbappend*.
 * In particular, each record (mbufs linked through m_next) must begin with
 * an address if the protocol so specifies, followed by an optional mbuf or
 * mbufs containing ancillary data, and then zero or more mbufs of data.
 *
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.  Although the sockbuf
 * is locked, new data may still be appended, and thus we must maintain
 * consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying an
 * mbuf **mp0 for use in returning the chain.  The uio is then used only for
 * the count in uio_resid.
 *
 * Returns:	0			Success
 *		sblock:EWOULDBLOCK
 *		sodelayed_copy:EFAULT
 *	<pru_rcvoob>:EINVAL[TCP]
 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
 *	<pr_domain->dom_externalize>:???
 *
 * Notes:	Additional return values from calls through <pru_rcvoob> and
 *		<pr_domain->dom_externalize> depend on protocols other than
 *		TCP or AF_UNIX, which are documented above.
 */
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
    struct mbuf *m, **mp, *ml = NULL;
    struct mbuf *nextrecord, *free_list;
    int flags, error, offset;
    struct protosw *pr = so->so_proto;
    user_ssize_t orig_resid = uio_resid(uio);
    user_ssize_t delayed_copy_len;
    struct proc *p = current_proc();
    boolean_t en_tracing = FALSE;
3110 * Sanity check on the length passed by caller as we are making 'int'
3113 if (orig_resid
< 0 || orig_resid
> INT_MAX
)
3116 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_START
, so
,
3117 uio_resid(uio
), so
->so_rcv
.sb_cc
, so
->so_rcv
.sb_lowat
,
3118 so
->so_rcv
.sb_hiwat
);
3121 so_update_last_owner_locked(so
, p
);
3122 so_update_policy(so
);
3124 #ifdef MORE_LOCKING_DEBUG
3125 if (so
->so_usecount
== 1) {
3126 panic("%s: so=%x no other reference on socket\n", __func__
, so
);
3133 if (controlp
!= NULL
)
3136 flags
= *flagsp
&~ MSG_EOR
;
3141 * If a recv attempt is made on a previously-accepted socket
3142 * that has been marked as inactive (disconnected), reject
3145 if (so
->so_flags
& SOF_DEFUNCT
) {
3146 struct sockbuf
*sb
= &so
->so_rcv
;
3149 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3150 __func__
, proc_pid(p
), proc_best_name(p
),
3151 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
3152 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
3154 * This socket should have been disconnected and flushed
3155 * prior to being returned from sodefunct(); there should
3156 * be no data on its receive list, so panic otherwise.
3158 if (so
->so_state
& SS_DEFUNCT
)
3159 sb_empty_assert(sb
, __func__
);
3160 socket_unlock(so
, 1);
3164 if ((so
->so_flags1
& SOF1_PRECONNECT_DATA
) &&
3165 pr
->pr_usrreqs
->pru_preconnect
) {
3167 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3168 * calling write() right after this. *If* the app calls a read
3169 * we do not want to block this read indefinetely. Thus,
3170 * we trigger a connect so that the session gets initiated.
3172 error
= (*pr
->pr_usrreqs
->pru_preconnect
)(so
);
3175 socket_unlock(so
, 1);
3180 if (ENTR_SHOULDTRACE
&&
3181 (SOCK_CHECK_DOM(so
, AF_INET
) || SOCK_CHECK_DOM(so
, AF_INET6
))) {
3183 * enable energy tracing for inet sockets that go over
3184 * non-loopback interfaces only.
3186 struct inpcb
*inp
= sotoinpcb(so
);
3187 if (inp
->inp_last_outifp
!= NULL
&&
3188 !(inp
->inp_last_outifp
->if_flags
& IFF_LOOPBACK
)) {
3190 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_START
,
3191 VM_KERNEL_ADDRPERM(so
),
3192 ((so
->so_state
& SS_NBIO
) ?
3193 kEnTrFlagNonBlocking
: 0),
3194 (int64_t)orig_resid
);
3199 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3200 * regardless of the flags argument. Here is the case were
3201 * out-of-band data is not inline.
3203 if ((flags
& MSG_OOB
) ||
3204 ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
3205 (so
->so_options
& SO_OOBINLINE
) == 0 &&
3206 (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
)))) {
3207 m
= m_get(M_WAIT
, MT_DATA
);
3209 socket_unlock(so
, 1);
3210 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
,
3211 ENOBUFS
, 0, 0, 0, 0);
3214 error
= (*pr
->pr_usrreqs
->pru_rcvoob
)(so
, m
, flags
& MSG_PEEK
);
3217 socket_unlock(so
, 0);
3219 error
= uiomove(mtod(m
, caddr_t
),
3220 imin(uio_resid(uio
), m
->m_len
), uio
);
3222 } while (uio_resid(uio
) && error
== 0 && m
!= NULL
);
3228 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0) {
3229 if (error
== EWOULDBLOCK
|| error
== EINVAL
) {
3231 * Let's try to get normal data:
3232 * EWOULDBLOCK: out-of-band data not
3233 * receive yet. EINVAL: out-of-band data
3238 } else if (error
== 0 && flagsp
!= NULL
) {
3242 socket_unlock(so
, 1);
3244 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3245 VM_KERNEL_ADDRPERM(so
), 0,
3246 (int64_t)(orig_resid
- uio_resid(uio
)));
3248 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
3257 if (so
->so_state
& SS_ISCONFIRMING
&& uio_resid(uio
)) {
3258 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, 0);
3262 delayed_copy_len
= 0;
3264 #ifdef MORE_LOCKING_DEBUG
3265 if (so
->so_usecount
<= 1)
3266 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3267 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
), so
->so_usecount
);
3270 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3271 * and if so just return to the caller. This could happen when
3272 * soreceive() is called by a socket upcall function during the
3273 * time the socket is freed. The socket buffer would have been
3274 * locked across the upcall, therefore we cannot put this thread
3275 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3276 * we may livelock), because the lock on the socket buffer will
3277 * only be released when the upcall routine returns to its caller.
3278 * Because the socket has been officially closed, there can be
3279 * no further read on it.
3281 * A multipath subflow socket would have its SS_NOFDREF set by
3282 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3283 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3285 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
3286 (SS_NOFDREF
| SS_CANTRCVMORE
) && !(so
->so_flags
& SOF_MP_SUBFLOW
)) {
3287 socket_unlock(so
, 1);
3291 error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
));
3293 socket_unlock(so
, 1);
3294 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
3297 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3298 VM_KERNEL_ADDRPERM(so
), 0,
3299 (int64_t)(orig_resid
- uio_resid(uio
)));
3304 m
= so
->so_rcv
.sb_mb
;
3306 * If we have less data than requested, block awaiting more
3307 * (subject to any timeout) if:
3308 * 1. the current count is less than the low water mark, or
3309 * 2. MSG_WAITALL is set, and it is possible to do the entire
3310 * receive operation at once if we block (resid <= hiwat).
3311 * 3. MSG_DONTWAIT is not set
3312 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3313 * we have to do the receive in sections, and thus risk returning
3314 * a short count if a timeout or signal occurs after we start.
3316 if (m
== NULL
|| (((flags
& MSG_DONTWAIT
) == 0 &&
3317 so
->so_rcv
.sb_cc
< uio_resid(uio
)) &&
3318 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
3319 ((flags
& MSG_WAITALL
) && uio_resid(uio
) <= so
->so_rcv
.sb_hiwat
)) &&
3320 m
->m_nextpkt
== NULL
&& (pr
->pr_flags
& PR_ATOMIC
) == 0)) {
3322 * Panic if we notice inconsistencies in the socket's
3323 * receive list; both sb_mb and sb_cc should correctly
3324 * reflect the contents of the list, otherwise we may
3325 * end up with false positives during select() or poll()
3326 * which could put the application in a bad state.
3328 SB_MB_CHECK(&so
->so_rcv
);
3333 error
= so
->so_error
;
3334 if ((flags
& MSG_PEEK
) == 0)
3338 if (so
->so_state
& SS_CANTRCVMORE
) {
3341 * Deal with half closed connections
3343 if ((so
->so_state
& SS_ISDISCONNECTED
) == 0 &&
3344 cfil_sock_data_pending(&so
->so_rcv
) != 0)
3346 "so %llx ignore SS_CANTRCVMORE",
3347 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
));
3349 #endif /* CONTENT_FILTER */
3355 for (; m
!= NULL
; m
= m
->m_next
)
3356 if (m
->m_type
== MT_OOBDATA
|| (m
->m_flags
& M_EOR
)) {
3357 m
= so
->so_rcv
.sb_mb
;
3360 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
3361 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
3365 if (uio_resid(uio
) == 0)
3368 if ((so
->so_state
& SS_NBIO
) ||
3369 (flags
& (MSG_DONTWAIT
|MSG_NBIO
))) {
3370 error
= EWOULDBLOCK
;
3373 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
3374 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
3375 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
3376 #if EVEN_MORE_LOCKING_DEBUG
3378 printf("Waiting for socket data\n");
3381 error
= sbwait(&so
->so_rcv
);
3382 #if EVEN_MORE_LOCKING_DEBUG
3384 printf("SORECEIVE - sbwait returned %d\n", error
);
3386 if (so
->so_usecount
< 1) {
3387 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3388 __func__
, so
, so
->so_usecount
);
3392 socket_unlock(so
, 1);
3393 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
3396 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3397 VM_KERNEL_ADDRPERM(so
), 0,
3398 (int64_t)(orig_resid
- uio_resid(uio
)));
3405 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgrcv
);
3406 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
3407 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
3408 nextrecord
= m
->m_nextpkt
;
3410 if ((pr
->pr_flags
& PR_ADDR
) && m
->m_type
== MT_SONAME
) {
3411 error
= soreceive_addr(p
, so
, psa
, flags
, &m
, &nextrecord
,
3413 if (error
== ERESTART
)
3415 else if (error
!= 0)
3421 * Process one or more MT_CONTROL mbufs present before any data mbufs
3422 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3423 * just copy the data; if !MSG_PEEK, we call into the protocol to
3424 * perform externalization.
3426 if (m
!= NULL
&& m
->m_type
== MT_CONTROL
) {
3427 error
= soreceive_ctl(so
, controlp
, flags
, &m
, &nextrecord
);
3434 * If the socket is a TCP socket with message delivery
3435 * enabled, then create a control msg to deliver the
3436 * relative TCP sequence number for this data. Waiting
3437 * until this point will protect against failures to
3438 * allocate an mbuf for control msgs.
3440 if (so
->so_type
== SOCK_STREAM
&& SOCK_PROTO(so
) == IPPROTO_TCP
&&
3441 (so
->so_flags
& SOF_ENABLE_MSGS
) && controlp
!= NULL
) {
3442 struct mbuf
*seq_cm
;
3444 seq_cm
= sbcreatecontrol((caddr_t
)&m
->m_pkthdr
.msg_seq
,
3445 sizeof (uint32_t), SCM_SEQNUM
, SOL_SOCKET
);
3446 if (seq_cm
== NULL
) {
3447 /* unable to allocate a control mbuf */
3452 controlp
= &seq_cm
->m_next
;
3456 if (!(flags
& MSG_PEEK
)) {
3458 * We get here because m points to an mbuf following
3459 * any MT_SONAME or MT_CONTROL mbufs which have been
3460 * processed above. In any case, m should be pointing
3461 * to the head of the mbuf chain, and the nextrecord
3462 * should be either NULL or equal to m->m_nextpkt.
3463 * See comments above about SB_LOCK.
3465 if (m
!= so
->so_rcv
.sb_mb
||
3466 m
->m_nextpkt
!= nextrecord
) {
3467 panic("%s: post-control !sync so=%p m=%p "
3468 "nextrecord=%p\n", __func__
, so
, m
,
3472 if (nextrecord
== NULL
)
3473 so
->so_rcv
.sb_lastrecord
= m
;
3476 if (type
== MT_OOBDATA
)
3479 if (!(flags
& MSG_PEEK
)) {
3480 SB_EMPTY_FIXUP(&so
->so_rcv
);
3483 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 2");
3484 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 2");
3489 if (!(flags
& MSG_PEEK
) && uio_resid(uio
) > sorecvmincopy
)
3497 (uio_resid(uio
) - delayed_copy_len
) > 0 && error
== 0) {
3498 if (m
->m_type
== MT_OOBDATA
) {
3499 if (type
!= MT_OOBDATA
)
3501 } else if (type
== MT_OOBDATA
) {
3505 * Make sure to allways set MSG_OOB event when getting
3506 * out of band data inline.
3508 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
3509 (so
->so_options
& SO_OOBINLINE
) != 0 &&
3510 (so
->so_state
& SS_RCVATMARK
) != 0) {
3513 so
->so_state
&= ~SS_RCVATMARK
;
3514 len
= uio_resid(uio
) - delayed_copy_len
;
3515 if (so
->so_oobmark
&& len
> so
->so_oobmark
- offset
)
3516 len
= so
->so_oobmark
- offset
;
3517 if (len
> m
->m_len
- moff
)
3518 len
= m
->m_len
- moff
;
3520 * If mp is set, just pass back the mbufs.
3521 * Otherwise copy them out via the uio, then free.
3522 * Sockbuf must be consistent here (points to current mbuf,
3523 * it points to next record) when we drop priority;
3524 * we must note any additions to the sockbuf when we
3525 * block interrupts again.
3528 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive uiomove");
3529 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive uiomove");
3530 if (can_delay
&& len
== m
->m_len
) {
3532 * only delay the copy if we're consuming the
3533 * mbuf and we're NOT in MSG_PEEK mode
3534 * and we have enough data to make it worthwile
3535 * to drop and retake the lock... can_delay
3536 * reflects the state of the 2 latter
3537 * constraints moff should always be zero
3540 delayed_copy_len
+= len
;
3542 if (delayed_copy_len
) {
3543 error
= sodelayed_copy(so
, uio
,
3544 &free_list
, &delayed_copy_len
);
3550 * can only get here if MSG_PEEK is not
3551 * set therefore, m should point at the
3552 * head of the rcv queue; if it doesn't,
3553 * it means something drastically
3554 * changed while we were out from behind
3555 * the lock in sodelayed_copy. perhaps
3556 * a RST on the stream. in any event,
3557 * the stream has been interrupted. it's
3558 * probably best just to return whatever
3559 * data we've moved and let the caller
3562 if (m
!= so
->so_rcv
.sb_mb
) {
3566 socket_unlock(so
, 0);
3567 error
= uiomove(mtod(m
, caddr_t
) + moff
,
3575 uio_setresid(uio
, (uio_resid(uio
) - len
));
3577 if (len
== m
->m_len
- moff
) {
3578 if (m
->m_flags
& M_EOR
)
3580 if (flags
& MSG_PEEK
) {
3584 nextrecord
= m
->m_nextpkt
;
3585 sbfree(&so
->so_rcv
, m
);
3586 m
->m_nextpkt
= NULL
;
3589 * If this packet is an unordered packet
3590 * (indicated by M_UNORDERED_DATA flag), remove
3591 * the additional bytes added to the
3592 * receive socket buffer size.
3594 if ((so
->so_flags
& SOF_ENABLE_MSGS
) &&
3596 (m
->m_flags
& M_UNORDERED_DATA
) &&
3597 sbreserve(&so
->so_rcv
,
3598 so
->so_rcv
.sb_hiwat
- m
->m_len
)) {
3599 if (so
->so_msg_state
->msg_uno_bytes
>
3602 msg_uno_bytes
-= m
->m_len
;
3607 m
->m_flags
&= ~M_UNORDERED_DATA
;
3613 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
3616 if (free_list
== NULL
)
3621 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
3625 m
->m_nextpkt
= nextrecord
;
3626 if (nextrecord
== NULL
)
3627 so
->so_rcv
.sb_lastrecord
= m
;
3629 so
->so_rcv
.sb_mb
= nextrecord
;
3630 SB_EMPTY_FIXUP(&so
->so_rcv
);
3632 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 3");
3633 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 3");
3636 if (flags
& MSG_PEEK
) {
3642 if (flags
& MSG_DONTWAIT
)
3643 copy_flag
= M_DONTWAIT
;
3646 *mp
= m_copym(m
, 0, len
, copy_flag
);
3648 * Failed to allocate an mbuf?
3649 * Adjust uio_resid back, it was
3650 * adjusted down by len bytes which
3651 * we didn't copy over.
3655 (uio_resid(uio
) + len
));
3661 so
->so_rcv
.sb_cc
-= len
;
3664 if (so
->so_oobmark
) {
3665 if ((flags
& MSG_PEEK
) == 0) {
3666 so
->so_oobmark
-= len
;
3667 if (so
->so_oobmark
== 0) {
3668 so
->so_state
|= SS_RCVATMARK
;
3670 * delay posting the actual event until
3671 * after any delayed copy processing
3679 if (offset
== so
->so_oobmark
)
3683 if (flags
& MSG_EOR
)
3686 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3687 * (for non-atomic socket), we must not quit until
3688 * "uio->uio_resid == 0" or an error termination.
3689 * If a signal/timeout occurs, return with a short
3690 * count but without error. Keep sockbuf locked
3691 * against other readers.
3693 while (flags
& (MSG_WAITALL
|MSG_WAITSTREAM
) && m
== NULL
&&
3694 (uio_resid(uio
) - delayed_copy_len
) > 0 &&
3695 !sosendallatonce(so
) && !nextrecord
) {
3696 if (so
->so_error
|| ((so
->so_state
& SS_CANTRCVMORE
)
3698 && cfil_sock_data_pending(&so
->so_rcv
) == 0
3699 #endif /* CONTENT_FILTER */
3704 * Depending on the protocol (e.g. TCP), the following
3705 * might cause the socket lock to be dropped and later
3706 * be reacquired, and more data could have arrived and
3707 * have been appended to the receive socket buffer by
3708 * the time it returns. Therefore, we only sleep in
3709 * sbwait() below if and only if the socket buffer is
3710 * empty, in order to avoid a false sleep.
3712 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
&&
3713 (((struct inpcb
*)so
->so_pcb
)->inp_state
!=
3715 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
3717 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 2");
3718 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 2");
3720 if (so
->so_rcv
.sb_mb
== NULL
&& sbwait(&so
->so_rcv
)) {
3725 * have to wait until after we get back from the sbwait
3726 * to do the copy because we will drop the lock if we
3727 * have enough data that has been delayed... by dropping
3728 * the lock we open up a window allowing the netisr
3729 * thread to process the incoming packets and to change
3730 * the state of this socket... we're issuing the sbwait
3731 * because the socket is empty and we're expecting the
3732 * netisr thread to wake us up when more packets arrive;
3733 * if we allow that processing to happen and then sbwait
3734 * we could stall forever with packets sitting in the
3735 * socket if no further packets arrive from the remote
3738 * we want to copy before we've collected all the data
3739 * to satisfy this request to allow the copy to overlap
3740 * the incoming packet processing on an MP system
3742 if (delayed_copy_len
> sorecvmincopy
&&
3743 (delayed_copy_len
> (so
->so_rcv
.sb_hiwat
/ 2))) {
3744 error
= sodelayed_copy(so
, uio
,
3745 &free_list
, &delayed_copy_len
);
3750 m
= so
->so_rcv
.sb_mb
;
3752 nextrecord
= m
->m_nextpkt
;
3754 SB_MB_CHECK(&so
->so_rcv
);
3757 #ifdef MORE_LOCKING_DEBUG
3758 if (so
->so_usecount
<= 1) {
3759 panic("%s: after big while so=%p ref=%d on socket\n",
3760 __func__
, so
, so
->so_usecount
);
3765 if (m
!= NULL
&& pr
->pr_flags
& PR_ATOMIC
) {
3766 if (so
->so_options
& SO_DONTTRUNC
) {
3767 flags
|= MSG_RCVMORE
;
3770 if ((flags
& MSG_PEEK
) == 0)
3771 (void) sbdroprecord(&so
->so_rcv
);
3776 * pru_rcvd below (for TCP) may cause more data to be received
3777 * if the socket lock is dropped prior to sending the ACK; some
3778 * legacy OpenTransport applications don't handle this well
3779 * (if it receives less data than requested while MSG_HAVEMORE
3780 * is set), and so we set the flag now based on what we know
3781 * prior to calling pru_rcvd.
3783 if ((so
->so_options
& SO_WANTMORE
) && so
->so_rcv
.sb_cc
> 0)
3784 flags
|= MSG_HAVEMORE
;
3786 if ((flags
& MSG_PEEK
) == 0) {
3788 so
->so_rcv
.sb_mb
= nextrecord
;
3790 * First part is an inline SB_EMPTY_FIXUP(). Second
3791 * part makes sure sb_lastrecord is up-to-date if
3792 * there is still data in the socket buffer.
3794 if (so
->so_rcv
.sb_mb
== NULL
) {
3795 so
->so_rcv
.sb_mbtail
= NULL
;
3796 so
->so_rcv
.sb_lastrecord
= NULL
;
3797 } else if (nextrecord
->m_nextpkt
== NULL
) {
3798 so
->so_rcv
.sb_lastrecord
= nextrecord
;
3800 SB_MB_CHECK(&so
->so_rcv
);
3802 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 4");
3803 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 4");
3804 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
)
3805 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
3808 if (delayed_copy_len
) {
3809 error
= sodelayed_copy(so
, uio
, &free_list
, &delayed_copy_len
);
3813 if (free_list
!= NULL
) {
3814 m_freem_list(free_list
);
3818 postevent(so
, 0, EV_OOB
);
3820 if (orig_resid
== uio_resid(uio
) && orig_resid
&&
3821 (flags
& MSG_EOR
) == 0 && (so
->so_state
& SS_CANTRCVMORE
) == 0) {
3822 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
3829 #ifdef MORE_LOCKING_DEBUG
3830 if (so
->so_usecount
<= 1) {
3831 panic("%s: release so=%p ref=%d on socket\n", __func__
,
3832 so
, so
->so_usecount
);
3836 if (delayed_copy_len
)
3837 error
= sodelayed_copy(so
, uio
, &free_list
, &delayed_copy_len
);
3839 if (free_list
!= NULL
)
3840 m_freem_list(free_list
);
3842 sbunlock(&so
->so_rcv
, FALSE
); /* will unlock socket */
3845 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3846 VM_KERNEL_ADDRPERM(so
),
3847 ((error
== EWOULDBLOCK
) ? kEnTrFlagNoWork
: 0),
3848 (int64_t)(orig_resid
- uio_resid(uio
)));
3850 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, so
, uio_resid(uio
),
3851 so
->so_rcv
.sb_cc
, 0, error
);
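/*
 * Illustrative userspace sketch (not built): the MSG_PEEK and MSG_WAITALL
 * semantics implemented by soreceive() above.  MSG_PEEK leaves the data on
 * the receive queue; MSG_WAITALL keeps blocking (absent an error or signal)
 * until the full request is satisfied.  "fd" is a hypothetical connected
 * stream socket and the helper name is made up.
 */
#if 0
#include <sys/socket.h>

static ssize_t
example_peek_then_read(int fd, void *buf, size_t len)
{
    ssize_t peeked;

    /* Look at the data without consuming it ... */
    peeked = recv(fd, buf, len, MSG_PEEK);
    if (peeked <= 0)
        return (peeked);

    /* ... then consume exactly that much, waiting for all of it. */
    return (recv(fd, buf, (size_t)peeked, MSG_WAITALL));
}
#endif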
/*
 * Returns:	0			Success
 */
sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
    user_ssize_t *resid)
    socket_unlock(so, 0);

    while (m != NULL && error == 0) {
        error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);

    m_freem_list(*free_list);

sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
    u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
    struct mbuf *ml, *m;

    for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
        ml = ml->m_nextpkt, i++) {
        auio = msgarray[i].uio;
        for (m = ml; m != NULL; m = m->m_next) {
            error = uiomove(mtod(m, caddr_t), m->m_len, auio);

    m_freem_list(*free_list);
3914 soreceive_list(struct socket
*so
, struct recv_msg_elem
*msgarray
, u_int uiocnt
,
3918 struct mbuf
*nextrecord
;
3919 struct mbuf
*ml
= NULL
, *free_list
= NULL
, *free_tail
= NULL
;
3921 user_ssize_t len
, pktlen
, delayed_copy_len
= 0;
3922 struct protosw
*pr
= so
->so_proto
;
3924 struct proc
*p
= current_proc();
3925 struct uio
*auio
= NULL
;
3928 struct sockaddr
**psa
= NULL
;
3929 struct mbuf
**controlp
= NULL
;
3932 struct mbuf
*free_others
= NULL
;
3934 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST
| DBG_FUNC_START
,
3936 so
->so_rcv
.sb_cc
, so
->so_rcv
.sb_lowat
, so
->so_rcv
.sb_hiwat
);
3940 * - Only supports don't wait flags
3941 * - Only support datagram sockets (could be extended to raw)
3943 * - Protocol must support packet chains
3944 * - The uio array is NULL (should we panic?)
3950 if (flags
& ~(MSG_PEEK
| MSG_WAITALL
| MSG_DONTWAIT
| MSG_NEEDSA
|
3952 printf("%s invalid flags 0x%x\n", __func__
, flags
);
3956 if (so
->so_type
!= SOCK_DGRAM
) {
3960 if (sosendallatonce(so
) == 0) {
3964 if (so
->so_proto
->pr_usrreqs
->pru_send_list
== NULL
) {
3965 error
= EPROTONOSUPPORT
;
3968 if (msgarray
== NULL
) {
3969 printf("%s uioarray is NULL\n", __func__
);
3974 printf("%s uiocnt is 0\n", __func__
);
3979 * Sanity check on the length passed by caller as we are making 'int'
3982 resid
= recv_msg_array_resid(msgarray
, uiocnt
);
3983 if (resid
< 0 || resid
> INT_MAX
) {
3988 if (!(flags
& MSG_PEEK
) && sorecvmincopy
> 0)
3994 so_update_last_owner_locked(so
, p
);
3995 so_update_policy(so
);
3998 so_update_necp_policy(so
, NULL
, NULL
);
4002 * If a recv attempt is made on a previously-accepted socket
4003 * that has been marked as inactive (disconnected), reject
4006 if (so
->so_flags
& SOF_DEFUNCT
) {
4007 struct sockbuf
*sb
= &so
->so_rcv
;
4010 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4011 __func__
, proc_pid(p
), proc_best_name(p
),
4012 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
4013 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
4015 * This socket should have been disconnected and flushed
4016 * prior to being returned from sodefunct(); there should
4017 * be no data on its receive list, so panic otherwise.
4019 if (so
->so_state
& SS_DEFUNCT
)
4020 sb_empty_assert(sb
, __func__
);
4026 * The uio may be empty
4028 if (npkts
>= uiocnt
) {
4034 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4035 * and if so just return to the caller. This could happen when
4036 * soreceive() is called by a socket upcall function during the
4037 * time the socket is freed. The socket buffer would have been
4038 * locked across the upcall, therefore we cannot put this thread
4039 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4040 * we may livelock), because the lock on the socket buffer will
4041 * only be released when the upcall routine returns to its caller.
4042 * Because the socket has been officially closed, there can be
4043 * no further read on it.
4045 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
4046 (SS_NOFDREF
| SS_CANTRCVMORE
)) {
4051 error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
));
4057 m
= so
->so_rcv
.sb_mb
;
4059 * Block awaiting more datagram if needed
4061 if (m
== NULL
|| (((flags
& MSG_DONTWAIT
) == 0 &&
4062 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
4063 ((flags
& MSG_WAITALL
) && npkts
< uiocnt
))))) {
4065 * Panic if we notice inconsistencies in the socket's
4066 * receive list; both sb_mb and sb_cc should correctly
4067 * reflect the contents of the list, otherwise we may
4068 * end up with false positives during select() or poll()
4069 * which could put the application in a bad state.
4071 SB_MB_CHECK(&so
->so_rcv
);
4074 error
= so
->so_error
;
4075 if ((flags
& MSG_PEEK
) == 0)
4079 if (so
->so_state
& SS_CANTRCVMORE
) {
4082 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
4083 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
4087 if ((so
->so_state
& SS_NBIO
) ||
4088 (flags
& (MSG_DONTWAIT
|MSG_NBIO
))) {
4089 error
= EWOULDBLOCK
;
4093 * Do not block if we got some data
4095 if (free_list
!= NULL
) {
4100 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
4101 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
4103 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
4106 error
= sbwait(&so
->so_rcv
);
4113 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgrcv
);
4114 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
4115 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
4118 * Consume the current uio index as we have a datagram
4120 auio
= msgarray
[npkts
].uio
;
4121 resid
= uio_resid(auio
);
4122 msgarray
[npkts
].which
|= SOCK_MSG_DATA
;
4123 psa
= (msgarray
[npkts
].which
& SOCK_MSG_SA
) ?
4124 &msgarray
[npkts
].psa
: NULL
;
4125 controlp
= (msgarray
[npkts
].which
& SOCK_MSG_CONTROL
) ?
4126 &msgarray
[npkts
].controlp
: NULL
;
4128 nextrecord
= m
->m_nextpkt
;
4130 if ((pr
->pr_flags
& PR_ADDR
) && m
->m_type
== MT_SONAME
) {
4131 error
= soreceive_addr(p
, so
, psa
, flags
, &m
, &nextrecord
, 1);
4132 if (error
== ERESTART
)
4134 else if (error
!= 0)
4138 if (m
!= NULL
&& m
->m_type
== MT_CONTROL
) {
4139 error
= soreceive_ctl(so
, controlp
, flags
, &m
, &nextrecord
);
4144 if (m
->m_pkthdr
.len
== 0) {
4145 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4147 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
4148 (uint64_t)DEBUG_KERNEL_ADDRPERM(m
),
4153 * Loop to copy the mbufs of the current record
4154 * Support zero length packets
4158 while (m
!= NULL
&& (len
= resid
- pktlen
) >= 0 && error
== 0) {
4160 panic("%p m_len zero", m
);
4162 panic("%p m_type zero", m
);
4164 * Clip to the residual length
4170 * Copy the mbufs via the uio or delay the copy
4171 * Sockbuf must be consistent here (points to current mbuf,
4172 * it points to next record) when we drop priority;
4173 * we must note any additions to the sockbuf when we
4174 * block interrupts again.
4176 if (len
> 0 && can_delay
== 0) {
4177 socket_unlock(so
, 0);
4178 error
= uiomove(mtod(m
, caddr_t
), (int)len
, auio
);
4183 delayed_copy_len
+= len
;
4186 if (len
== m
->m_len
) {
4188 * m was entirely copied
4190 sbfree(&so
->so_rcv
, m
);
4191 nextrecord
= m
->m_nextpkt
;
4192 m
->m_nextpkt
= NULL
;
4195 * Set the first packet to the head of the free list
4197 if (free_list
== NULL
)
4200 * Link current packet to tail of free list
4203 if (free_tail
!= NULL
)
4204 free_tail
->m_nextpkt
= m
;
4208 * Link current mbuf to last mbuf of current packet
4215 * Move next buf to head of socket buffer
4217 so
->so_rcv
.sb_mb
= m
= ml
->m_next
;
4221 m
->m_nextpkt
= nextrecord
;
4222 if (nextrecord
== NULL
)
4223 so
->so_rcv
.sb_lastrecord
= m
;
4225 so
->so_rcv
.sb_mb
= nextrecord
;
4226 SB_EMPTY_FIXUP(&so
->so_rcv
);
4228 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 3");
4229 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 3");
4232 * Stop the loop on partial copy
4237 #ifdef MORE_LOCKING_DEBUG
4238 if (so
->so_usecount
<= 1) {
4239 panic("%s: after big while so=%llx ref=%d on socket\n",
4241 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
), so
->so_usecount
);
4246 * Tell the caller we made a partial copy
4249 if (so
->so_options
& SO_DONTTRUNC
) {
4251 * Copyout first the freelist then the partial mbuf
4253 socket_unlock(so
, 0);
4254 if (delayed_copy_len
)
4255 error
= sodelayed_copy_list(so
, msgarray
,
4256 uiocnt
, &free_list
, &delayed_copy_len
);
4259 error
= uiomove(mtod(m
, caddr_t
), (int)len
,
4268 so
->so_rcv
.sb_cc
-= len
;
4269 flags
|= MSG_RCVMORE
;
4271 (void) sbdroprecord(&so
->so_rcv
);
4272 nextrecord
= so
->so_rcv
.sb_mb
;
4279 so
->so_rcv
.sb_mb
= nextrecord
;
4281 * First part is an inline SB_EMPTY_FIXUP(). Second
4282 * part makes sure sb_lastrecord is up-to-date if
4283 * there is still data in the socket buffer.
4285 if (so
->so_rcv
.sb_mb
== NULL
) {
4286 so
->so_rcv
.sb_mbtail
= NULL
;
4287 so
->so_rcv
.sb_lastrecord
= NULL
;
4288 } else if (nextrecord
->m_nextpkt
== NULL
) {
4289 so
->so_rcv
.sb_lastrecord
= nextrecord
;
4291 SB_MB_CHECK(&so
->so_rcv
);
4293 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 4");
4294 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 4");
4297 * We can continue to the next packet as long as:
4298 * - We haven't exhausted the uio array
4299 * - There was no error
4300 * - A packet was not truncated
4301 * - We can still receive more data
4303 if (npkts
< uiocnt
&& error
== 0 &&
4304 (flags
& (MSG_RCVMORE
| MSG_TRUNC
)) == 0 &&
4305 (so
->so_state
& SS_CANTRCVMORE
) == 0) {
4306 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
4316 * pru_rcvd may cause more data to be received if the socket lock
4317 * is dropped so we set MSG_HAVEMORE now based on what we know.
4318 * That way the caller won't be surprised if it receives less data
4321 if ((so
->so_options
& SO_WANTMORE
) && so
->so_rcv
.sb_cc
> 0)
4322 flags
|= MSG_HAVEMORE
;
4324 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
)
4325 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
4328 sbunlock(&so
->so_rcv
, FALSE
); /* will unlock socket */
4330 socket_unlock(so
, 1);
4332 if (delayed_copy_len
)
4333 error
= sodelayed_copy_list(so
, msgarray
, uiocnt
,
4334 &free_list
, &delayed_copy_len
);
4337 * Amortize the cost of freeing the mbufs
4339 if (free_list
!= NULL
)
4340 m_freem_list(free_list
);
4341 if (free_others
!= NULL
)
4342 m_freem_list(free_others
);
4344 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST
| DBG_FUNC_END
, error
,
/*
 * Returns:	0			Success
 *	<pru_shutdown>:EINVAL
 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:EHOSTUNREACH[TCP]
 *	<pru_shutdown>:ENETUNREACH[TCP]
 *	<pru_shutdown>:ENETDOWN[TCP]
 *	<pru_shutdown>:ENOMEM[TCP]
 *	<pru_shutdown>:EACCES[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_shutdown>:??? [other protocol families]
 */
soshutdown(struct socket *so, int how)
    KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);

        (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
        error = soshutdownlock(so, how);
        socket_unlock(so, 1);

    KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
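/*
 * Illustrative userspace sketch (not built): the half-close implemented by
 * soshutdown()/soshutdownlock_final() below.  SHUT_WR stops the write side
 * (via pru_shutdown) while reads continue; SHUT_RD stops the read side.
 * "fd" is a hypothetical connected stream socket; the helper name is made up.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static ssize_t
example_half_close(int fd, void *buf, size_t len)
{
    /* No more writes from our side; the peer sees EOF ... */
    if (shutdown(fd, SHUT_WR) == -1)
        return (-1);

    /* ... but we can still drain whatever the peer sends back. */
    return (read(fd, buf, len));
}
#endif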
soshutdownlock_final(struct socket *so, int how)
    struct protosw *pr = so->so_proto;

    sflt_notify(so, sock_evt_shutdown, &how);

    if (how != SHUT_WR) {
        if ((so->so_state & SS_CANTRCVMORE) != 0) {
            /* read already shut down */
        postevent(so, 0, EV_RCLOSED);
    if (how != SHUT_RD) {
        if ((so->so_state & SS_CANTSENDMORE) != 0) {
            /* write already shut down */
        error = (*pr->pr_usrreqs->pru_shutdown)(so);
        postevent(so, 0, EV_WCLOSED);
    KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);

soshutdownlock(struct socket *so, int how)
    /*
     * A content filter may delay the actual shutdown until it
     * has processed the pending data
     */
    if (so->so_flags & SOF_CONTENT_FILTER) {
        error = cfil_sock_shutdown(so, &how);
        if (error == EJUSTRETURN) {
        } else if (error != 0) {
#endif /* CONTENT_FILTER */

    error = soshutdownlock_final(so, how);
sowflush(struct socket *so)
    struct sockbuf *sb = &so->so_snd;

    /*
     * Obtain lock on the socket buffer (SB_LOCK).  This is required
     * to prevent the socket buffer from being unexpectedly altered
     * while it is used by another thread in socket send/receive.
     *
     * sblock() must not fail here, hence the assertion.
     */
    (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
    VERIFY(sb->sb_flags & SB_LOCK);

    sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
    sb->sb_flags |= SB_DROP;
    sb->sb_upcall = NULL;
    sb->sb_upcallarg = NULL;

    sbunlock(sb, TRUE);	/* keep socket locked */

    selthreadclear(&sb->sb_sel);
sorflush(struct socket *so)
    struct sockbuf *sb = &so->so_rcv;
    struct protosw *pr = so->so_proto;
    lck_mtx_t *mutex_held;

    /*
     * XXX: This code is currently commented out, because we may get here
     * as part of sofreelastref(), and at that time, pr_getlock() may no
     * longer be able to return us the lock; this will be fixed in future.
     */
    if (so->so_proto->pr_getlock != NULL)
        mutex_held = (*so->so_proto->pr_getlock)(so, 0);
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

    sflt_notify(so, sock_evt_flush_read, NULL);

    /*
     * Obtain lock on the socket buffer (SB_LOCK).  This is required
     * to prevent the socket buffer from being unexpectedly altered
     * while it is used by another thread in socket send/receive.
     *
     * sblock() must not fail here, hence the assertion.
     */
    (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
    VERIFY(sb->sb_flags & SB_LOCK);
4517 * Copy only the relevant fields from "sb" to "asb" which we
4518 * need for sbrelease() to function. In particular, skip
4519 * sb_sel as it contains the wait queue linkage, which would
4520 * wreak havoc if we were to issue selthreadclear() on "asb".
4521 * Make sure to not carry over SB_LOCK in "asb", as we need
4522 * to acquire it later as part of sbrelease().
4524 bzero(&asb
, sizeof (asb
));
4525 asb
.sb_cc
= sb
->sb_cc
;
4526 asb
.sb_hiwat
= sb
->sb_hiwat
;
4527 asb
.sb_mbcnt
= sb
->sb_mbcnt
;
4528 asb
.sb_mbmax
= sb
->sb_mbmax
;
4529 asb
.sb_ctl
= sb
->sb_ctl
;
4530 asb
.sb_lowat
= sb
->sb_lowat
;
4531 asb
.sb_mb
= sb
->sb_mb
;
4532 asb
.sb_mbtail
= sb
->sb_mbtail
;
4533 asb
.sb_lastrecord
= sb
->sb_lastrecord
;
4534 asb
.sb_so
= sb
->sb_so
;
4535 asb
.sb_flags
= sb
->sb_flags
;
4536 asb
.sb_flags
&= ~(SB_LOCK
|SB_SEL
|SB_KNOTE
|SB_UPCALL
);
4537 asb
.sb_flags
|= SB_DROP
;
4540 * Ideally we'd bzero() these and preserve the ones we need;
4541 * but to do that we'd need to shuffle things around in the
4542 * sockbuf, and we can't do it now because there are KEXTS
4543 * that are directly referring to the socket structure.
4545 * Setting SB_DROP acts as a barrier to prevent further appends.
4546 * Clearing SB_SEL is done for selthreadclear() below.
4555 sb
->sb_mbtail
= NULL
;
4556 sb
->sb_lastrecord
= NULL
;
4557 sb
->sb_timeo
.tv_sec
= 0;
4558 sb
->sb_timeo
.tv_usec
= 0;
4559 sb
->sb_upcall
= NULL
;
4560 sb
->sb_upcallarg
= NULL
;
4561 sb
->sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
4562 sb
->sb_flags
|= SB_DROP
;
4564 sbunlock(sb
, TRUE
); /* keep socket locked */
4567 * Note that selthreadclear() is called on the original "sb" and
4568 * not the local "asb" because of the way wait queue linkage is
4569 * implemented. Given that selwakeup() may be triggered, SB_SEL
4570 * should no longer be set (cleared above.)
4572 selthreadclear(&sb
->sb_sel
);
4574 if ((pr
->pr_flags
& PR_RIGHTS
) && pr
->pr_domain
->dom_dispose
)
4575 (*pr
->pr_domain
->dom_dispose
)(asb
.sb_mb
);
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Returns:	0			Success
 */
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
    /*
     * If the user gives us more than we wanted, we ignore it,
     * but if we don't get the minimum length the caller
     * wants, we return EINVAL.  On success, sopt->sopt_valsize
     * is set to however much we actually retrieved.
     */
    if ((valsize = sopt->sopt_valsize) < minlen)

    sopt->sopt_valsize = valsize = len;

    if (sopt->sopt_p != kernproc)
        return (copyin(sopt->sopt_val, buf, valsize));

    bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
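/*
 * Illustrative sketch (not built) of the calling pattern used by the
 * socket-option handlers later in this file: an integer-valued option is
 * copied in with both len and minlen set to sizeof (int).  The helper name
 * is hypothetical; the SOF_NOSIGPIPE update mirrors the SO_NOSIGPIPE case
 * in sosetoptlock() below.
 */
#if 0
static int
example_set_int_option(struct socket *so, struct sockopt *sopt)
{
    int optval;
    int error;

    error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval));
    if (error != 0)
        return (error);

    if (optval != 0)
        so->so_flags |= SOF_NOSIGPIPE;
    else
        so->so_flags &= ~SOF_NOSIGPIPE;
    return (0);
}
#endif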
/*
 * sooptcopyin_timeval
 *	Copy in a timeval value into tv_p, and take into account whether
 *	the calling process is 64-bit or 32-bit.  Moved the sanity checking
 *	code here so that we can verify the 64-bit tv_sec value before we
 *	lose the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
 */
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
    if (proc_is64bit(sopt->sopt_p)) {
        struct user64_timeval tv64;

        if (sopt->sopt_valsize < sizeof (tv64))

        sopt->sopt_valsize = sizeof (tv64);
        if (sopt->sopt_p != kernproc) {
            error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
            bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
        if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
            tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)

        tv_p->tv_sec = tv64.tv_sec;
        tv_p->tv_usec = tv64.tv_usec;
    } else {
        struct user32_timeval tv32;

        if (sopt->sopt_valsize < sizeof (tv32))

        sopt->sopt_valsize = sizeof (tv32);
        if (sopt->sopt_p != kernproc) {
            error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
            bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
        /*
         * K64todo "comparison is always false due to
         * limited range of data type"
         */
        if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
            tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)

        tv_p->tv_sec = tv32.tv_sec;
        tv_p->tv_usec = tv32.tv_usec;
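/*
 * Illustrative userspace sketch (not built): setting a receive timeout.
 * The struct timeval copied in here is what sooptcopyin_timeval() validates
 * (tv_usec must be in [0, 1000000)) before it reaches so->so_rcv.sb_timeo
 * via the SO_RCVTIMEO case below.  "fd" and the helper name are hypothetical.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int
example_set_rcv_timeout(int fd, long seconds)
{
    struct timeval tv = { .tv_sec = seconds, .tv_usec = 0 };

    return (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)));
}
#endif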
soopt_cred_check(struct socket *so, int priv, boolean_t allow_root)
    kauth_cred_t cred = NULL;
    proc_t ep = PROC_NULL;

    if (so->so_flags & SOF_DELEGATED) {
        ep = proc_find(so->e_pid);
            cred = kauth_cred_proc_ref(ep);

    uid = kauth_cred_getuid(cred ? cred : so->so_cred);

    /* uid is 0 for root */
    if (uid != 0 || !allow_root)
        error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);

        kauth_cred_unref(&cred);
    if (ep != PROC_NULL)
/*
 * Returns:	0			Success
 *	sooptcopyin:EINVAL
 *	sooptcopyin:EFAULT
 *	sooptcopyin_timeval:EINVAL
 *	sooptcopyin_timeval:EFAULT
 *	sooptcopyin_timeval:EDOM
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	sflt_attach_private:???	[whatever a filter author chooses]
 *	<sf_setoption>:???	[whatever a filter author chooses]
 *
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
 *		<sf_setoption> returns depend on what the filter author causes
 *		their filter to return.
 */
sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
#if CONFIG_MACF_SOCKET
#endif /* MAC_SOCKET */

    if (sopt->sopt_dir != SOPT_SET)
        sopt->sopt_dir = SOPT_SET;

    if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
        (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
        (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
        /* the socket has been shutdown, no more sockopt's */

    error = sflt_setsockopt(so, sopt);
        if (error == EJUSTRETURN)
4756 if (sopt
->sopt_level
!= SOL_SOCKET
) {
4757 if (so
->so_proto
!= NULL
&&
4758 so
->so_proto
->pr_ctloutput
!= NULL
) {
4759 error
= (*so
->so_proto
->pr_ctloutput
)(so
, sopt
);
4762 error
= ENOPROTOOPT
;
4765 * Allow socket-level (SOL_SOCKET) options to be filtered by
4766 * the protocol layer, if needed. A zero value returned from
4767 * the handler means use default socket-level processing as
4768 * done by the rest of this routine. Otherwise, any other
4769 * return value indicates that the option is unsupported.
4771 if (so
->so_proto
!= NULL
&& (error
= so
->so_proto
->pr_usrreqs
->
4772 pru_socheckopt(so
, sopt
)) != 0)
4776 switch (sopt
->sopt_name
) {
4779 error
= sooptcopyin(sopt
, &l
, sizeof (l
), sizeof (l
));
4783 so
->so_linger
= (sopt
->sopt_name
== SO_LINGER
) ?
4784 l
.l_linger
: l
.l_linger
* hz
;
4786 so
->so_options
|= SO_LINGER
;
4788 so
->so_options
&= ~SO_LINGER
;
4794 case SO_USELOOPBACK
:
4800 case SO_TIMESTAMP_MONOTONIC
:
4803 case SO_WANTOOBFLAG
:
4804 case SO_NOWAKEFROMSLEEP
:
4805 case SO_NOAPNFALLBK
:
4806 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4811 so
->so_options
|= sopt
->sopt_name
;
4813 so
->so_options
&= ~sopt
->sopt_name
;
		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto out;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF: {
				struct sockbuf *sb =
				    (sopt->sopt_name == SO_SNDBUF) ?
				    &so->so_snd : &so->so_rcv;
				if (sbreserve(sb, (u_int32_t)optval) == 0) {
					error = ENOBUFS;
					goto out;
				}
				sb->sb_flags |= SB_USRSIZE;
				sb->sb_flags &= ~SB_AUTOSIZE;
				sb->sb_idealsize = (u_int32_t)optval;
				break;
			}
			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT: {
				int space = sbspace(&so->so_snd);
				u_int32_t hiwat = so->so_snd.sb_hiwat;

				if (so->so_snd.sb_flags & SB_UNIX) {
					struct unpcb *unp =
					    (struct unpcb *)(so->so_pcb);
					if (unp != NULL &&
					    unp->unp_conn != NULL) {
						hiwat += unp->unp_conn->unp_cc;
					}
				}

				so->so_snd.sb_lowat =
				    (optval > hiwat) ?
				    hiwat : optval;

				if (space >= so->so_snd.sb_lowat) {
					sowwakeup(so);
				}
				break;
			}
			case SO_RCVLOWAT: {
				int64_t data_len;
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				data_len = so->so_rcv.sb_cc
				    - so->so_rcv.sb_ctl;
				if (data_len >= so->so_rcv.sb_lowat)
					sorwakeup(so);
				break;
			}
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin_timeval(sopt, &tv);
			if (error != 0)
				goto out;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = tv;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = tv;
				break;
			}
			break;

		case SO_NKE: {
			struct so_nke nke;

			error = sooptcopyin(sopt, &nke, sizeof (nke),
			    sizeof (nke));
			if (error != 0)
				goto out;

			error = sflt_attach_internal(so, nke.nke_handle);
			break;
		}
		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOSIGPIPE;
			else
				so->so_flags &= ~SOF_NOSIGPIPE;
			break;

		case SO_NOADDRERR:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOADDRAVAIL;
			else
				so->so_flags &= ~SOF_NOADDRAVAIL;
			break;

		case SO_REUSESHAREUID:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_REUSESHAREUID;
			else
				so->so_flags &= ~SOF_REUSESHAREUID;
			break;

		case SO_NOTIFYCONFLICT:
			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOTIFYCONFLICT;
			else
				so->so_flags &= ~SOF_NOTIFYCONFLICT;
			break;

		case SO_RESTRICTIONS:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;

			error = so_set_restrictions(so, optval);
			break;

		case SO_AWDL_UNRESTRICTED:
			if (SOCK_DOM(so) != PF_INET &&
			    SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				goto out;
			if (optval != 0) {
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_AWDL, false);
				if (error == 0)
					inp_set_awdl_unrestricted(
					    sotoinpcb(so));
			} else
				inp_clear_awdl_unrestricted(sotoinpcb(so));
			break;

		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				goto out;
			if (optval != 0 &&
			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_INTCOPROC, false);
				if (error == 0)
					inp_set_intcoproc_allowed(
					    sotoinpcb(so));
			} else if (optval == 0)
				inp_clear_intcoproc_allowed(sotoinpcb(so));
			break;

		case SO_LABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0)
				goto out;

			error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif /* CONFIG_MACF_SOCKET */
			break;

		case SO_UPCALLCLOSEWAIT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_UPCALLCLOSEWAIT;
			else
				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
			break;

		case SO_RANDOMPORT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_BINDRANDOMPORT;
			else
				so->so_flags &= ~SOF_BINDRANDOMPORT;
			break;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
			    sizeof (sonpx));
			if (error != 0)
				goto out;
			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
				error = EINVAL;
				goto out;
			}
			/*
			 * Only one bit defined for now
			 */
			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
				if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
					so->so_flags |= SOF_NPX_SETOPTSHUT;
				else
					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
			}
			break;
		}
		case SO_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
				error = so_set_net_service_type(so, netsvc);
				goto out;
			}
			error = so_set_traffic_class(so, optval);
			if (error != 0)
				goto out;
			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
			break;
		}

		case SO_RECV_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
			else
				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
			break;
		}

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG: {
			struct so_tcdbg so_tcdbg;

			error = sooptcopyin(sopt, &so_tcdbg,
			    sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
			if (error != 0)
				goto out;
			error = so_set_tcdbg(so, &so_tcdbg);
			if (error != 0)
				goto out;
			break;
		}
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
			if (error != 0)
				goto out;
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
			else
				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
			break;

		case SO_DEFUNCTOK:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
				if (error == 0)
					error = EBADF;
				goto out;
			}
			/*
			 * Any process can set SO_DEFUNCTOK (clear
			 * SOF_NODEFUNCT), but only root can clear
			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
			 */
			if (optval == 0 &&
			    kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			if (optval)
				so->so_flags &= ~SOF_NODEFUNCT;
			else
				so->so_flags |= SOF_NODEFUNCT;

			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				char s[MAX_IPv6_STR_LEN];
				char d[MAX_IPv6_STR_LEN];
				struct inpcb *inp = sotoinpcb(so);

				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
				    "[%s %s:%d -> %s:%d] is now marked "
				    "as %seligible for "
				    "defunct\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    (SOCK_TYPE(so) == SOCK_STREAM) ?
				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
				    ((SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_laddr.s_addr :
				    (void *)&inp->in6p_laddr), s, sizeof (s)),
				    ntohs(inp->in6p_lport),
				    inet_ntop(SOCK_DOM(so),
				    (SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_faddr.s_addr :
				    (void *)&inp->in6p_faddr, d, sizeof (d)),
				    ntohs(inp->in6p_fport),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			} else {
				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
				    "is now marked as %seligible for "
				    "defunct\n",
				    __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    SOCK_DOM(so), SOCK_TYPE(so),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			}
			break;
		case SO_ISDEFUNCT:
			/* This option is not settable */
			error = EINVAL;
			break;

		case SO_OPPORTUNISTIC:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error == 0)
				error = so_set_opportunistic(so, optval);
			break;

		case SO_FLUSH:
			/* This option is handled by lower layer(s) */
			error = 0;
			break;

		case SO_RECV_ANYIF:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error == 0)
				error = so_set_recv_anyif(so, optval);
			break;

		case SO_TRAFFIC_MGT_BACKGROUND: {
			/* This option is handled by lower layer(s) */
			error = 0;
			break;
		}

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_set(so, sopt);
			break;
#endif	/* FLOW_DIVERT */

		case SO_DELEGATED:
			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval))) != 0)
				break;

			error = so_set_effective_pid(so, optval, sopt->sopt_p);
			break;

		case SO_DELEGATED_UUID: {
			uuid_t euuid;

			if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
			    sizeof (euuid))) != 0)
				break;

			error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
			break;
		}

#if NECP
		case SO_NECP_ATTRIBUTES:
			error = necp_set_socket_attributes(so, sopt);
			break;

		case SO_NECP_CLIENTUUID:
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			struct inpcb *inp = sotoinpcb(so);
			if (!uuid_is_null(inp->necp_client_uuid)) {
				// Clear out the old client UUID if present
				necp_inpcb_remove_cb(inp);
			}

			error = sooptcopyin(sopt, &inp->necp_client_uuid,
			    sizeof(uuid_t), sizeof(uuid_t));
			if (error != 0)
				goto out;

			if (uuid_is_null(inp->necp_client_uuid)) {
				error = EINVAL;
				goto out;
			}

			error = necp_client_register_socket_flow(so->last_pid,
			    inp->necp_client_uuid, inp);
			if (error != 0) {
				uuid_clear(inp->necp_client_uuid);
				goto out;
			}

			if (inp->inp_lport != 0) {
				// There is a bound local port, so this is not
				// a fresh socket. Assign to the client.
				necp_client_assign_from_socket(so->last_pid,
				    inp->necp_client_uuid, inp);
			}
			break;
#endif /* NECP */

		case SO_EXTENDED_BK_IDLE:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error == 0)
				error = so_set_extended_bk_idle(so, optval);
			break;

		case SO_MARK_CELLFALLBACK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				goto out;
			if (optval < 0) {
				error = EINVAL;
				goto out;
			}
			if (optval == 0)
				so->so_flags1 &= ~SOF1_CELLFALLBACK;
			else
				so->so_flags1 |= SOF1_CELLFALLBACK;
			break;

		case SO_NET_SERVICE_TYPE: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				goto out;
			error = so_set_net_service_type(so, optval);
			break;
		}

		case SO_QOSMARKING_POLICY_OVERRIDE:
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
			if (error != 0)
				goto out;
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
			else
				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) so->so_proto->pr_ctloutput(so, sopt);
		}
	}
out:
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}
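
/*
 * Illustrative sketch (not part of the original file): the SO_LINGER case
 * above stores l_linger in clock ticks, while SO_LINGER_SEC takes seconds
 * (it is multiplied by hz).  A user-space caller sets either one with the
 * usual struct linger:
 *
 *	#include <sys/socket.h>
 *
 *	int
 *	set_linger(int s, int seconds)
 *	{
 *		struct linger l = { .l_onoff = 1, .l_linger = seconds };
 *
 *		// SO_LINGER_SEC keeps the value in seconds on Darwin
 *		return (setsockopt(s, SOL_SOCKET, SO_LINGER_SEC,
 *		    &l, sizeof (l)));
 *	}
 */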
/* Helper routines for getsockopt */
int
sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
	}
	return (error);
}
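
/*
 * Illustrative sketch (not part of the original file): because
 * sooptcopyout() truncates to the caller's buffer and reports only how
 * much was copied, a user-space getsockopt() caller should not assume the
 * returned length equals the full kernel-side value.
 *
 *	#include <sys/socket.h>
 *
 *	int
 *	get_sndbuf(int s, int *bufsize)
 *	{
 *		socklen_t len = sizeof (*bufsize);
 *
 *		// on return, len holds the number of bytes actually copied
 *		return (getsockopt(s, SOL_SOCKET, SO_SNDBUF, bufsize, &len));
 *	}
 */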
static int
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
{
	int	error;
	size_t	len;
	struct user64_timeval tv64 = {};
	struct user32_timeval tv32 = {};
	const void *val;
	size_t	valsize;

	error = 0;
	if (proc_is64bit(sopt->sopt_p)) {
		len = sizeof (tv64);
		tv64.tv_sec = tv_p->tv_sec;
		tv64.tv_usec = tv_p->tv_usec;
		val = &tv64;
	} else {
		len = sizeof (tv32);
		tv32.tv_sec = tv_p->tv_sec;
		tv32.tv_usec = tv_p->tv_usec;
		val = &tv32;
	}
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(val, sopt->sopt_val, valsize);
		else
			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
	}
	return (error);
}
/*
 * Returns:	0			Success
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	<sf_getoption>:???
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#if CONFIG_MACF_SOCKET
	struct mac extmac;
#endif /* CONFIG_MACF_SOCKET */

	if (sopt->sopt_dir != SOPT_GET)
		sopt->sopt_dir = SOPT_GET;

	if (dolock)
		socket_lock(so, 1);

	error = sflt_getsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0)
			goto out;

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof (l));
			break;

		case SO_USELOOPBACK:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof (optval));
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_NREAD:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int pkt_total = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m1->m_type == MT_DATA ||
					    m1->m_type == MT_HEADER ||
					    m1->m_type == MT_OOBDATA)
						pkt_total += m1->m_len;
					m1 = m1->m_next;
				}
				optval = pkt_total;
			} else {
				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
			}
			goto integer;

		case SO_NUMRCVPKT:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int cnt = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m1->m_type == MT_DATA ||
					    m1->m_type == MT_HEADER ||
					    m1->m_type == MT_OOBDATA)
						cnt += 1;
					m1 = m1->m_nextpkt;
				}
				optval = cnt;
				goto integer;
			}
			break;

		case SO_NWRITE:
			optval = so->so_snd.sb_cc;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF: {
			u_int32_t hiwat = so->so_snd.sb_hiwat;

			if (so->so_snd.sb_flags & SB_UNIX) {
				struct unpcb *unp =
				    (struct unpcb *)(so->so_pcb);
				if (unp != NULL && unp->unp_conn != NULL) {
					hiwat += unp->unp_conn->unp_cc;
				}
			}

			optval = hiwat;
			goto integer;
		}
		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			error = sooptcopyout_timeval(sopt, &tv);
			break;

		case SO_NOSIGPIPE:
			optval = (so->so_flags & SOF_NOSIGPIPE);
			goto integer;

		case SO_NOADDRERR:
			optval = (so->so_flags & SOF_NOADDRAVAIL);
			goto integer;

		case SO_REUSESHAREUID:
			optval = (so->so_flags & SOF_REUSESHAREUID);
			goto integer;

		case SO_NOTIFYCONFLICT:
			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
			goto integer;

		case SO_RESTRICTIONS:
			optval = so_get_restrictions(so);
			goto integer;

		case SO_AWDL_UNRESTRICTED:
			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_awdl_unrestricted(
				    sotoinpcb(so));
				goto integer;
			} else
				error = EOPNOTSUPP;
			break;

		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_intcoproc_allowed(
				    sotoinpcb(so));
				goto integer;
			} else
				error = EOPNOTSUPP;
			break;

		case SO_LABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0 ||
			    (error = mac_socket_label_get(proc_ucred(
			    sopt->sopt_p), so, &extmac)) != 0)
				break;

			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#else
			error = EOPNOTSUPP;
#endif /* CONFIG_MACF_SOCKET */
			break;

		case SO_PEERLABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0 ||
			    (error = mac_socketpeer_label_get(proc_ucred(
			    sopt->sopt_p), so, &extmac)) != 0)
				break;

			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#else
			error = EOPNOTSUPP;
#endif /* CONFIG_MACF_SOCKET */
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
			goto integer;
#endif
		case SO_RANDOMPORT:
			optval = (so->so_flags & SOF_BINDRANDOMPORT);
			goto integer;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
			    SONPX_SETOPTSHUT : 0;
			sonpx.npx_mask = SONPX_MASK_VALID;

			error = sooptcopyout(sopt, &sonpx,
			    sizeof (struct so_np_extensions));
			break;
		}

		case SO_TRAFFIC_CLASS:
			optval = so->so_traffic_class;
			goto integer;

		case SO_RECV_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
			goto integer;

		case SO_TRAFFIC_CLASS_STATS:
			error = sooptcopyout(sopt, &so->so_tc_stats,
			    sizeof (so->so_tc_stats));
			break;

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG:
			error = sogetopt_tcdbg(so, sopt);
			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
			goto integer;

		case SO_DEFUNCTOK:
			optval = !(so->so_flags & SOF_NODEFUNCT);
			goto integer;

		case SO_ISDEFUNCT:
			optval = (so->so_flags & SOF_DEFUNCT);
			goto integer;

		case SO_OPPORTUNISTIC:
			optval = so_get_opportunistic(so);
			goto integer;

		case SO_FLUSH:
			/* This option is not gettable */
			error = EINVAL;
			break;

		case SO_RECV_ANYIF:
			optval = so_get_recv_anyif(so);
			goto integer;

		case SO_TRAFFIC_MGT_BACKGROUND:
			/* This option is handled by lower layer(s) */
			if (so->so_proto != NULL &&
			    so->so_proto->pr_ctloutput != NULL) {
				(void) so->so_proto->pr_ctloutput(so, sopt);
			}
			break;

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_get(so, sopt);
			break;
#endif	/* FLOW_DIVERT */

#if NECP
		case SO_NECP_ATTRIBUTES:
			error = necp_get_socket_attributes(so, sopt);
			break;

		case SO_NECP_CLIENTUUID:
		{
			uuid_t *ncu;

			if (SOCK_DOM(so) == PF_MULTIPATH) {
				ncu = &mpsotomppcb(so)->necp_client_uuid;
			} else if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				ncu = &sotoinpcb(so)->necp_client_uuid;
			} else {
				error = EINVAL;
				goto out;
			}

			error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
			break;
		}
#endif /* NECP */

#if CONTENT_FILTER
		case SO_CFIL_SOCK_ID: {
			cfil_sock_id_t sock_id;

			sock_id = cfil_sock_id_from_socket(so);

			error = sooptcopyout(sopt, &sock_id,
			    sizeof(cfil_sock_id_t));
			break;
		}
#endif	/* CONTENT_FILTER */

		case SO_EXTENDED_BK_IDLE:
			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
			goto integer;

		case SO_MARK_CELLFALLBACK:
			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
			    ? 1 : 0;
			goto integer;

		case SO_NET_SERVICE_TYPE: {
			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
				optval = so->so_netsvctype;
			else
				optval = NET_SERVICE_TYPE_BE;
			goto integer;
		}

		case SO_NETSVC_MARKING_LEVEL:
			optval = so_get_netsvc_marking_level(so);
			goto integer;

		default:
			error = ENOPROTOOPT;
			break;
		}
	}
out:
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}
/*
 * The size limits on our soopt_getm are different from those on FreeBSD.
 * We limit the size of options to MCLBYTES.  This will have to change
 * if we need to define options that need more space than MCLBYTES.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;
	int how;

	if (sopt_size <= 0 || sopt_size > MCLBYTES)
		return (EMSGSIZE);

	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	if (sopt_size > MLEN) {
		MCLGET(m, how);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (ENOBUFS);
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		if (sopt_size > MLEN) {
			MCLGET(m, how);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(*mp);
				m_freem(m);
				return (ENOBUFS);
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}
/* copyin sopt data into mbuf chain */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
			    mtod(m, char *), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		m = m->m_next;
	}
	/* enough space should have been allocated by ip6_sooptmcopyin() */
	if (m != NULL)
		panic("soopt_mcopyin");
	return (0);
}
/* copyout mbuf chain data into soopt */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == USER_ADDR_NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else {
			bcopy(mtod(m, char *),
			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}
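
/*
 * Illustrative sketch (not part of the original file): these helpers are
 * used together by callers (e.g. the IPv6 option code) that want the
 * option payload in an mbuf chain rather than a flat buffer.  A minimal
 * kernel-side sequence, under that assumption, looks like:
 *
 *	struct mbuf *m = NULL;
 *	int error;
 *
 *	error = soopt_getm(sopt, &m);		// size chain from sopt_valsize
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);	// fill chain from user space
 *	if (error == 0) {
 *		// option-specific processing of the mbuf chain goes here
 *		m_freem(m);
 *	}
 *	// on copyin failure soopt_mcopyin() has already freed the chain
 */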
void
sohasoutofband(struct socket *so)
{
	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0)
		proc_signal(so->so_pgid, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);
	if (so->so_rcv.sb_flags & SB_KNOTE) {
		KNOTE(&so->so_rcv.sb_sel.si_note,
		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
	}
}
int
sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
{
#pragma unused(cred)
	struct proc *p = current_proc();
	int revents = 0;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_rcv.sb_flags |= SB_SEL;
			selrecord(p, &so->so_rcv.sb_sel, wql);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_snd.sb_flags |= SB_SEL;
			selrecord(p, &so->so_snd.sb_sel, wql);
		}
	}

	socket_unlock(so, 1);
	return (revents);
}
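
/*
 * Illustrative sketch (not part of the original file): the POLLPRI |
 * POLLRDBAND handling above is what lets a user-space poll() caller wait
 * for out-of-band data on a socket.
 *
 *	#include <poll.h>
 *
 *	int
 *	wait_for_oob(int s, int timeout_ms)
 *	{
 *		struct pollfd pfd = { .fd = s, .events = POLLPRI };
 *
 *		// returns > 0 with POLLPRI set once the OOB mark is reached
 *		return (poll(&pfd, 1, timeout_ms));
 *	}
 */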
int
soo_kqfilter(struct fileproc *fp, struct knote *kn,
    struct kevent_internal_s *kev, vfs_context_t ctx)
{
#if !CONFIG_MACF_SOCKET
#pragma unused(ctx)
#endif /* !CONFIG_MACF_SOCKET */
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int result;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

#if CONFIG_MACF_SOCKET
	if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
	    kn, so) != 0) {
		socket_unlock(so, 1);
		kn->kn_flags = EV_ERROR;
		kn->kn_data = EPERM;
		return 0;
	}
#endif /* CONFIG_MACF_SOCKET */

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_filtid = EVFILTID_SOREAD;
		break;
	case EVFILT_WRITE:
		kn->kn_filtid = EVFILTID_SOWRITE;
		break;
	case EVFILT_SOCK:
		kn->kn_filtid = EVFILTID_SCK;
		break;
	case EVFILT_EXCEPT:
		kn->kn_filtid = EVFILTID_SOEXCEPT;
		break;
	default:
		socket_unlock(so, 1);
		kn->kn_flags = EV_ERROR;
		kn->kn_data = EINVAL;
		return 0;
	}

	/*
	 * call the appropriate sub-filter attach
	 * with the socket still locked
	 */
	result = knote_fops(kn)->f_attach(kn, kev);

	socket_unlock(so, 1);

	return result;
}
static int
filt_soread_common(struct knote *kn, struct socket *so)
{
	if (so->so_options & SO_ACCEPTCONN) {
		int is_not_empty;

		/*
		 * Radar 6615193 handle the listen case dynamically
		 * for kqueue read filter. This allows to call listen()
		 * after registering the kqueue EVFILT_READ.
		 */

		kn->kn_data = so->so_qlen;
		is_not_empty = ! TAILQ_EMPTY(&so->so_comp);

		return (is_not_empty);
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data. We therefore exclude any
	 * control bytes.
	 */
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			return (1);
		}
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}

	if (so->so_error) {	/* temporary udp error */
		return (1);
	}

	int64_t	lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat)
			lowwat = so->so_rcv.sb_hiwat;
		else if (kn->kn_sdata > lowwat)
			lowwat = kn->kn_sdata;
	}

	/*
	 * The order below is important. Since NOTE_LOWAT
	 * overrides sb_lowat, check for NOTE_LOWAT case
	 * first.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= lowwat);

	return (so->so_rcv.sb_cc >= lowwat);
}
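
/*
 * Illustrative sketch (not part of the original file): NOTE_LOWAT as
 * consumed above is supplied from user space in the kevent data field when
 * registering EVFILT_READ; the derived low-water mark is clamped to the
 * receive buffer's high-water mark.
 *
 *	#include <sys/event.h>
 *	#include <sys/time.h>
 *
 *	int
 *	watch_read_lowat(int kq, int s, int64_t lowat)
 *	{
 *		struct kevent kev;
 *
 *		// fire only once at least 'lowat' bytes of data are queued
 *		EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
 *		return (kevent(kq, &kev, 1, NULL, 0, NULL));
 *	}
 */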
static int
filt_sorattach(struct knote *kn, __unused struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	/*
	 * If the caller explicitly asked for OOB results (e.g. poll())
	 * from EVFILT_READ, then save that off in the hookid field
	 * and reserve the kn_flags EV_OOBAND bit for output only.
	 */
	if (kn->kn_filter == EVFILT_READ &&
	    kn->kn_flags & EV_OOBAND) {
		kn->kn_flags &= ~EV_OOBAND;
		kn->kn_hookid = EV_OOBAND;
	} else {
		kn->kn_hookid = 0;
	}
	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
		so->so_rcv.sb_flags |= SB_KNOTE;

	/* indicate if event is already fired */
	return filt_soread_common(kn, so);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	socket_lock(so, 1);
	if (so->so_rcv.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
			so->so_rcv.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);
}

static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int retval;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_lock(so, 1);

	retval = filt_soread_common(kn, so);

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);

	return retval;
}

static int
filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int retval;

	socket_lock(so, 1);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;
	if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
		kn->kn_udata = kev->udata;

	/* determine if changes result in fired events */
	retval = filt_soread_common(kn, so);

	socket_unlock(so, 1);

	return retval;
}

static int
filt_sorprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev)
{
#pragma unused(data)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int retval;

	socket_lock(so, 1);
	retval = filt_soread_common(kn, so);
	if (retval) {
		*kev = kn->kn_kevent;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_fflags = 0;
			kn->kn_data = 0;
		}
	}
	socket_unlock(so, 1);

	return retval;
}
int
so_wait_for_if_feedback(struct socket *so)
{
	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
	    (so->so_state & SS_ISCONNECTED)) {
		struct inpcb *inp = sotoinpcb(so);
		if (INP_WAIT_FOR_IF_FEEDBACK(inp))
			return (1);
	}
	return (0);
}

static int
filt_sowrite_common(struct knote *kn, struct socket *so)
{
	int ret = 0;

	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return 1;
	}
	if (so->so_error) {	/* temporary udp error */
		return 1;
	}
	if (!socanwrite(so)) {
		return 0;
	}
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		return 1;
	}
	int64_t lowwat = so->so_snd.sb_lowat;
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_snd.sb_hiwat)
			lowwat = so->so_snd.sb_hiwat;
		else if (kn->kn_sdata > lowwat)
			lowwat = kn->kn_sdata;
	}
	if (kn->kn_data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
			else {
				return 1;
			}
		} else {
			ret = 1;
		}
	}
	if (so_wait_for_if_feedback(so))
		ret = 0;
	return (ret);
}

static int
filt_sowattach(struct knote *kn, __unused struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
		so->so_snd.sb_flags |= SB_KNOTE;

	/* determine if it's already fired */
	return filt_sowrite_common(kn, so);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	socket_lock(so, 1);
	if (so->so_snd.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
			so->so_snd.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);
}

static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_lock(so, 1);

	ret = filt_sowrite_common(kn, so);

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);

	return ret;
}

static int
filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret;

	socket_lock(so, 1);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;
	if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
		kn->kn_udata = kev->udata;

	/* determine if these changes result in a triggered event */
	ret = filt_sowrite_common(kn, so);

	socket_unlock(so, 1);

	return ret;
}

static int
filt_sowprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev)
{
#pragma unused(data)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret;

	socket_lock(so, 1);
	ret = filt_sowrite_common(kn, so);
	if (ret) {
		*kev = kn->kn_kevent;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_fflags = 0;
			kn->kn_data = 0;
		}
	}
	socket_unlock(so, 1);
	return ret;
}
static int
filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
{
	int ret = 0;
	uint32_t level_trigger = 0;

	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO))
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
	}

	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hookid &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hookid &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	if (so->so_error != 0) {
		ret = 1;
		kn->kn_data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggered events that are already delivered */
	level_trigger &= kn->kn_hookid;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggered events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0)
		ret = 1;

	return (ret);
}
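
/*
 * Illustrative sketch (not part of the original file): EVFILT_SOCK is a
 * private filter, but a registration that exercises the level-triggered
 * handling above would look roughly like this (assuming the private
 * definitions are visible to the caller).
 *
 *	struct kevent kev;
 *
 *	// ask for connection state transitions on socket s
 *	EV_SET(&kev, s, EVFILT_SOCK, EV_ADD | EV_CLEAR,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_SUSPEND | NOTE_RESUME,
 *	    0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */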
static int
filt_sockattach(struct knote *kn, __unused struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	kn->kn_hookid = 0;
	if (KNOTE_ATTACH(&so->so_klist, kn))
		so->so_flags |= SOF_KNOTE;

	/* determine if event already fired */
	return filt_sockev_common(kn, so, 0);
}

static void
filt_sockdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	socket_lock(so, 1);
	if ((so->so_flags & SOF_KNOTE) != 0)
		if (KNOTE_DETACH(&so->so_klist, kn))
			so->so_flags &= ~SOF_KNOTE;
	socket_unlock(so, 1);
}

static int
filt_sockev(struct knote *kn, long hint)
{
	int ret = 0, locked = 0;
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	long ev_hint = (hint & SO_FILT_HINT_EV);

	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
		socket_lock(so, 1);
		locked = 1;
	}

	ret = filt_sockev_common(kn, so, ev_hint);

	if (locked)
		socket_unlock(so, 1);

	return ret;
}

/*
 *	filt_socktouch - update event state
 */
static int
filt_socktouch(
	struct knote *kn,
	struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	uint32_t changed_flags;
	int ret;

	socket_lock(so, 1);

	/* save off the [result] data and fflags */
	changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;
	if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
		kn->kn_udata = kev->udata;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kev->fflags;

	/*
	 * Since we keep track of events that are already
	 * delivered, if any of those events are not requested
	 * anymore the state related to them can be reset
	 */
	kn->kn_hookid &=
	    ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);

	/* determine if we have events to deliver */
	ret = filt_sockev_common(kn, so, 0);

	socket_unlock(so, 1);

	return ret;
}

/*
 *	filt_sockprocess - query event fired state and return data
 */
static int
filt_sockprocess(
	struct knote *kn,
	struct filt_process_s *data,
	struct kevent_internal_s *kev)
{
#pragma unused(data)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret = 0;

	socket_lock(so, 1);

	ret = filt_sockev_common(kn, so, 0);
	if (ret) {
		*kev = kn->kn_kevent;

		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * at least once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0)
			kn->kn_hookid |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered.
		 */
		if (kn->kn_fflags & NOTE_SUSPEND)
			kn->kn_hookid &= ~NOTE_RESUME;
		if (kn->kn_fflags & NOTE_RESUME)
			kn->kn_hookid &= ~NOTE_SUSPEND;

		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
	}

	socket_unlock(so, 1);

	return ret;
}
void
get_sockev_state(struct socket *so, u_int32_t *statep)
{
	u_int32_t state = *(statep);

	/*
	 * If the state variable is already used by a previous event,
	 * reset it.
	 */
	if (state != 0)
		return;

	if (so->so_state & SS_ISCONNECTED)
		state |= SOCKEV_CONNECTED;
	else
		state &= ~(SOCKEV_CONNECTED);
	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
	*(statep) = state;
}

#define	SO_LOCK_HISTORY_STR_LEN \
	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof (lock_history_str));
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += snprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return (lock_history_str);
}
void
socket_lock(struct socket *so, int refcount)
{
	void *lr_saved;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount)
			so->so_usecount++;
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
	}
}

void
socket_lock_assert_owned(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
}

int
socket_try_lock(struct socket *so)
{
	lck_mtx_t *mtx;

	if (so->so_proto->pr_getlock != NULL)
		mtx = (*so->so_proto->pr_getlock)(so, 0);
	else
		mtx = so->so_proto->pr_domain->dom_mtx;

	return (lck_mtx_try_lock(mtx));
}

void
socket_unlock(struct socket *so, int refcount)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto == NULL) {
		panic("%s: null so_proto so=%p\n", __func__, so);
		/* NOTREACHED */
	}

	if (so && so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;

		if (refcount) {
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0)
				sofreelastref(so, 1);
		}
		lck_mtx_unlock(mutex_held);
	}
}

/* Called with socket locked, will unlock socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}

void
soreference(struct socket *so)
{
	socket_lock(so, 1);	/* locks & take one reference on socket */
	socket_unlock(so, 0);	/* unlock only */
}

void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
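
/*
 * Illustrative sketch (not part of the original file): the usual in-kernel
 * pattern pairs a lock that takes a use count with an unlock that drops it,
 * so the socket cannot be freed while it is being used.
 *
 *	socket_lock(so, 1);	// lock and take a reference
 *	// ... operate on the locked socket ...
 *	socket_unlock(so, 1);	// drop the reference; may free on last ref
 */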
/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters.  Caller must ensure to hold
 * the socket lock.
 */
void
somultipages(struct socket *so, boolean_t set)
{
	if (set)
		so->so_flags |= SOF_MULTIPAGES;
	else
		so->so_flags &= ~SOF_MULTIPAGES;
}

void
soif2kcl(struct socket *so, boolean_t set)
{
	if (set)
		so->so_flags1 |= SOF1_IF_2KCL;
	else
		so->so_flags1 &= ~SOF1_IF_2KCL;
}

int
so_isdstlocal(struct socket *so)
{
	struct inpcb *inp = (struct inpcb *)so->so_pcb;

	if (SOCK_DOM(so) == PF_INET)
		return (inaddr_local(inp->inp_faddr));
	else if (SOCK_DOM(so) == PF_INET6)
		return (in6addr_local(&inp->in6p_faddr));

	return (0);
}
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "is not eligible for defunct "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return (err);
		}
		so->so_flags &= ~SOF_NODEFUNCT;
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] defunct by force\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
			    "level %d) extend bk idle so 0x%llx rcv hw %d "
			    "cc %d\n",
			    __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
			return (err);
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
	    "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
	    proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
	    level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
	    SOCK_TYPE(so), defunct ? "is already" : "marked as",
	    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");

	return (err);
}
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT)
		goto done;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
		    "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
		    inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
		    (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
		    s, sizeof (s)), ntohs(inp->in6p_lport),
		    inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
		    (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
		    d, sizeof (d)), ntohs(inp->in6p_fport),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags,
		    rcv->sb_flags, snd->sb_flags);
	} else {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK)
		sbunlock(rcv, TRUE);	/* keep socket locked */
	if (snd->sb_flags & SB_LOCK)
		sbunlock(snd, TRUE);	/* keep socket locked */

	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED))
		(void) soisdisconnected(so);

	if (so->so_error == 0)
		so->so_error = EBADF;

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return (0);
}
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0)
		socket_lock(so, 1);

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0)
		socket_unlock(so, 1);

	return (0);
}
/*
 * Does not attempt to account for sockets that are delegated from
 * the current process
 */
int
so_set_extended_bk_idle(struct socket *so, int optval)
{
	int error = 0;

	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
	    SOCK_PROTO(so) != IPPROTO_TCP) {
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
		error = EOPNOTSUPP;
	} else if (optval == 0) {
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;

		soresume(current_proc(), so, 1);
	} else {
		struct proc *p = current_proc();
		int i;
		struct filedesc *fdp;
		int count = 0;

		/*
		 * Unlock socket to avoid lock ordering issue with
		 * the proc fd table lock
		 */
		socket_unlock(so, 0);

		proc_fdlock(p);

		fdp = p->p_fd;
		for (i = 0; i < fdp->fd_nfiles; i++) {
			struct fileproc *fp = fdp->fd_ofiles[i];
			struct socket *so2;

			if (fp == NULL ||
			    (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
			    FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
				continue;

			so2 = (struct socket *)fp->f_fglob->fg_data;
			if (so != so2 &&
			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
				count++;
			if (count >= soextbkidlestat.so_xbkidle_maxperproc)
				break;
		}
		proc_fdunlock(p);

		socket_lock(so, 0);

		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
			error = EBUSY;
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
			error = EBUSY;
		} else {
			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
		}
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
		    "%s marked for extended bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    "is" : "not");
	}

	return (error);
}

static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}

void
so_drain_extended_bk_idle(struct socket *so)
{
	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		/*
		 * Only penalize sockets that have outstanding data
		 */
		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
		}
	}
}
/*
 * Return value tells if the socket is still in extended background idle
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return (ret);
}

void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct filedesc *fdp;
		int i;

		proc_fdlock(p);
		fdp = p->p_fd;
		for (i = 0; i < fdp->fd_nfiles; i++) {
			struct fileproc *fp;
			struct socket *so;

			fp = fdp->fd_ofiles[i];
			if (fp == NULL ||
			    (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
			    FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
				continue;

			so = (struct socket *)fp->f_fglob->fg_data;
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}
__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
{
	int ret = 0;

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		if (optval)
			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
		else
			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
	}

	return (ret);
}

__private_extern__ int
so_get_recv_anyif(struct socket *so)
{
	int ret = 0;

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
	}

	return (ret);
}

int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0)
		return (0);
#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH)
		mptcp_set_restrictions(so);

	return (0);
}

uint32_t
so_get_restrictions(struct socket *so)
{
	return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT |
	    SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
}
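
/*
 * Illustrative sketch (not part of the original file): SO_RESTRICTIONS is a
 * private option, but a framework acting on behalf of an application would
 * apply the trapdoor-style deny flags described above with an ordinary
 * setsockopt() call (assuming the private constants are visible).
 *
 *	uint32_t vals = SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE;
 *
 *	// once set, these restrictions cannot be cleared for the
 *	// lifetime of the socket
 *	(void) setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS,
 *	    &vals, sizeof (vals));
 */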
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (epid != so->last_pid || epid != proc_pid(p)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL)
		proc_rele(ep);

	return (error);
}
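
/*
 * Illustrative sketch (not part of the original file): SO_DELEGATED is the
 * socket option that reaches so_set_effective_pid(); a privileged process
 * delegating a socket on behalf of another pid would issue it as below
 * (the caller needs PRIV_NET_PRIVILEGED_SOCKET_DELEGATE unless it is the
 * socket's real owner or is delegating to itself).
 *
 *	int epid = target_pid;	// hypothetical effective pid
 *
 *	(void) setsockopt(s, SOL_SOCKET, SO_DELEGATED,
 *	    &epid, sizeof (epid));
 */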
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof (uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}

done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return (error);
}
void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));

	bzero(&ev_msg, sizeof (ev_msg));
	ev_msg.vendor_code	= KEV_VENDOR_APPLE;
	ev_msg.kev_class	= KEV_NETWORK_CLASS;
	ev_msg.kev_subclass	= KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code	= ev_code;

	ev_msg.dv[0].data_ptr	= ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev;
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	bzero(&ev, sizeof(ev));
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof (ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof (ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof (ev));
		}
	}
	if (socksa != NULL)
		FREE(socksa, M_SONAME);
	if (peersa != NULL)
		FREE(peersa, M_SONAME);
}