2 * Copyright (c) 1998-2015 Apple Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
92 #include <sys/uio_internal.h>
94 #include <sys/kdebug.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/ntstat.h>
102 #include <net/content_filter.h>
103 #include <netinet/in.h>
104 #include <netinet/in_pcb.h>
105 #include <netinet/ip6.h>
106 #include <netinet6/ip6_var.h>
107 #include <netinet/flow_divert.h>
108 #include <kern/zalloc.h>
109 #include <kern/locks.h>
110 #include <machine/limits.h>
111 #include <libkern/OSAtomic.h>
112 #include <pexpert/pexpert.h>
113 #include <kern/assert.h>
114 #include <kern/task.h>
115 #include <sys/kpi_mbuf.h>
116 #include <sys/mcache.h>
117 #include <sys/unpcb.h>
120 #include <security/mac.h>
121 #include <security/mac_framework.h>
125 #include <netinet/mp_pcb.h>
126 #include <netinet/mptcp_var.h>
127 #endif /* MULTIPATH */
129 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
131 #if DEBUG || DEVELOPMENT
132 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
134 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
137 /* TODO: this should be in a header file somewhere */
138 extern char *proc_name_address(void *p
);
140 static u_int32_t so_cache_hw
; /* High water mark for socache */
141 static u_int32_t so_cache_timeouts
; /* number of timeouts */
142 static u_int32_t so_cache_max_freed
; /* max freed per timeout */
143 static u_int32_t cached_sock_count
= 0;
144 STAILQ_HEAD(, socket
) so_cache_head
;
145 int max_cached_sock_count
= MAX_CACHED_SOCKETS
;
146 static u_int32_t so_cache_time
;
147 static int socketinit_done
;
148 static struct zone
*so_cache_zone
;
150 static lck_grp_t
*so_cache_mtx_grp
;
151 static lck_attr_t
*so_cache_mtx_attr
;
152 static lck_grp_attr_t
*so_cache_mtx_grp_attr
;
153 static lck_mtx_t
*so_cache_mtx
;
155 #include <machine/limits.h>
157 static void filt_sordetach(struct knote
*kn
);
158 static int filt_soread(struct knote
*kn
, long hint
);
159 static void filt_sowdetach(struct knote
*kn
);
160 static int filt_sowrite(struct knote
*kn
, long hint
);
161 static void filt_sockdetach(struct knote
*kn
);
162 static int filt_sockev(struct knote
*kn
, long hint
);
163 static void filt_socktouch(struct knote
*kn
, struct kevent_internal_s
*kev
,
166 static int sooptcopyin_timeval(struct sockopt
*, struct timeval
*);
167 static int sooptcopyout_timeval(struct sockopt
*, const struct timeval
*);
169 static struct filterops soread_filtops
= {
171 .f_detach
= filt_sordetach
,
172 .f_event
= filt_soread
,
175 static struct filterops sowrite_filtops
= {
177 .f_detach
= filt_sowdetach
,
178 .f_event
= filt_sowrite
,
181 static struct filterops sock_filtops
= {
183 .f_detach
= filt_sockdetach
,
184 .f_event
= filt_sockev
,
185 .f_touch
= filt_socktouch
,
188 SYSCTL_DECL(_kern_ipc
);
190 #define EVEN_MORE_LOCKING_DEBUG 0
192 int socket_debug
= 0;
193 SYSCTL_INT(_kern_ipc
, OID_AUTO
, socket_debug
,
194 CTLFLAG_RW
| CTLFLAG_LOCKED
, &socket_debug
, 0, "");
196 static int socket_zone
= M_SOCKET
;
197 so_gen_t so_gencnt
; /* generation count for sockets */
199 MALLOC_DEFINE(M_SONAME
, "soname", "socket name");
200 MALLOC_DEFINE(M_PCB
, "pcb", "protocol control block");
202 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
203 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
204 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
205 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
206 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
207 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
208 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
209 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
210 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
212 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
214 int somaxconn
= SOMAXCONN
;
215 SYSCTL_INT(_kern_ipc
, KIPC_SOMAXCONN
, somaxconn
,
216 CTLFLAG_RW
| CTLFLAG_LOCKED
, &somaxconn
, 0, "");
218 /* Should we get a maximum also ??? */
219 static int sosendmaxchain
= 65536;
220 static int sosendminchain
= 16384;
221 static int sorecvmincopy
= 16384;
222 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendminchain
,
223 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sosendminchain
, 0, "");
224 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sorecvmincopy
,
225 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sorecvmincopy
, 0, "");
228 * Set to enable jumbo clusters (if available) for large writes when
229 * the socket is marked with SOF_MULTIPAGES; see below.
232 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendjcl
,
233 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sosendjcl
, 0, "");
236 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
237 * writes on the socket for all protocols on any network interfaces,
238 * depending upon sosendjcl above. Be extra careful when setting this
239 * to 1, because sending down packets that cross physical pages down to
240 * broken drivers (those that falsely assume that the physical pages
241 * are contiguous) might lead to system panics or silent data corruption.
242 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
243 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
244 * capable. Set this to 1 only for testing/debugging purposes.
246 int sosendjcl_ignore_capab
= 0;
247 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendjcl_ignore_capab
,
248 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sosendjcl_ignore_capab
, 0, "");
251 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
252 * writes on the socket for all protocols on any network interfaces.
253 * Be extra careful when setting this to 1, because sending down packets with
254 * clusters larger that 2 KB might lead to system panics or data corruption.
255 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
256 * on the outgoing interface
257 * Set this to 1 for testing/debugging purposes only.
259 int sosendbigcl_ignore_capab
= 0;
260 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sosendbigcl_ignore_capab
,
261 CTLFLAG_RW
| CTLFLAG_LOCKED
, &sosendbigcl_ignore_capab
, 0, "");
263 int sodefunctlog
= 0;
264 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sodefunctlog
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
265 &sodefunctlog
, 0, "");
267 int sothrottlelog
= 0;
268 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sothrottlelog
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
269 &sothrottlelog
, 0, "");
271 int sorestrictrecv
= 1;
272 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sorestrictrecv
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
273 &sorestrictrecv
, 0, "Enable inbound interface restrictions");
275 int sorestrictsend
= 1;
276 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sorestrictsend
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
277 &sorestrictsend
, 0, "Enable outbound interface restrictions");
279 int soreserveheadroom
= 1;
280 SYSCTL_INT(_kern_ipc
, OID_AUTO
, soreserveheadroom
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
281 &soreserveheadroom
, 0, "To allocate contiguous datagram buffers");
283 extern struct inpcbinfo tcbinfo
;
285 /* TODO: these should be in header file */
286 extern int get_inpcb_str_size(void);
287 extern int get_tcp_str_size(void);
289 static unsigned int sl_zone_size
; /* size of sockaddr_list */
290 static struct zone
*sl_zone
; /* zone for sockaddr_list */
292 static unsigned int se_zone_size
; /* size of sockaddr_entry */
293 static struct zone
*se_zone
; /* zone for sockaddr_entry */
295 vm_size_t so_cache_zone_element_size
;
297 static int sodelayed_copy(struct socket
*, struct uio
*, struct mbuf
**,
299 static void cached_sock_alloc(struct socket
**, int);
300 static void cached_sock_free(struct socket
*);
303 * Maximum of extended background idle sockets per process
304 * Set to zero to disable further setting of the option
307 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
308 #define SO_IDLE_BK_IDLE_TIME 600
309 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
311 struct soextbkidlestat soextbkidlestat
;
313 SYSCTL_UINT(_kern_ipc
, OID_AUTO
, maxextbkidleperproc
,
314 CTLFLAG_RW
| CTLFLAG_LOCKED
, &soextbkidlestat
.so_xbkidle_maxperproc
, 0,
315 "Maximum of extended background idle sockets per process");
317 SYSCTL_UINT(_kern_ipc
, OID_AUTO
, extbkidletime
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
318 &soextbkidlestat
.so_xbkidle_time
, 0,
319 "Time in seconds to keep extended background idle sockets");
321 SYSCTL_UINT(_kern_ipc
, OID_AUTO
, extbkidlercvhiwat
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
322 &soextbkidlestat
.so_xbkidle_rcvhiwat
, 0,
323 "High water mark for extended background idle sockets");
325 SYSCTL_STRUCT(_kern_ipc
, OID_AUTO
, extbkidlestat
, CTLFLAG_RD
| CTLFLAG_LOCKED
,
326 &soextbkidlestat
, soextbkidlestat
, "");
328 int so_set_extended_bk_idle(struct socket
*, int);
331 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
332 * setting the DSCP code on the packet based on the service class; see
333 * <rdar://problem/11277343> for details.
335 __private_extern__ u_int32_t sotcdb
= SOTCDB_NO_DSCP
;
336 SYSCTL_INT(_kern_ipc
, OID_AUTO
, sotcdb
, CTLFLAG_RW
| CTLFLAG_LOCKED
,
342 _CASSERT(sizeof(so_gencnt
) == sizeof(uint64_t));
343 VERIFY(IS_P2ALIGNED(&so_gencnt
, sizeof(uint32_t)));
346 _CASSERT(sizeof(struct sa_endpoints
) == sizeof(struct user64_sa_endpoints
));
347 _CASSERT(offsetof(struct sa_endpoints
, sae_srcif
) == offsetof(struct user64_sa_endpoints
, sae_srcif
));
348 _CASSERT(offsetof(struct sa_endpoints
, sae_srcaddr
) == offsetof(struct user64_sa_endpoints
, sae_srcaddr
));
349 _CASSERT(offsetof(struct sa_endpoints
, sae_srcaddrlen
) == offsetof(struct user64_sa_endpoints
, sae_srcaddrlen
));
350 _CASSERT(offsetof(struct sa_endpoints
, sae_dstaddr
) == offsetof(struct user64_sa_endpoints
, sae_dstaddr
));
351 _CASSERT(offsetof(struct sa_endpoints
, sae_dstaddrlen
) == offsetof(struct user64_sa_endpoints
, sae_dstaddrlen
));
353 _CASSERT(sizeof(struct sa_endpoints
) == sizeof(struct user32_sa_endpoints
));
354 _CASSERT(offsetof(struct sa_endpoints
, sae_srcif
) == offsetof(struct user32_sa_endpoints
, sae_srcif
));
355 _CASSERT(offsetof(struct sa_endpoints
, sae_srcaddr
) == offsetof(struct user32_sa_endpoints
, sae_srcaddr
));
356 _CASSERT(offsetof(struct sa_endpoints
, sae_srcaddrlen
) == offsetof(struct user32_sa_endpoints
, sae_srcaddrlen
));
357 _CASSERT(offsetof(struct sa_endpoints
, sae_dstaddr
) == offsetof(struct user32_sa_endpoints
, sae_dstaddr
));
358 _CASSERT(offsetof(struct sa_endpoints
, sae_dstaddrlen
) == offsetof(struct user32_sa_endpoints
, sae_dstaddrlen
));
361 if (socketinit_done
) {
362 printf("socketinit: already called...\n");
367 PE_parse_boot_argn("socket_debug", &socket_debug
,
368 sizeof (socket_debug
));
371 * allocate lock group attribute and group for socket cache mutex
373 so_cache_mtx_grp_attr
= lck_grp_attr_alloc_init();
374 so_cache_mtx_grp
= lck_grp_alloc_init("so_cache",
375 so_cache_mtx_grp_attr
);
378 * allocate the lock attribute for socket cache mutex
380 so_cache_mtx_attr
= lck_attr_alloc_init();
382 /* cached sockets mutex */
383 so_cache_mtx
= lck_mtx_alloc_init(so_cache_mtx_grp
, so_cache_mtx_attr
);
384 if (so_cache_mtx
== NULL
) {
385 panic("%s: unable to allocate so_cache_mtx\n", __func__
);
388 STAILQ_INIT(&so_cache_head
);
390 so_cache_zone_element_size
= (vm_size_t
)(sizeof (struct socket
) + 4
391 + get_inpcb_str_size() + 4 + get_tcp_str_size());
393 so_cache_zone
= zinit(so_cache_zone_element_size
,
394 (120000 * so_cache_zone_element_size
), 8192, "socache zone");
395 zone_change(so_cache_zone
, Z_CALLERACCT
, FALSE
);
396 zone_change(so_cache_zone
, Z_NOENCRYPT
, TRUE
);
398 sl_zone_size
= sizeof (struct sockaddr_list
);
399 if ((sl_zone
= zinit(sl_zone_size
, 1024 * sl_zone_size
, 1024,
400 "sockaddr_list")) == NULL
) {
401 panic("%s: unable to allocate sockaddr_list zone\n", __func__
);
404 zone_change(sl_zone
, Z_CALLERACCT
, FALSE
);
405 zone_change(sl_zone
, Z_EXPAND
, TRUE
);
407 se_zone_size
= sizeof (struct sockaddr_entry
);
408 if ((se_zone
= zinit(se_zone_size
, 1024 * se_zone_size
, 1024,
409 "sockaddr_entry")) == NULL
) {
410 panic("%s: unable to allocate sockaddr_entry zone\n", __func__
);
413 zone_change(se_zone
, Z_CALLERACCT
, FALSE
);
414 zone_change(se_zone
, Z_EXPAND
, TRUE
);
416 bzero(&soextbkidlestat
, sizeof(struct soextbkidlestat
));
417 soextbkidlestat
.so_xbkidle_maxperproc
= SO_IDLE_BK_IDLE_MAX_PER_PROC
;
418 soextbkidlestat
.so_xbkidle_time
= SO_IDLE_BK_IDLE_TIME
;
419 soextbkidlestat
.so_xbkidle_rcvhiwat
= SO_IDLE_BK_IDLE_RCV_HIWAT
;
423 socket_tclass_init();
426 #endif /* MULTIPATH */
430 cached_sock_alloc(struct socket
**so
, int waitok
)
435 lck_mtx_lock(so_cache_mtx
);
437 if (!STAILQ_EMPTY(&so_cache_head
)) {
438 VERIFY(cached_sock_count
> 0);
440 *so
= STAILQ_FIRST(&so_cache_head
);
441 STAILQ_REMOVE_HEAD(&so_cache_head
, so_cache_ent
);
442 STAILQ_NEXT((*so
), so_cache_ent
) = NULL
;
445 lck_mtx_unlock(so_cache_mtx
);
447 temp
= (*so
)->so_saved_pcb
;
448 bzero((caddr_t
)*so
, sizeof (struct socket
));
450 (*so
)->so_saved_pcb
= temp
;
453 lck_mtx_unlock(so_cache_mtx
);
456 *so
= (struct socket
*)zalloc(so_cache_zone
);
458 *so
= (struct socket
*)zalloc_noblock(so_cache_zone
);
463 bzero((caddr_t
)*so
, sizeof (struct socket
));
466 * Define offsets for extra structures into our
467 * single block of memory. Align extra structures
468 * on longword boundaries.
471 offset
= (uintptr_t)*so
;
472 offset
+= sizeof (struct socket
);
474 offset
= ALIGN(offset
);
476 (*so
)->so_saved_pcb
= (caddr_t
)offset
;
477 offset
+= get_inpcb_str_size();
479 offset
= ALIGN(offset
);
481 ((struct inpcb
*)(void *)(*so
)->so_saved_pcb
)->inp_saved_ppcb
=
485 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER
, &(*so
)->so_flags1
);
489 cached_sock_free(struct socket
*so
)
492 lck_mtx_lock(so_cache_mtx
);
494 so_cache_time
= net_uptime();
495 if (++cached_sock_count
> max_cached_sock_count
) {
497 lck_mtx_unlock(so_cache_mtx
);
498 zfree(so_cache_zone
, so
);
500 if (so_cache_hw
< cached_sock_count
)
501 so_cache_hw
= cached_sock_count
;
503 STAILQ_INSERT_TAIL(&so_cache_head
, so
, so_cache_ent
);
505 so
->cache_timestamp
= so_cache_time
;
506 lck_mtx_unlock(so_cache_mtx
);
511 so_update_last_owner_locked(struct socket
*so
, proc_t self
)
513 if (so
->last_pid
!= 0) {
515 * last_pid and last_upid should remain zero for sockets
516 * created using sock_socket. The check above achieves that
518 if (self
== PROC_NULL
)
519 self
= current_proc();
521 if (so
->last_upid
!= proc_uniqueid(self
) ||
522 so
->last_pid
!= proc_pid(self
)) {
523 so
->last_upid
= proc_uniqueid(self
);
524 so
->last_pid
= proc_pid(self
);
525 proc_getexecutableuuid(self
, so
->last_uuid
,
526 sizeof (so
->last_uuid
));
528 proc_pidoriginatoruuid(so
->so_vuuid
, sizeof(so
->so_vuuid
));
533 so_update_policy(struct socket
*so
)
535 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
)
536 (void) inp_update_policy(sotoinpcb(so
));
541 so_update_necp_policy(struct socket
*so
, struct sockaddr
*override_local_addr
,
542 struct sockaddr
*override_remote_addr
)
544 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
)
545 inp_update_necp_policy(sotoinpcb(so
), override_local_addr
,
546 override_remote_addr
, 0);
555 boolean_t rc
= FALSE
;
557 lck_mtx_lock(so_cache_mtx
);
559 so_cache_time
= net_uptime();
561 while (!STAILQ_EMPTY(&so_cache_head
)) {
562 VERIFY(cached_sock_count
> 0);
563 p
= STAILQ_FIRST(&so_cache_head
);
564 if ((so_cache_time
- p
->cache_timestamp
) <
568 STAILQ_REMOVE_HEAD(&so_cache_head
, so_cache_ent
);
571 zfree(so_cache_zone
, p
);
573 if (++n_freed
>= SO_CACHE_MAX_FREE_BATCH
) {
574 so_cache_max_freed
++;
579 /* Schedule again if there is more to cleanup */
580 if (!STAILQ_EMPTY(&so_cache_head
))
583 lck_mtx_unlock(so_cache_mtx
);
588 * Get a socket structure from our zone, and initialize it.
589 * We don't implement `waitok' yet (see comments in uipc_domain.c).
590 * Note that it would probably be better to allocate socket
591 * and PCB at the same time, but I'm not convinced that all
592 * the protocols can be easily modified to do this.
595 soalloc(int waitok
, int dom
, int type
)
599 if ((dom
== PF_INET
) && (type
== SOCK_STREAM
)) {
600 cached_sock_alloc(&so
, waitok
);
602 MALLOC_ZONE(so
, struct socket
*, sizeof (*so
), socket_zone
,
605 bzero(so
, sizeof (*so
));
608 so
->so_gencnt
= OSIncrementAtomic64((SInt64
*)&so_gencnt
);
609 so
->so_zone
= socket_zone
;
610 #if CONFIG_MACF_SOCKET
611 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
612 if (mac_socket_label_init(so
, !waitok
) != 0) {
616 #endif /* MAC_SOCKET */
623 socreate_internal(int dom
, struct socket
**aso
, int type
, int proto
,
624 struct proc
*p
, uint32_t flags
, struct proc
*ep
)
631 extern int tcpconsdebug
;
638 prp
= pffindproto(dom
, proto
, type
);
640 prp
= pffindtype(dom
, type
);
642 if (prp
== NULL
|| prp
->pr_usrreqs
->pru_attach
== NULL
) {
643 if (pffinddomain(dom
) == NULL
)
644 return (EAFNOSUPPORT
);
646 if (pffindprotonotype(dom
, proto
) != NULL
)
649 return (EPROTONOSUPPORT
);
651 if (prp
->pr_type
!= type
)
653 so
= soalloc(1, dom
, type
);
657 if (flags
& SOCF_ASYNC
)
658 so
->so_state
|= SS_NBIO
;
660 if (flags
& SOCF_MP_SUBFLOW
) {
662 * A multipath subflow socket is used internally in the kernel,
663 * therefore it does not have a file desciptor associated by
666 so
->so_state
|= SS_NOFDREF
;
667 so
->so_flags
|= SOF_MP_SUBFLOW
;
669 #endif /* MULTIPATH */
671 TAILQ_INIT(&so
->so_incomp
);
672 TAILQ_INIT(&so
->so_comp
);
674 so
->last_upid
= proc_uniqueid(p
);
675 so
->last_pid
= proc_pid(p
);
676 proc_getexecutableuuid(p
, so
->last_uuid
, sizeof (so
->last_uuid
));
677 proc_pidoriginatoruuid(so
->so_vuuid
, sizeof(so
->so_vuuid
));
679 if (ep
!= PROC_NULL
&& ep
!= p
) {
680 so
->e_upid
= proc_uniqueid(ep
);
681 so
->e_pid
= proc_pid(ep
);
682 proc_getexecutableuuid(ep
, so
->e_uuid
, sizeof (so
->e_uuid
));
683 so
->so_flags
|= SOF_DELEGATED
;
686 so
->so_cred
= kauth_cred_proc_ref(p
);
687 if (!suser(kauth_cred_get(), NULL
))
688 so
->so_state
|= SS_PRIV
;
691 so
->so_rcv
.sb_flags
|= SB_RECV
;
692 so
->so_rcv
.sb_so
= so
->so_snd
.sb_so
= so
;
693 so
->next_lock_lr
= 0;
694 so
->next_unlock_lr
= 0;
696 #if CONFIG_MACF_SOCKET
697 mac_socket_label_associate(kauth_cred_get(), so
);
698 #endif /* MAC_SOCKET */
701 * Attachment will create the per pcb lock if necessary and
702 * increase refcount for creation, make sure it's done before
703 * socket is inserted in lists.
707 error
= (*prp
->pr_usrreqs
->pru_attach
)(so
, proto
, p
);
711 * If so_pcb is not zero, the socket will be leaked,
712 * so protocol attachment handler must be coded carefuly
714 so
->so_state
|= SS_NOFDREF
;
716 sofreelastref(so
, 1); /* will deallocate the socket */
720 atomic_add_32(&prp
->pr_domain
->dom_refs
, 1);
721 TAILQ_INIT(&so
->so_evlist
);
723 /* Attach socket filters for this protocol */
726 if (tcpconsdebug
== 2)
727 so
->so_options
|= SO_DEBUG
;
729 so_set_default_traffic_class(so
);
732 * If this thread or task is marked to create backgrounded sockets,
733 * mark the socket as background.
735 if (proc_get_effective_thread_policy(current_thread(),
736 TASK_POLICY_NEW_SOCKETS_BG
)) {
737 socket_set_traffic_mgt_flags(so
, TRAFFIC_MGT_SO_BACKGROUND
);
738 so
->so_background_thread
= current_thread();
743 * Don't mark Unix domain, system or multipath sockets as
744 * eligible for defunct by default.
749 so
->so_flags
|= SOF_NODEFUNCT
;
756 * Entitlements can't be checked at socket creation time except if the
757 * application requested a feature guarded by a privilege (c.f., socket
759 * The priv(9) and the Sandboxing APIs are designed with the idea that
760 * a privilege check should only be triggered by a userland request.
761 * A privilege check at socket creation time is time consuming and
762 * could trigger many authorisation error messages from the security
777 * <pru_attach>:ENOBUFS[AF_UNIX]
778 * <pru_attach>:ENOBUFS[TCP]
779 * <pru_attach>:ENOMEM[TCP]
780 * <pru_attach>:??? [other protocol families, IPSEC]
783 socreate(int dom
, struct socket
**aso
, int type
, int proto
)
785 return (socreate_internal(dom
, aso
, type
, proto
, current_proc(), 0,
790 socreate_delegate(int dom
, struct socket
**aso
, int type
, int proto
, pid_t epid
)
793 struct proc
*ep
= PROC_NULL
;
795 if ((proc_selfpid() != epid
) && ((ep
= proc_find(epid
)) == PROC_NULL
)) {
800 error
= socreate_internal(dom
, aso
, type
, proto
, current_proc(), 0, ep
);
803 * It might not be wise to hold the proc reference when calling
804 * socreate_internal since it calls soalloc with M_WAITOK
815 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
816 * <pru_bind>:EAFNOSUPPORT Address family not supported
817 * <pru_bind>:EADDRNOTAVAIL Address not available.
818 * <pru_bind>:EINVAL Invalid argument
819 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
820 * <pru_bind>:EACCES Permission denied
821 * <pru_bind>:EADDRINUSE Address in use
822 * <pru_bind>:EAGAIN Resource unavailable, try again
823 * <pru_bind>:EPERM Operation not permitted
827 * Notes: It's not possible to fully enumerate the return codes above,
828 * since socket filter authors and protocol family authors may
829 * not choose to limit their error returns to those listed, even
830 * though this may result in some software operating incorrectly.
832 * The error codes which are enumerated above are those known to
833 * be returned by the tcp_usr_bind function supplied.
836 sobindlock(struct socket
*so
, struct sockaddr
*nam
, int dolock
)
838 struct proc
*p
= current_proc();
843 VERIFY(so
->so_usecount
> 1);
845 so_update_last_owner_locked(so
, p
);
846 so_update_policy(so
);
849 so_update_necp_policy(so
, nam
, NULL
);
853 * If this is a bind request on a socket that has been marked
854 * as inactive, reject it now before we go any further.
856 if (so
->so_flags
& SOF_DEFUNCT
) {
858 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
859 __func__
, proc_pid(p
), (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
860 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
865 error
= sflt_bind(so
, nam
);
868 error
= (*so
->so_proto
->pr_usrreqs
->pru_bind
)(so
, nam
, p
);
871 socket_unlock(so
, 1);
873 if (error
== EJUSTRETURN
)
880 sodealloc(struct socket
*so
)
882 kauth_cred_unref(&so
->so_cred
);
884 /* Remove any filters */
888 cfil_sock_detach(so
);
889 #endif /* CONTENT_FILTER */
891 /* Delete the state allocated for msg queues on a socket */
892 if (so
->so_flags
& SOF_ENABLE_MSGS
) {
893 FREE(so
->so_msg_state
, M_TEMP
);
894 so
->so_msg_state
= NULL
;
896 VERIFY(so
->so_msg_state
== NULL
);
898 so
->so_gencnt
= OSIncrementAtomic64((SInt64
*)&so_gencnt
);
900 #if CONFIG_MACF_SOCKET
901 mac_socket_label_destroy(so
);
902 #endif /* MAC_SOCKET */
904 if (so
->so_flags1
& SOF1_CACHED_IN_SOCK_LAYER
) {
905 cached_sock_free(so
);
907 FREE_ZONE(so
, sizeof (*so
), so
->so_zone
);
915 * <pru_listen>:EINVAL[AF_UNIX]
916 * <pru_listen>:EINVAL[TCP]
917 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
918 * <pru_listen>:EINVAL[TCP] Invalid argument
919 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
920 * <pru_listen>:EACCES[TCP] Permission denied
921 * <pru_listen>:EADDRINUSE[TCP] Address in use
922 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
923 * <pru_listen>:EPERM[TCP] Operation not permitted
926 * Notes: Other <pru_listen> returns depend on the protocol family; all
927 * <sf_listen> returns depend on what the filter author causes
928 * their filter to return.
931 solisten(struct socket
*so
, int backlog
)
933 struct proc
*p
= current_proc();
938 so_update_last_owner_locked(so
, p
);
939 so_update_policy(so
);
942 so_update_necp_policy(so
, NULL
, NULL
);
945 if (so
->so_proto
== NULL
) {
949 if ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) == 0) {
955 * If the listen request is made on a socket that is not fully
956 * disconnected, or on a socket that has been marked as inactive,
957 * reject the request now.
960 (SS_ISCONNECTED
|SS_ISCONNECTING
|SS_ISDISCONNECTING
)) ||
961 (so
->so_flags
& SOF_DEFUNCT
)) {
963 if (so
->so_flags
& SOF_DEFUNCT
) {
964 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
965 "(%d)\n", __func__
, proc_pid(p
),
966 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
967 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
972 if ((so
->so_restrictions
& SO_RESTRICT_DENY_IN
) != 0) {
977 error
= sflt_listen(so
);
979 error
= (*so
->so_proto
->pr_usrreqs
->pru_listen
)(so
, p
);
982 if (error
== EJUSTRETURN
)
987 if (TAILQ_EMPTY(&so
->so_comp
))
988 so
->so_options
|= SO_ACCEPTCONN
;
990 * POSIX: The implementation may have an upper limit on the length of
991 * the listen queue-either global or per accepting socket. If backlog
992 * exceeds this limit, the length of the listen queue is set to the
995 * If listen() is called with a backlog argument value that is less
996 * than 0, the function behaves as if it had been called with a backlog
997 * argument value of 0.
999 * A backlog argument of 0 may allow the socket to accept connections,
1000 * in which case the length of the listen queue may be set to an
1001 * implementation-defined minimum value.
1003 if (backlog
<= 0 || backlog
> somaxconn
)
1004 backlog
= somaxconn
;
1006 so
->so_qlimit
= backlog
;
1008 socket_unlock(so
, 1);
1013 sofreelastref(struct socket
*so
, int dealloc
)
1015 struct socket
*head
= so
->so_head
;
1017 /* Assume socket is locked */
1019 if (!(so
->so_flags
& SOF_PCBCLEARING
) || !(so
->so_state
& SS_NOFDREF
)) {
1020 selthreadclear(&so
->so_snd
.sb_sel
);
1021 selthreadclear(&so
->so_rcv
.sb_sel
);
1022 so
->so_rcv
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
1023 so
->so_snd
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
1024 so
->so_event
= sonullevent
;
1028 socket_lock(head
, 1);
1029 if (so
->so_state
& SS_INCOMP
) {
1030 TAILQ_REMOVE(&head
->so_incomp
, so
, so_list
);
1032 } else if (so
->so_state
& SS_COMP
) {
1034 * We must not decommission a socket that's
1035 * on the accept(2) queue. If we do, then
1036 * accept(2) may hang after select(2) indicated
1037 * that the listening socket was ready.
1039 selthreadclear(&so
->so_snd
.sb_sel
);
1040 selthreadclear(&so
->so_rcv
.sb_sel
);
1041 so
->so_rcv
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
1042 so
->so_snd
.sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
1043 so
->so_event
= sonullevent
;
1044 socket_unlock(head
, 1);
1047 panic("sofree: not queued");
1050 so
->so_state
&= ~SS_INCOMP
;
1052 socket_unlock(head
, 1);
1058 if (so
->so_flags
& SOF_FLOW_DIVERT
) {
1059 flow_divert_detach(so
);
1061 #endif /* FLOW_DIVERT */
1063 /* 3932268: disable upcall */
1064 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
1065 so
->so_snd
.sb_flags
&= ~SB_UPCALL
;
1066 so
->so_event
= sonullevent
;
1073 soclose_wait_locked(struct socket
*so
)
1075 lck_mtx_t
*mutex_held
;
1077 if (so
->so_proto
->pr_getlock
!= NULL
)
1078 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
1080 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1081 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1084 * Double check here and return if there's no outstanding upcall;
1085 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1087 if (!so
->so_upcallusecount
|| !(so
->so_flags
& SOF_UPCALLCLOSEWAIT
))
1089 so
->so_rcv
.sb_flags
&= ~SB_UPCALL
;
1090 so
->so_snd
.sb_flags
&= ~SB_UPCALL
;
1091 so
->so_flags
|= SOF_CLOSEWAIT
;
1092 (void) msleep((caddr_t
)&so
->so_upcallusecount
, mutex_held
, (PZERO
- 1),
1093 "soclose_wait_locked", NULL
);
1094 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1095 so
->so_flags
&= ~SOF_CLOSEWAIT
;
1099 * Close a socket on last file table reference removal.
1100 * Initiate disconnect if connected.
1101 * Free socket when disconnect complete.
1104 soclose_locked(struct socket
*so
)
1107 lck_mtx_t
*mutex_held
;
1110 if (so
->so_usecount
== 0) {
1111 panic("soclose: so=%p refcount=0\n", so
);
1115 sflt_notify(so
, sock_evt_closing
, NULL
);
1117 if (so
->so_upcallusecount
)
1118 soclose_wait_locked(so
);
1122 * We have to wait until the content filters are done
1124 if ((so
->so_flags
& SOF_CONTENT_FILTER
) != 0) {
1125 cfil_sock_close_wait(so
);
1126 cfil_sock_is_closed(so
);
1127 cfil_sock_detach(so
);
1129 #endif /* CONTENT_FILTER */
1131 if (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_INPROG
) {
1132 soresume(current_proc(), so
, 1);
1133 so
->so_flags1
&= ~SOF1_EXTEND_BK_IDLE_WANTED
;
1136 if ((so
->so_options
& SO_ACCEPTCONN
)) {
1137 struct socket
*sp
, *sonext
;
1141 * We do not want new connection to be added
1142 * to the connection queues
1144 so
->so_options
&= ~SO_ACCEPTCONN
;
1146 for (sp
= TAILQ_FIRST(&so
->so_incomp
);
1147 sp
!= NULL
; sp
= sonext
) {
1148 sonext
= TAILQ_NEXT(sp
, so_list
);
1152 * skip sockets thrown away by tcpdropdropblreq
1153 * they will get cleanup by the garbage collection.
1154 * otherwise, remove the incomp socket from the queue
1155 * and let soabort trigger the appropriate cleanup.
1157 if (sp
->so_flags
& SOF_OVERFLOW
)
1160 if (so
->so_proto
->pr_getlock
!= NULL
) {
1162 * Lock ordering for consistency with the
1163 * rest of the stack, we lock the socket
1164 * first and then grabb the head.
1166 socket_unlock(so
, 0);
1172 TAILQ_REMOVE(&so
->so_incomp
, sp
, so_list
);
1175 if (sp
->so_state
& SS_INCOMP
) {
1176 sp
->so_state
&= ~SS_INCOMP
;
1183 socket_unlock(sp
, 1);
1186 while ((sp
= TAILQ_FIRST(&so
->so_comp
)) != NULL
) {
1187 /* Dequeue from so_comp since sofree() won't do it */
1188 TAILQ_REMOVE(&so
->so_comp
, sp
, so_list
);
1191 if (so
->so_proto
->pr_getlock
!= NULL
) {
1192 socket_unlock(so
, 0);
1196 if (sp
->so_state
& SS_COMP
) {
1197 sp
->so_state
&= ~SS_COMP
;
1203 if (so
->so_proto
->pr_getlock
!= NULL
) {
1204 socket_unlock(sp
, 1);
1209 if (so
->so_pcb
== NULL
) {
1210 /* 3915887: mark the socket as ready for dealloc */
1211 so
->so_flags
|= SOF_PCBCLEARING
;
1214 if (so
->so_state
& SS_ISCONNECTED
) {
1215 if ((so
->so_state
& SS_ISDISCONNECTING
) == 0) {
1216 error
= sodisconnectlocked(so
);
1220 if (so
->so_options
& SO_LINGER
) {
1221 if ((so
->so_state
& SS_ISDISCONNECTING
) &&
1222 (so
->so_state
& SS_NBIO
))
1224 if (so
->so_proto
->pr_getlock
!= NULL
)
1225 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
1227 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1228 while (so
->so_state
& SS_ISCONNECTED
) {
1229 ts
.tv_sec
= (so
->so_linger
/100);
1230 ts
.tv_nsec
= (so
->so_linger
% 100) *
1231 NSEC_PER_USEC
* 1000 * 10;
1232 error
= msleep((caddr_t
)&so
->so_timeo
,
1233 mutex_held
, PSOCK
| PCATCH
, "soclose", &ts
);
1236 * It's OK when the time fires,
1237 * don't report an error
1239 if (error
== EWOULDBLOCK
)
1247 if (so
->so_usecount
== 0) {
1248 panic("soclose: usecount is zero so=%p\n", so
);
1251 if (so
->so_pcb
!= NULL
&& !(so
->so_flags
& SOF_PCBCLEARING
)) {
1252 int error2
= (*so
->so_proto
->pr_usrreqs
->pru_detach
)(so
);
1256 if (so
->so_usecount
<= 0) {
1257 panic("soclose: usecount is zero so=%p\n", so
);
1261 if (so
->so_pcb
!= NULL
&& !(so
->so_flags
& SOF_MP_SUBFLOW
) &&
1262 (so
->so_state
& SS_NOFDREF
)) {
1263 panic("soclose: NOFDREF");
1266 so
->so_state
|= SS_NOFDREF
;
1268 if (so
->so_flags
& SOF_MP_SUBFLOW
)
1269 so
->so_flags
&= ~SOF_MP_SUBFLOW
;
1271 if ((so
->so_flags
& SOF_KNOTE
) != 0)
1272 KNOTE(&so
->so_klist
, SO_FILT_HINT_LOCKED
);
1274 atomic_add_32(&so
->so_proto
->pr_domain
->dom_refs
, -1);
1283 soclose(struct socket
*so
)
1288 if (so
->so_retaincnt
== 0) {
1289 error
= soclose_locked(so
);
1292 * if the FD is going away, but socket is
1293 * retained in kernel remove its reference
1296 if (so
->so_usecount
< 2)
1297 panic("soclose: retaincnt non null and so=%p "
1298 "usecount=%d\n", so
, so
->so_usecount
);
1300 socket_unlock(so
, 1);
1305 * Must be called at splnet...
1307 /* Should already be locked */
1309 soabort(struct socket
*so
)
1313 #ifdef MORE_LOCKING_DEBUG
1314 lck_mtx_t
*mutex_held
;
1316 if (so
->so_proto
->pr_getlock
!= NULL
)
1317 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
1319 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
1320 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1323 if ((so
->so_flags
& SOF_ABORTED
) == 0) {
1324 so
->so_flags
|= SOF_ABORTED
;
1325 error
= (*so
->so_proto
->pr_usrreqs
->pru_abort
)(so
);
1335 soacceptlock(struct socket
*so
, struct sockaddr
**nam
, int dolock
)
1342 so_update_last_owner_locked(so
, PROC_NULL
);
1343 so_update_policy(so
);
1345 so_update_necp_policy(so
, NULL
, NULL
);
1348 if ((so
->so_state
& SS_NOFDREF
) == 0)
1349 panic("soaccept: !NOFDREF");
1350 so
->so_state
&= ~SS_NOFDREF
;
1351 error
= (*so
->so_proto
->pr_usrreqs
->pru_accept
)(so
, nam
);
1354 socket_unlock(so
, 1);
1359 soaccept(struct socket
*so
, struct sockaddr
**nam
)
1361 return (soacceptlock(so
, nam
, 1));
1365 soacceptfilter(struct socket
*so
)
1367 struct sockaddr
*local
= NULL
, *remote
= NULL
;
1369 struct socket
*head
= so
->so_head
;
1372 * Hold the lock even if this socket has not been made visible
1373 * to the filter(s). For sockets with global locks, this protects
1374 * against the head or peer going away
1377 if (sogetaddr_locked(so
, &remote
, 1) != 0 ||
1378 sogetaddr_locked(so
, &local
, 0) != 0) {
1379 so
->so_state
&= ~(SS_NOFDREF
| SS_COMP
);
1381 socket_unlock(so
, 1);
1383 /* Out of resources; try it again next time */
1384 error
= ECONNABORTED
;
1388 error
= sflt_accept(head
, so
, local
, remote
);
1391 * If we get EJUSTRETURN from one of the filters, mark this socket
1392 * as inactive and return it anyway. This newly accepted socket
1393 * will be disconnected later before we hand it off to the caller.
1395 if (error
== EJUSTRETURN
) {
1397 (void) sosetdefunct(current_proc(), so
,
1398 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL
, FALSE
);
1403 * This may seem like a duplication to the above error
1404 * handling part when we return ECONNABORTED, except
1405 * the following is done while holding the lock since
1406 * the socket has been exposed to the filter(s) earlier.
1408 so
->so_state
&= ~(SS_NOFDREF
| SS_COMP
);
1410 socket_unlock(so
, 1);
1412 /* Propagate socket filter's error code to the caller */
1414 socket_unlock(so
, 1);
1417 /* Callee checks for NULL pointer */
1418 sock_freeaddr(remote
);
1419 sock_freeaddr(local
);
1424 * Returns: 0 Success
1425 * EOPNOTSUPP Operation not supported on socket
1426 * EISCONN Socket is connected
1427 * <pru_connect>:EADDRNOTAVAIL Address not available.
1428 * <pru_connect>:EINVAL Invalid argument
1429 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1430 * <pru_connect>:EACCES Permission denied
1431 * <pru_connect>:EADDRINUSE Address in use
1432 * <pru_connect>:EAGAIN Resource unavailable, try again
1433 * <pru_connect>:EPERM Operation not permitted
1434 * <sf_connect_out>:??? [anything a filter writer might set]
1437 soconnectlock(struct socket
*so
, struct sockaddr
*nam
, int dolock
)
1440 struct proc
*p
= current_proc();
1445 so_update_last_owner_locked(so
, p
);
1446 so_update_policy(so
);
1449 so_update_necp_policy(so
, NULL
, nam
);
1453 * If this is a listening socket or if this is a previously-accepted
1454 * socket that has been marked as inactive, reject the connect request.
1456 if ((so
->so_options
& SO_ACCEPTCONN
) || (so
->so_flags
& SOF_DEFUNCT
)) {
1458 if (so
->so_flags
& SOF_DEFUNCT
) {
1459 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1460 "(%d)\n", __func__
, proc_pid(p
),
1461 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
1462 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
1465 socket_unlock(so
, 1);
1469 if ((so
->so_restrictions
& SO_RESTRICT_DENY_OUT
) != 0) {
1471 socket_unlock(so
, 1);
1476 * If protocol is connection-based, can only connect once.
1477 * Otherwise, if connected, try to disconnect first.
1478 * This allows user to disconnect by connecting to, e.g.,
1481 if (so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
) &&
1482 ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
1483 (error
= sodisconnectlocked(so
)))) {
1487 * Run connect filter before calling protocol:
1488 * - non-blocking connect returns before completion;
1490 error
= sflt_connectout(so
, nam
);
1492 if (error
== EJUSTRETURN
)
1495 error
= (*so
->so_proto
->pr_usrreqs
->pru_connect
)
1500 socket_unlock(so
, 1);
1505 soconnect(struct socket
*so
, struct sockaddr
*nam
)
1507 return (soconnectlock(so
, nam
, 1));
1511 * Returns: 0 Success
1512 * <pru_connect2>:EINVAL[AF_UNIX]
1513 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1514 * <pru_connect2>:??? [other protocol families]
1516 * Notes: <pru_connect2> is not supported by [TCP].
1519 soconnect2(struct socket
*so1
, struct socket
*so2
)
1523 socket_lock(so1
, 1);
1524 if (so2
->so_proto
->pr_lock
)
1525 socket_lock(so2
, 1);
1527 error
= (*so1
->so_proto
->pr_usrreqs
->pru_connect2
)(so1
, so2
);
1529 socket_unlock(so1
, 1);
1530 if (so2
->so_proto
->pr_lock
)
1531 socket_unlock(so2
, 1);
1536 soconnectxlocked(struct socket
*so
, struct sockaddr_list
**src_sl
,
1537 struct sockaddr_list
**dst_sl
, struct proc
*p
, uint32_t ifscope
,
1538 sae_associd_t aid
, sae_connid_t
*pcid
, uint32_t flags
, void *arg
,
1539 uint32_t arglen
, uio_t auio
, user_ssize_t
*bytes_written
)
1543 so_update_last_owner_locked(so
, p
);
1544 so_update_policy(so
);
1547 * If this is a listening socket or if this is a previously-accepted
1548 * socket that has been marked as inactive, reject the connect request.
1550 if ((so
->so_options
& SO_ACCEPTCONN
) || (so
->so_flags
& SOF_DEFUNCT
)) {
1552 if (so
->so_flags
& SOF_DEFUNCT
) {
1553 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1554 "(%d)\n", __func__
, proc_pid(p
),
1555 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
1556 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
1561 if ((so
->so_restrictions
& SO_RESTRICT_DENY_OUT
) != 0)
1565 * If protocol is connection-based, can only connect once
1566 * unless PR_MULTICONN is set. Otherwise, if connected,
1567 * try to disconnect first. This allows user to disconnect
1568 * by connecting to, e.g., a null address.
1570 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) &&
1571 !(so
->so_proto
->pr_flags
& PR_MULTICONN
) &&
1572 ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ||
1573 (error
= sodisconnectlocked(so
)) != 0)) {
1577 * Run connect filter before calling protocol:
1578 * - non-blocking connect returns before completion;
1580 error
= sflt_connectxout(so
, dst_sl
);
1582 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1583 so
->so_flags1
&= ~SOF1_PRECONNECT_DATA
;
1584 if (error
== EJUSTRETURN
)
1587 error
= (*so
->so_proto
->pr_usrreqs
->pru_connectx
)
1588 (so
, src_sl
, dst_sl
, p
, ifscope
, aid
, pcid
,
1589 flags
, arg
, arglen
, auio
, bytes_written
);
1597 sodisconnectlocked(struct socket
*so
)
1601 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
1605 if (so
->so_state
& SS_ISDISCONNECTING
) {
1610 error
= (*so
->so_proto
->pr_usrreqs
->pru_disconnect
)(so
);
1612 sflt_notify(so
, sock_evt_disconnected
, NULL
);
1618 /* Locking version */
1620 sodisconnect(struct socket
*so
)
1625 error
= sodisconnectlocked(so
);
1626 socket_unlock(so
, 1);
1631 sodisconnectxlocked(struct socket
*so
, sae_associd_t aid
, sae_connid_t cid
)
1636 * Call the protocol disconnectx handler; let it handle all
1637 * matters related to the connection state of this session.
1639 error
= (*so
->so_proto
->pr_usrreqs
->pru_disconnectx
)(so
, aid
, cid
);
1642 * The event applies only for the session, not for
1643 * the disconnection of individual subflows.
1645 if (so
->so_state
& (SS_ISDISCONNECTING
|SS_ISDISCONNECTED
))
1646 sflt_notify(so
, sock_evt_disconnected
, NULL
);
1652 sodisconnectx(struct socket
*so
, sae_associd_t aid
, sae_connid_t cid
)
1657 error
= sodisconnectxlocked(so
, aid
, cid
);
1658 socket_unlock(so
, 1);
1663 sopeelofflocked(struct socket
*so
, sae_associd_t aid
, struct socket
**psop
)
1665 return ((*so
->so_proto
->pr_usrreqs
->pru_peeloff
)(so
, aid
, psop
));
1668 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1671 * sosendcheck will lock the socket buffer if it isn't locked and
1672 * verify that there is space for the data being inserted.
1674 * Returns: 0 Success
1676 * sblock:EWOULDBLOCK
1683 sosendcheck(struct socket
*so
, struct sockaddr
*addr
, user_ssize_t resid
,
1684 int32_t clen
, int32_t atomic
, int flags
, int *sblocked
,
1685 struct mbuf
*control
)
1692 if (*sblocked
== 0) {
1693 if ((so
->so_snd
.sb_flags
& SB_LOCK
) != 0 &&
1694 so
->so_send_filt_thread
!= 0 &&
1695 so
->so_send_filt_thread
== current_thread()) {
1697 * We're being called recursively from a filter,
1698 * allow this to continue. Radar 4150520.
1699 * Don't set sblocked because we don't want
1700 * to perform an unlock later.
1704 error
= sblock(&so
->so_snd
, SBLOCKWAIT(flags
));
1706 if (so
->so_flags
& SOF_DEFUNCT
)
1715 * If a send attempt is made on a socket that has been marked
1716 * as inactive (disconnected), reject the request.
1718 if (so
->so_flags
& SOF_DEFUNCT
) {
1721 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1722 __func__
, proc_selfpid(),
1723 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
1724 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
1728 if (so
->so_state
& SS_CANTSENDMORE
) {
1731 * Can re-inject data of half closed connections
1733 if ((so
->so_state
& SS_ISDISCONNECTED
) == 0 &&
1734 so
->so_snd
.sb_cfil_thread
== current_thread() &&
1735 cfil_sock_data_pending(&so
->so_snd
) != 0)
1737 "so %llx ignore SS_CANTSENDMORE",
1738 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
));
1740 #endif /* CONTENT_FILTER */
1744 error
= so
->so_error
;
1749 if ((so
->so_state
& SS_ISCONNECTED
) == 0) {
1750 if ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) != 0) {
1751 if (((so
->so_state
& SS_ISCONFIRMING
) == 0) &&
1752 (resid
!= 0 || clen
== 0) &&
1753 !(so
->so_flags1
& SOF1_PRECONNECT_DATA
)) {
1756 * MPTCP Fast Join sends data before the
1757 * socket is truly connected.
1759 if ((so
->so_flags
& (SOF_MP_SUBFLOW
|
1760 SOF_MPTCP_FASTJOIN
)) !=
1761 (SOF_MP_SUBFLOW
| SOF_MPTCP_FASTJOIN
))
1765 } else if (addr
== 0 && !(flags
&MSG_HOLD
)) {
1766 return ((so
->so_proto
->pr_flags
& PR_CONNREQUIRED
) ?
1767 ENOTCONN
: EDESTADDRREQ
);
1771 if (so
->so_flags
& SOF_ENABLE_MSGS
)
1772 space
= msgq_sbspace(so
, control
);
1774 space
= sbspace(&so
->so_snd
);
1776 if (flags
& MSG_OOB
)
1778 if ((atomic
&& resid
> so
->so_snd
.sb_hiwat
) ||
1779 clen
> so
->so_snd
.sb_hiwat
)
1782 if ((space
< resid
+ clen
&&
1783 (atomic
|| (space
< (int32_t)so
->so_snd
.sb_lowat
) ||
1785 (so
->so_type
== SOCK_STREAM
&& so_wait_for_if_feedback(so
))) {
1787 * don't block the connectx call when there's more data
1788 * than can be copied.
1790 if (so
->so_flags1
& SOF1_PRECONNECT_DATA
) {
1792 return (EWOULDBLOCK
);
1794 if (space
< (int32_t)so
->so_snd
.sb_lowat
) {
1798 if ((so
->so_state
& SS_NBIO
) || (flags
& MSG_NBIO
) ||
1800 return (EWOULDBLOCK
);
1802 sbunlock(&so
->so_snd
, TRUE
); /* keep socket locked */
1804 error
= sbwait(&so
->so_snd
);
1806 if (so
->so_flags
& SOF_DEFUNCT
)
1817 * If send must go all at once and message is larger than
1818 * send buffering, then hard error.
1819 * Lock against other senders.
1820 * If must go all at once and not enough room now, then
1821 * inform user that this would block and do nothing.
1822 * Otherwise, if nonblocking, send as much as possible.
1823 * The data to be sent is described by "uio" if nonzero,
1824 * otherwise by the mbuf chain "top" (which must be null
1825 * if uio is not). Data provided in mbuf chain must be small
1826 * enough to send all at once.
1828 * Returns nonzero on error, timeout or signal; callers
1829 * must check for short counts if EINTR/ERESTART are returned.
1830 * Data and control buffers are freed on return.
1832 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1833 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1834 * point at the mbuf chain being constructed and go from there.
1836 * Returns: 0 Success
1842 * sosendcheck:EWOULDBLOCK
1846 * sosendcheck:??? [value from so_error]
1847 * <pru_send>:ECONNRESET[TCP]
1848 * <pru_send>:EINVAL[TCP]
1849 * <pru_send>:ENOBUFS[TCP]
1850 * <pru_send>:EADDRINUSE[TCP]
1851 * <pru_send>:EADDRNOTAVAIL[TCP]
1852 * <pru_send>:EAFNOSUPPORT[TCP]
1853 * <pru_send>:EACCES[TCP]
1854 * <pru_send>:EAGAIN[TCP]
1855 * <pru_send>:EPERM[TCP]
1856 * <pru_send>:EMSGSIZE[TCP]
1857 * <pru_send>:EHOSTUNREACH[TCP]
1858 * <pru_send>:ENETUNREACH[TCP]
1859 * <pru_send>:ENETDOWN[TCP]
1860 * <pru_send>:ENOMEM[TCP]
1861 * <pru_send>:ENOBUFS[TCP]
1862 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1863 * <pru_send>:EINVAL[AF_UNIX]
1864 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1865 * <pru_send>:EPIPE[AF_UNIX]
1866 * <pru_send>:ENOTCONN[AF_UNIX]
1867 * <pru_send>:EISCONN[AF_UNIX]
1868 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1869 * <sf_data_out>:??? [whatever a filter author chooses]
1871 * Notes: Other <pru_send> returns depend on the protocol family; all
1872 * <sf_data_out> returns depend on what the filter author causes
1873 * their filter to return.
1876 sosend(struct socket
*so
, struct sockaddr
*addr
, struct uio
*uio
,
1877 struct mbuf
*top
, struct mbuf
*control
, int flags
)
1880 struct mbuf
*m
, *freelist
= NULL
;
1881 user_ssize_t space
, len
, resid
, orig_resid
;
1882 int clen
= 0, error
, dontroute
, mlen
, sendflags
;
1883 int atomic
= sosendallatonce(so
) || top
;
1885 struct proc
*p
= current_proc();
1886 struct mbuf
*control_copy
= NULL
;
1887 uint16_t headroom
= 0;
1888 boolean_t en_tracing
= FALSE
;
1891 resid
= uio_resid(uio
);
1893 resid
= top
->m_pkthdr
.len
;
1895 KERNEL_DEBUG((DBG_FNC_SOSEND
| DBG_FUNC_START
), so
, resid
,
1896 so
->so_snd
.sb_cc
, so
->so_snd
.sb_lowat
, so
->so_snd
.sb_hiwat
);
1901 * trace if tracing & network (vs. unix) sockets & and
1904 if (ENTR_SHOULDTRACE
&&
1905 (SOCK_CHECK_DOM(so
, AF_INET
) || SOCK_CHECK_DOM(so
, AF_INET6
))) {
1906 struct inpcb
*inp
= sotoinpcb(so
);
1907 if (inp
->inp_last_outifp
!= NULL
&&
1908 !(inp
->inp_last_outifp
->if_flags
& IFF_LOOPBACK
)) {
1910 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite
, DBG_FUNC_START
,
1911 VM_KERNEL_ADDRPERM(so
),
1912 ((so
->so_state
& SS_NBIO
) ? kEnTrFlagNonBlocking
: 0),
1919 * Re-injection should not affect process accounting
1921 if ((flags
& MSG_SKIPCFIL
) == 0) {
1922 so_update_last_owner_locked(so
, p
);
1923 so_update_policy(so
);
1926 so_update_necp_policy(so
, NULL
, addr
);
1930 if (so
->so_type
!= SOCK_STREAM
&& (flags
& MSG_OOB
) != 0) {
1932 socket_unlock(so
, 1);
1937 * In theory resid should be unsigned.
1938 * However, space must be signed, as it might be less than 0
1939 * if we over-committed, and we must use a signed comparison
1940 * of space and resid. On the other hand, a negative resid
1941 * causes us to loop sending 0-length segments to the protocol.
1943 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1944 * But it will be used by sockets doing message delivery.
1946 * Note: We limit resid to be a positive int value as we use
1947 * imin() to set bytes_to_copy -- radr://14558484
1949 if (resid
< 0 || resid
> INT_MAX
|| (so
->so_type
== SOCK_STREAM
&&
1950 !(so
->so_flags
& SOF_ENABLE_MSGS
) && (flags
& MSG_EOR
))) {
1952 socket_unlock(so
, 1);
1956 dontroute
= (flags
& MSG_DONTROUTE
) &&
1957 (so
->so_options
& SO_DONTROUTE
) == 0 &&
1958 (so
->so_proto
->pr_flags
& PR_ATOMIC
);
1959 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgsnd
);
1961 if (control
!= NULL
)
1962 clen
= control
->m_len
;
1964 if (soreserveheadroom
!= 0)
1965 headroom
= so
->so_pktheadroom
;
1968 error
= sosendcheck(so
, addr
, resid
, clen
, atomic
, flags
,
1969 &sblocked
, control
);
1974 if (so
->so_flags
& SOF_ENABLE_MSGS
)
1975 space
= msgq_sbspace(so
, control
);
1977 space
= sbspace(&so
->so_snd
) - clen
;
1978 space
+= ((flags
& MSG_OOB
) ? 1024 : 0);
1983 * Data is prepackaged in "top".
1986 if (flags
& MSG_EOR
)
1987 top
->m_flags
|= M_EOR
;
1995 bytes_to_copy
= imin(resid
, space
);
1997 bytes_to_alloc
= bytes_to_copy
;
1999 bytes_to_alloc
+= headroom
;
2001 if (sosendminchain
> 0)
2004 chainlength
= sosendmaxchain
;
2007 * Use big 4 KB cluster when the outgoing interface
2008 * does not prefer 2 KB clusters
2010 bigcl
= !(so
->so_flags1
& SOF1_IF_2KCL
) ||
2011 sosendbigcl_ignore_capab
;
2014 * Attempt to use larger than system page-size
2015 * clusters for large writes only if there is
2016 * a jumbo cluster pool and if the socket is
2017 * marked accordingly.
2019 jumbocl
= sosendjcl
&& njcl
> 0 &&
2020 ((so
->so_flags
& SOF_MULTIPAGES
) ||
2021 sosendjcl_ignore_capab
) &&
2024 socket_unlock(so
, 0);
2028 int hdrs_needed
= (top
== NULL
) ? 1 : 0;
2031 * try to maintain a local cache of mbuf
2032 * clusters needed to complete this
2033 * write the list is further limited to
2034 * the number that are currently needed
2035 * to fill the socket this mechanism
2036 * allows a large number of mbufs/
2037 * clusters to be grabbed under a single
2038 * mbuf lock... if we can't get any
2039 * clusters, than fall back to trying
2040 * for mbufs if we fail early (or
2041 * miscalcluate the number needed) make
2042 * sure to release any clusters we
2043 * haven't yet consumed.
2045 if (freelist
== NULL
&&
2046 bytes_to_alloc
> MBIGCLBYTES
&&
2049 bytes_to_alloc
/ M16KCLBYTES
;
2051 if ((bytes_to_alloc
-
2052 (num_needed
* M16KCLBYTES
))
2057 m_getpackets_internal(
2058 (unsigned int *)&num_needed
,
2059 hdrs_needed
, M_WAIT
, 0,
2062 * Fall back to 4K cluster size
2063 * if allocation failed
2067 if (freelist
== NULL
&&
2068 bytes_to_alloc
> MCLBYTES
&&
2071 bytes_to_alloc
/ MBIGCLBYTES
;
2073 if ((bytes_to_alloc
-
2074 (num_needed
* MBIGCLBYTES
)) >=
2079 m_getpackets_internal(
2080 (unsigned int *)&num_needed
,
2081 hdrs_needed
, M_WAIT
, 0,
2084 * Fall back to cluster size
2085 * if allocation failed
2090 * Allocate a cluster as we want to
2091 * avoid to split the data in more
2092 * that one segment and using MINCLSIZE
2093 * would lead us to allocate two mbufs
2095 if (soreserveheadroom
!= 0 &&
2098 bytes_to_alloc
> _MHLEN
) ||
2099 bytes_to_alloc
> _MLEN
)) {
2100 num_needed
= ROUNDUP(bytes_to_alloc
, MCLBYTES
) /
2103 m_getpackets_internal(
2104 (unsigned int *)&num_needed
,
2105 hdrs_needed
, M_WAIT
, 0,
2108 * Fall back to a single mbuf
2109 * if allocation failed
2111 } else if (freelist
== NULL
&&
2112 bytes_to_alloc
> MINCLSIZE
) {
2114 bytes_to_alloc
/ MCLBYTES
;
2116 if ((bytes_to_alloc
-
2117 (num_needed
* MCLBYTES
)) >=
2122 m_getpackets_internal(
2123 (unsigned int *)&num_needed
,
2124 hdrs_needed
, M_WAIT
, 0,
2127 * Fall back to a single mbuf
2128 * if allocation failed
2132 * For datagram protocols, leave
2133 * headroom for protocol headers
2134 * in the first cluster of the chain
2136 if (freelist
!= NULL
&& atomic
&&
2137 top
== NULL
&& headroom
> 0) {
2138 freelist
->m_data
+= headroom
;
2142 * Fall back to regular mbufs without
2143 * reserving the socket headroom
2145 if (freelist
== NULL
) {
2153 if (freelist
== NULL
) {
2159 * For datagram protocols,
2160 * leave room for protocol
2161 * headers in first mbuf.
2163 if (atomic
&& top
== NULL
&&
2164 bytes_to_copy
< MHLEN
) {
2170 freelist
= m
->m_next
;
2173 if ((m
->m_flags
& M_EXT
))
2174 mlen
= m
->m_ext
.ext_size
-
2176 else if ((m
->m_flags
& M_PKTHDR
))
2178 MHLEN
- m_leadingspace(m
);
2180 mlen
= MLEN
- m_leadingspace(m
);
2181 len
= imin(mlen
, bytes_to_copy
);
2187 error
= uiomove(mtod(m
, caddr_t
),
2190 resid
= uio_resid(uio
);
2194 top
->m_pkthdr
.len
+= len
;
2199 if (flags
& MSG_EOR
)
2200 top
->m_flags
|= M_EOR
;
2203 bytes_to_copy
= min(resid
, space
);
2205 } while (space
> 0 &&
2206 (chainlength
< sosendmaxchain
|| atomic
||
2207 resid
< MINCLSIZE
));
2215 if (flags
& (MSG_HOLD
|MSG_SEND
)) {
2216 /* Enqueue for later, go away if HOLD */
2218 if (so
->so_temp
&& (flags
& MSG_FLUSH
)) {
2219 m_freem(so
->so_temp
);
2223 so
->so_tail
->m_next
= top
;
2230 if (flags
& MSG_HOLD
) {
2237 so
->so_options
|= SO_DONTROUTE
;
2240 * Compute flags here, for pru_send and NKEs
2242 * If the user set MSG_EOF, the protocol
2243 * understands this flag and nothing left to
2244 * send then use PRU_SEND_EOF instead of PRU_SEND.
2246 sendflags
= (flags
& MSG_OOB
) ? PRUS_OOB
:
2247 ((flags
& MSG_EOF
) &&
2248 (so
->so_proto
->pr_flags
& PR_IMPLOPCL
) &&
2249 (resid
<= 0)) ? PRUS_EOF
:
2250 /* If there is more to send set PRUS_MORETOCOME */
2251 (resid
> 0 && space
> 0) ? PRUS_MORETOCOME
: 0;
2253 if ((flags
& MSG_SKIPCFIL
) == 0) {
2255 * Socket filter processing
2257 error
= sflt_data_out(so
, addr
, &top
,
2258 &control
, (sendflags
& MSG_OOB
) ?
2259 sock_data_filt_flag_oob
: 0);
2261 if (error
== EJUSTRETURN
) {
2271 * Content filter processing
2273 error
= cfil_sock_data_out(so
, addr
, top
,
2274 control
, (sendflags
& MSG_OOB
) ?
2275 sock_data_filt_flag_oob
: 0);
2277 if (error
== EJUSTRETURN
) {
2285 #endif /* CONTENT_FILTER */
2287 if (so
->so_flags
& SOF_ENABLE_MSGS
) {
2289 * Make a copy of control mbuf,
2290 * so that msg priority can be
2291 * passed to subsequent mbufs.
2293 control_copy
= m_dup(control
, M_NOWAIT
);
2295 error
= (*so
->so_proto
->pr_usrreqs
->pru_send
)
2296 (so
, sendflags
, top
, addr
, control
, p
);
2298 if (flags
& MSG_SEND
)
2302 so
->so_options
&= ~SO_DONTROUTE
;
2305 control
= control_copy
;
2306 control_copy
= NULL
;
	} while (resid && space > 0);

	sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
	socket_unlock(so, 1);

	if (control != NULL)
		m_freem(control);
	if (freelist != NULL)
		m_freem_list(freelist);
	if (control_copy != NULL)
		m_freem(control_copy);

	/*
	 * One write has been done. This was enough. Get back to "normal"
	 * behavior.
	 */
	if (so->so_flags1 & SOF1_PRECONNECT_DATA)
		so->so_flags1 &= ~SOF1_PRECONNECT_DATA;

	/* resid passed here is the bytes left in uio */
	KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
	    VM_KERNEL_ADDRPERM(so),
	    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
	    (int64_t)(orig_resid - resid));

	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, space, error);
/*
 * Supports only connected sockets (no address) without ancillary data
 * (control mbuf) for atomic protocols
 */
sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
	struct mbuf *m, *freelist = NULL;
	user_ssize_t len, resid;
	int error, dontroute, mlen;
	int atomic = sosendallatonce(so);
	struct proc *p = current_proc();
	struct mbuf *top = NULL;
	uint16_t headroom = 0;
	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2371 if (so
->so_type
!= SOCK_DGRAM
) {
2379 if (so
->so_proto
->pr_usrreqs
->pru_send_list
== NULL
) {
2380 error
= EPROTONOSUPPORT
;
2383 if (flags
& ~(MSG_DONTWAIT
| MSG_NBIO
)) {
2387 resid
= uio_array_resid(uioarray
, uiocnt
);
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
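	/*
	 * Illustrative sketch (editor's addition, not part of the original
	 * source): why resid is clamped to a positive int.  imin() takes and
	 * returns plain ints, so a 64-bit user_ssize_t residual that is
	 * negative or larger than INT_MAX would be silently truncated when
	 * the per-copy length is computed, e.g.:
	 *
	 *	user_ssize_t resid = uio_array_resid(uioarray, uiocnt);
	 *	int space = sbspace(&so->so_snd);
	 *	int bytes_to_copy;
	 *
	 *	if (resid < 0 || resid > INT_MAX)
	 *		return (EINVAL);
	 *	bytes_to_copy = imin((int)resid, space);
	 *
	 * The EINVAL return in the sketch is an assumption that simply
	 * mirrors the range check performed below.
	 */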
2399 if (resid
< 0 || resid
> INT_MAX
) {
2405 so_update_last_owner_locked(so
, p
);
2406 so_update_policy(so
);
2409 so_update_necp_policy(so
, NULL
, NULL
);
2412 dontroute
= (flags
& MSG_DONTROUTE
) &&
2413 (so
->so_options
& SO_DONTROUTE
) == 0 &&
2414 (so
->so_proto
->pr_flags
& PR_ATOMIC
);
2415 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgsnd
);
2417 error
= sosendcheck(so
, NULL
, resid
, 0, atomic
, flags
,
	/*
	 * Use big 4 KB clusters when the outgoing interface does not prefer
	 * 2 KB clusters
	 */
	bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;

	if (soreserveheadroom != 0)
		headroom = so->so_pktheadroom;
2435 size_t maxpktlen
= 0;
2438 if (sosendminchain
> 0)
2441 chainlength
= sosendmaxchain
;
2443 socket_unlock(so
, 0);
2446 * Find a set of uio that fit in a reasonable number
2449 for (i
= uiofirst
; i
< uiocnt
; i
++) {
2450 struct uio
*auio
= uioarray
[i
];
2452 len
= uio_resid(auio
);
2454 /* Do nothing for empty messages */
2461 if (len
> maxpktlen
)
2465 if (chainlength
> sosendmaxchain
)
2469 * Nothing left to send
2471 if (num_needed
== 0) {
2476 * Allocate buffer large enough to include headroom space for
2477 * network and link header
2480 bytes_to_alloc
= maxpktlen
+ headroom
;
2483 * Allocate a single contiguous buffer of the smallest available
2484 * size when possible
2486 if (bytes_to_alloc
> MCLBYTES
&&
2487 bytes_to_alloc
<= MBIGCLBYTES
&& bigcl
) {
2488 freelist
= m_getpackets_internal(
2489 (unsigned int *)&num_needed
,
2490 num_needed
, M_WAIT
, 1,
2492 } else if (bytes_to_alloc
> _MHLEN
&&
2493 bytes_to_alloc
<= MCLBYTES
) {
2494 freelist
= m_getpackets_internal(
2495 (unsigned int *)&num_needed
,
2496 num_needed
, M_WAIT
, 1,
2499 freelist
= m_allocpacket_internal(
2500 (unsigned int *)&num_needed
,
2501 bytes_to_alloc
, NULL
, M_WAIT
, 1, 0);
2504 if (freelist
== NULL
) {
2510 * Copy each uio of the set into its own mbuf packet
2512 for (i
= uiofirst
, m
= freelist
;
2513 i
< uiolast
&& m
!= NULL
;
2517 struct uio
*auio
= uioarray
[i
];
2519 bytes_to_copy
= uio_resid(auio
);
2521 /* Do nothing for empty messages */
2522 if (bytes_to_copy
== 0)
2525 * Leave headroom for protocol headers
2526 * in the first mbuf of the chain
2528 m
->m_data
+= headroom
;
2530 for (n
= m
; n
!= NULL
; n
= n
->m_next
) {
2531 if ((m
->m_flags
& M_EXT
))
2532 mlen
= m
->m_ext
.ext_size
-
2534 else if ((m
->m_flags
& M_PKTHDR
))
2536 MHLEN
- m_leadingspace(m
);
2538 mlen
= MLEN
- m_leadingspace(m
);
2539 len
= imin(mlen
, bytes_to_copy
);
2542 * Note: uiomove() decrements the iovec
2545 error
= uiomove(mtod(n
, caddr_t
),
2550 m
->m_pkthdr
.len
+= len
;
2552 VERIFY(m
->m_pkthdr
.len
<= maxpktlen
);
2554 bytes_to_copy
-= len
;
2557 if (m
->m_pkthdr
.len
== 0) {
2559 "%s:%d so %llx pkt %llx type %u len null\n",
2561 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
2562 (uint64_t)DEBUG_KERNEL_ADDRPERM(m
),
2578 so
->so_options
|= SO_DONTROUTE
;
2580 if ((flags
& MSG_SKIPCFIL
) == 0) {
2581 struct mbuf
**prevnextp
= NULL
;
2583 for (i
= uiofirst
, m
= top
;
2584 i
< uiolast
&& m
!= NULL
;
2586 struct mbuf
*nextpkt
= m
->m_nextpkt
;
2589 * Socket filter processing
2591 error
= sflt_data_out(so
, NULL
, &m
,
2593 if (error
!= 0 && error
!= EJUSTRETURN
)
2599 * Content filter processing
2601 error
= cfil_sock_data_out(so
, NULL
, m
,
2603 if (error
!= 0 && error
!= EJUSTRETURN
)
2606 #endif /* CONTENT_FILTER */
2608 * Remove packet from the list when
2609 * swallowed by a filter
2611 if (error
== EJUSTRETURN
) {
2613 if (prevnextp
!= NULL
)
2614 *prevnextp
= nextpkt
;
2621 prevnextp
= &m
->m_nextpkt
;
2625 error
= (*so
->so_proto
->pr_usrreqs
->pru_send_list
)
2626 (so
, 0, top
, NULL
, NULL
, p
);
2629 so
->so_options
&= ~SO_DONTROUTE
;
2633 } while (resid
> 0 && error
== 0);
2636 sbunlock(&so
->so_snd
, FALSE
); /* will unlock socket */
2638 socket_unlock(so
, 1);
2642 if (freelist
!= NULL
)
2643 m_freem_list(freelist
);
2645 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST
| DBG_FUNC_END
, so
, resid
,
2646 so
->so_snd
.sb_cc
, 0, error
);
/*
 * May return ERESTART when packet is dropped by MAC policy check
 */
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2659 struct mbuf
*m
= *mp
;
2660 struct mbuf
*nextrecord
= *nextrecordp
;
2662 KASSERT(m
->m_type
== MT_SONAME
, ("receive 1a"));
2663 #if CONFIG_MACF_SOCKET_SUBSET
2665 * Call the MAC framework for policy checking if we're in
2666 * the user process context and the socket isn't connected.
2668 if (p
!= kernproc
&& !(so
->so_state
& SS_ISCONNECTED
)) {
2669 struct mbuf
*m0
= m
;
2671 * Dequeue this record (temporarily) from the receive
2672 * list since we're about to drop the socket's lock
2673 * where a new record may arrive and be appended to
2674 * the list. Upon MAC policy failure, the record
2675 * will be freed. Otherwise, we'll add it back to
2676 * the head of the list. We cannot rely on SB_LOCK
2677 * because append operation uses the socket's lock.
2680 m
->m_nextpkt
= NULL
;
2681 sbfree(&so
->so_rcv
, m
);
2683 } while (m
!= NULL
);
2685 so
->so_rcv
.sb_mb
= nextrecord
;
2686 SB_EMPTY_FIXUP(&so
->so_rcv
);
2687 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1a");
2688 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1a");
2689 socket_unlock(so
, 0);
2691 if (mac_socket_check_received(proc_ucred(p
), so
,
2692 mtod(m
, struct sockaddr
*)) != 0) {
2694 * MAC policy failure; free this record and
2695 * process the next record (or block until
2696 * one is available). We have adjusted sb_cc
2697 * and sb_mbcnt above so there is no need to
2698 * call sbfree() again.
2702 * Clear SB_LOCK but don't unlock the socket.
2703 * Process the next record or wait for one.
2706 sbunlock(&so
->so_rcv
, TRUE
); /* stay locked */
2712 * If the socket has been defunct'd, drop it.
2714 if (so
->so_flags
& SOF_DEFUNCT
) {
2720 * Re-adjust the socket receive list and re-enqueue
2721 * the record in front of any packets which may have
2722 * been appended while we dropped the lock.
2724 for (m
= m0
; m
->m_next
!= NULL
; m
= m
->m_next
)
2725 sballoc(&so
->so_rcv
, m
);
2726 sballoc(&so
->so_rcv
, m
);
2727 if (so
->so_rcv
.sb_mb
== NULL
) {
2728 so
->so_rcv
.sb_lastrecord
= m0
;
2729 so
->so_rcv
.sb_mbtail
= m
;
2732 nextrecord
= m
->m_nextpkt
= so
->so_rcv
.sb_mb
;
2733 so
->so_rcv
.sb_mb
= m
;
2734 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1b");
2735 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1b");
2737 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2739 *psa
= dup_sockaddr(mtod(m
, struct sockaddr
*), canwait
);
2740 if ((*psa
== NULL
) && (flags
& MSG_NEEDSA
)) {
2741 error
= EWOULDBLOCK
;
2745 if (flags
& MSG_PEEK
) {
2748 sbfree(&so
->so_rcv
, m
);
2749 if (m
->m_next
== NULL
&& so
->so_rcv
.sb_cc
!= 0) {
2750 panic("%s: about to create invalid socketbuf",
2754 MFREE(m
, so
->so_rcv
.sb_mb
);
2755 m
= so
->so_rcv
.sb_mb
;
2757 m
->m_nextpkt
= nextrecord
;
2759 so
->so_rcv
.sb_mb
= nextrecord
;
2760 SB_EMPTY_FIXUP(&so
->so_rcv
);
2765 *nextrecordp
= nextrecord
;
/*
 * Process one or more MT_CONTROL mbufs present before any data mbufs
 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 * just copy the data; if !MSG_PEEK, we call into the protocol to
 * perform externalization.
 */
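/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): what externalization produces for the caller.  A user
 * process receiving SCM_RIGHTS over an AF_UNIX socket walks the
 * returned control data with the CMSG_* macros after recvmsg(2):
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	char data[64], cbuf[CMSG_SPACE(sizeof (int))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof (data) };
 *	struct msghdr msg;
 *	struct cmsghdr *cmsg;
 *	int fd = -1;
 *
 *	memset(&msg, 0, sizeof (msg));
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof (cbuf);
 *	if (recvmsg(s, &msg, 0) >= 0) {
 *		for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
 *		    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *			if (cmsg->cmsg_level == SOL_SOCKET &&
 *			    cmsg->cmsg_type == SCM_RIGHTS)
 *				memcpy(&fd, CMSG_DATA(cmsg), sizeof (fd));
 *		}
 *	}
 *
 * "s" is assumed to be a connected AF_UNIX socket descriptor.
 */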
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
	struct mbuf *cm = NULL, *cmn;
	struct mbuf **cme = &cm;
	struct sockbuf *sb_rcv = &so->so_rcv;
	struct mbuf **msgpcm = NULL;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;
	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below.  Once we re-acquire the
	 * lock, the mbuf chain might change.  In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
2798 if (flags
& MSG_PEEK
) {
2799 if (controlp
!= NULL
) {
2800 if (*controlp
== NULL
) {
2803 *controlp
= m_copy(m
, 0, m
->m_len
);
2806 * If we failed to allocate an mbuf,
2807 * release any previously allocated
2808 * mbufs for control data. Return
2809 * an error. Keep the mbufs in the
2810 * socket as this is using
2813 if (*controlp
== NULL
) {
2818 controlp
= &(*controlp
)->m_next
;
2822 m
->m_nextpkt
= NULL
;
2824 sb_rcv
->sb_mb
= m
->m_next
;
2827 cme
= &(*cme
)->m_next
;
2830 } while (m
!= NULL
&& m
->m_type
== MT_CONTROL
);
2832 if (!(flags
& MSG_PEEK
)) {
2833 if (sb_rcv
->sb_mb
!= NULL
) {
2834 sb_rcv
->sb_mb
->m_nextpkt
= nextrecord
;
2836 sb_rcv
->sb_mb
= nextrecord
;
2837 SB_EMPTY_FIXUP(sb_rcv
);
2839 if (nextrecord
== NULL
)
2840 sb_rcv
->sb_lastrecord
= m
;
2843 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive ctl");
2844 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive ctl");
2846 while (cm
!= NULL
) {
2851 cmsg_type
= mtod(cm
, struct cmsghdr
*)->cmsg_type
;
2854 * Call the protocol to externalize SCM_RIGHTS message
2855 * and return the modified message to the caller upon
2856 * success. Otherwise, all other control messages are
2857 * returned unmodified to the caller. Note that we
2858 * only get into this loop if MSG_PEEK is not set.
2860 if (pr
->pr_domain
->dom_externalize
!= NULL
&&
2861 cmsg_type
== SCM_RIGHTS
) {
2863 * Release socket lock: see 3903171. This
2864 * would also allow more records to be appended
2865 * to the socket buffer. We still have SB_LOCK
2866 * set on it, so we can be sure that the head
2867 * of the mbuf chain won't change.
2869 socket_unlock(so
, 0);
2870 error
= (*pr
->pr_domain
->dom_externalize
)(cm
);
2876 if (controlp
!= NULL
&& error
== 0) {
2878 controlp
= &(*controlp
)->m_next
;
2885 * Update the value of nextrecord in case we received new
2886 * records when the socket was unlocked above for
2887 * externalizing SCM_RIGHTS.
2890 nextrecord
= sb_rcv
->sb_mb
->m_nextpkt
;
2892 nextrecord
= sb_rcv
->sb_mb
;
2896 *nextrecordp
= nextrecord
;
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 *
 * Returns:	0		Success
 *
 *	sblock:EWOULDBLOCK
 *	sodelayed_copy:EFAULT
 *	<pru_rcvoob>:EINVAL[TCP]
 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
 *	<pr_domain->dom_externalize>:???
 *
 * Notes:	Additional return values from calls through <pru_rcvoob> and
 *		<pr_domain->dom_externalize> depend on protocols other than
 *		TCP or AF_UNIX, which are documented above.
 */
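/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): the record layout described above maps directly onto the
 * three destinations a recvmsg(2) caller supplies -- msg_name for the
 * MT_SONAME mbuf, msg_control for the MT_CONTROL mbufs, and the iovec
 * for the data mbufs:
 *
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <string.h>
 *
 *	struct sockaddr_in from;
 *	char data[2048], cbuf[256];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof (data) };
 *	struct msghdr msg;
 *	ssize_t n;
 *
 *	memset(&msg, 0, sizeof (msg));
 *	msg.msg_name = &from;
 *	msg.msg_namelen = sizeof (from);
 *	msg.msg_iov = &iov;
 *	msg.msg_iovlen = 1;
 *	msg.msg_control = cbuf;
 *	msg.msg_controllen = sizeof (cbuf);
 *	n = recvmsg(s, &msg, 0);
 *
 * "s" is assumed to be a bound UDP socket descriptor; psa, controlp and
 * uio below are the kernel-side counterparts of those fields.
 */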
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)

	struct mbuf *m, **mp, *ml = NULL;
	struct mbuf *nextrecord, *free_list;
	int flags, error, offset;
	struct protosw *pr = so->so_proto;
	user_ssize_t orig_resid = uio_resid(uio);
	user_ssize_t delayed_copy_len;
	struct proc *p = current_proc();
	boolean_t en_tracing = FALSE;
2956 * Sanity check on the length passed by caller as we are making 'int'
2959 if (orig_resid
< 0 || orig_resid
> INT_MAX
)
2962 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_START
, so
,
2963 uio_resid(uio
), so
->so_rcv
.sb_cc
, so
->so_rcv
.sb_lowat
,
2964 so
->so_rcv
.sb_hiwat
);
2967 so_update_last_owner_locked(so
, p
);
2968 so_update_policy(so
);
2970 #ifdef MORE_LOCKING_DEBUG
2971 if (so
->so_usecount
== 1) {
2972 panic("%s: so=%x no other reference on socket\n", __func__
, so
);
2979 if (controlp
!= NULL
)
2982 flags
= *flagsp
&~ MSG_EOR
;
2987 * If a recv attempt is made on a previously-accepted socket
2988 * that has been marked as inactive (disconnected), reject
2991 if (so
->so_flags
& SOF_DEFUNCT
) {
2992 struct sockbuf
*sb
= &so
->so_rcv
;
2995 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2996 __func__
, proc_pid(p
), (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
2997 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
2999 * This socket should have been disconnected and flushed
3000 * prior to being returned from sodefunct(); there should
3001 * be no data on its receive list, so panic otherwise.
3003 if (so
->so_state
& SS_DEFUNCT
)
3004 sb_empty_assert(sb
, __func__
);
3005 socket_unlock(so
, 1);
3009 if ((so
->so_flags1
& SOF1_PRECONNECT_DATA
) &&
3010 pr
->pr_usrreqs
->pru_preconnect
) {
		/*
		 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but
		 * not call write() right after this.  *If* the app calls a
		 * read, we do not want to block that read indefinitely.
		 * Thus, we trigger a connect so that the session gets
		 * initiated.
		 */
3017 error
= (*pr
->pr_usrreqs
->pru_preconnect
)(so
);
3020 socket_unlock(so
, 1);
3025 if (ENTR_SHOULDTRACE
&&
3026 (SOCK_CHECK_DOM(so
, AF_INET
) || SOCK_CHECK_DOM(so
, AF_INET6
))) {
3028 * enable energy tracing for inet sockets that go over
3029 * non-loopback interfaces only.
3031 struct inpcb
*inp
= sotoinpcb(so
);
3032 if (inp
->inp_last_outifp
!= NULL
&&
3033 !(inp
->inp_last_outifp
->if_flags
& IFF_LOOPBACK
)) {
3035 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_START
,
3036 VM_KERNEL_ADDRPERM(so
),
3037 ((so
->so_state
& SS_NBIO
) ?
3038 kEnTrFlagNonBlocking
: 0),
3039 (int64_t)orig_resid
);
	/*
	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument.  Here is the case where
	 * out-of-band data is not inline.
	 */
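	/*
	 * Illustrative sketch (editor's addition, not part of the original
	 * source): the non-inline case handled below corresponds to a caller
	 * explicitly asking for urgent data, e.g. on a TCP socket without
	 * SO_OOBINLINE set:
	 *
	 *	char oob;
	 *
	 *	if (recv(s, &oob, 1, MSG_OOB) == 1)
	 *		handle_urgent_byte(oob);
	 *
	 * "s" and handle_urgent_byte() are assumptions for the example; with
	 * SO_OOBINLINE enabled the urgent byte would instead arrive in the
	 * normal data stream and this branch would not be taken.
	 */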
3048 if ((flags
& MSG_OOB
) ||
3049 ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
3050 (so
->so_options
& SO_OOBINLINE
) == 0 &&
3051 (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
)))) {
3052 m
= m_get(M_WAIT
, MT_DATA
);
3054 socket_unlock(so
, 1);
3055 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
,
3056 ENOBUFS
, 0, 0, 0, 0);
3059 error
= (*pr
->pr_usrreqs
->pru_rcvoob
)(so
, m
, flags
& MSG_PEEK
);
3062 socket_unlock(so
, 0);
3064 error
= uiomove(mtod(m
, caddr_t
),
3065 imin(uio_resid(uio
), m
->m_len
), uio
);
3067 } while (uio_resid(uio
) && error
== 0 && m
!= NULL
);
3073 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0) {
3074 if (error
== EWOULDBLOCK
|| error
== EINVAL
) {
				/*
				 * Let's try to get normal data:
				 * EWOULDBLOCK: out-of-band data not
				 * received yet.  EINVAL: out-of-band data
				 */
3083 } else if (error
== 0 && flagsp
!= NULL
) {
3087 socket_unlock(so
, 1);
3089 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3090 VM_KERNEL_ADDRPERM(so
), 0,
3091 (int64_t)(orig_resid
- uio_resid(uio
)));
3093 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
3102 if (so
->so_state
& SS_ISCONFIRMING
&& uio_resid(uio
)) {
3103 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, 0);
3107 delayed_copy_len
= 0;
3109 #ifdef MORE_LOCKING_DEBUG
3110 if (so
->so_usecount
<= 1)
3111 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3112 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
), so
->so_usecount
);
3115 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3116 * and if so just return to the caller. This could happen when
3117 * soreceive() is called by a socket upcall function during the
3118 * time the socket is freed. The socket buffer would have been
3119 * locked across the upcall, therefore we cannot put this thread
3120 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3121 * we may livelock), because the lock on the socket buffer will
3122 * only be released when the upcall routine returns to its caller.
3123 * Because the socket has been officially closed, there can be
3124 * no further read on it.
3126 * A multipath subflow socket would have its SS_NOFDREF set by
3127 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3128 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3130 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
3131 (SS_NOFDREF
| SS_CANTRCVMORE
) && !(so
->so_flags
& SOF_MP_SUBFLOW
)) {
3132 socket_unlock(so
, 1);
3136 error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
));
3138 socket_unlock(so
, 1);
3139 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
3142 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3143 VM_KERNEL_ADDRPERM(so
), 0,
3144 (int64_t)(orig_resid
- uio_resid(uio
)));
3149 m
= so
->so_rcv
.sb_mb
;
3151 * If we have less data than requested, block awaiting more
3152 * (subject to any timeout) if:
3153 * 1. the current count is less than the low water mark, or
3154 * 2. MSG_WAITALL is set, and it is possible to do the entire
3155 * receive operation at once if we block (resid <= hiwat).
3156 * 3. MSG_DONTWAIT is not set
3157 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3158 * we have to do the receive in sections, and thus risk returning
3159 * a short count if a timeout or signal occurs after we start.
3161 if (m
== NULL
|| (((flags
& MSG_DONTWAIT
) == 0 &&
3162 so
->so_rcv
.sb_cc
< uio_resid(uio
)) &&
3163 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
3164 ((flags
& MSG_WAITALL
) && uio_resid(uio
) <= so
->so_rcv
.sb_hiwat
)) &&
3165 m
->m_nextpkt
== NULL
&& (pr
->pr_flags
& PR_ATOMIC
) == 0)) {
3167 * Panic if we notice inconsistencies in the socket's
3168 * receive list; both sb_mb and sb_cc should correctly
3169 * reflect the contents of the list, otherwise we may
3170 * end up with false positives during select() or poll()
3171 * which could put the application in a bad state.
3173 SB_MB_CHECK(&so
->so_rcv
);
3178 error
= so
->so_error
;
3179 if ((flags
& MSG_PEEK
) == 0)
3183 if (so
->so_state
& SS_CANTRCVMORE
) {
3186 * Deal with half closed connections
3188 if ((so
->so_state
& SS_ISDISCONNECTED
) == 0 &&
3189 cfil_sock_data_pending(&so
->so_rcv
) != 0)
3191 "so %llx ignore SS_CANTRCVMORE",
3192 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
));
3194 #endif /* CONTENT_FILTER */
3200 for (; m
!= NULL
; m
= m
->m_next
)
3201 if (m
->m_type
== MT_OOBDATA
|| (m
->m_flags
& M_EOR
)) {
3202 m
= so
->so_rcv
.sb_mb
;
3205 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
3206 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
3210 if (uio_resid(uio
) == 0)
3213 if ((so
->so_state
& SS_NBIO
) ||
3214 (flags
& (MSG_DONTWAIT
|MSG_NBIO
))) {
3215 error
= EWOULDBLOCK
;
3218 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
3219 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
3220 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
3221 #if EVEN_MORE_LOCKING_DEBUG
3223 printf("Waiting for socket data\n");
3226 error
= sbwait(&so
->so_rcv
);
3227 #if EVEN_MORE_LOCKING_DEBUG
3229 printf("SORECEIVE - sbwait returned %d\n", error
);
3231 if (so
->so_usecount
< 1) {
3232 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3233 __func__
, so
, so
->so_usecount
);
3237 socket_unlock(so
, 1);
3238 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
3241 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3242 VM_KERNEL_ADDRPERM(so
), 0,
3243 (int64_t)(orig_resid
- uio_resid(uio
)));
3250 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgrcv
);
3251 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
3252 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
3253 nextrecord
= m
->m_nextpkt
;
3255 if ((pr
->pr_flags
& PR_ADDR
) && m
->m_type
== MT_SONAME
) {
3256 error
= soreceive_addr(p
, so
, psa
, flags
, &m
, &nextrecord
,
3258 if (error
== ERESTART
)
3260 else if (error
!= 0)
3266 * Process one or more MT_CONTROL mbufs present before any data mbufs
3267 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3268 * just copy the data; if !MSG_PEEK, we call into the protocol to
3269 * perform externalization.
3271 if (m
!= NULL
&& m
->m_type
== MT_CONTROL
) {
3272 error
= soreceive_ctl(so
, controlp
, flags
, &m
, &nextrecord
);
3279 * If the socket is a TCP socket with message delivery
3280 * enabled, then create a control msg to deliver the
3281 * relative TCP sequence number for this data. Waiting
3282 * until this point will protect against failures to
3283 * allocate an mbuf for control msgs.
3285 if (so
->so_type
== SOCK_STREAM
&& SOCK_PROTO(so
) == IPPROTO_TCP
&&
3286 (so
->so_flags
& SOF_ENABLE_MSGS
) && controlp
!= NULL
) {
3287 struct mbuf
*seq_cm
;
3289 seq_cm
= sbcreatecontrol((caddr_t
)&m
->m_pkthdr
.msg_seq
,
3290 sizeof (uint32_t), SCM_SEQNUM
, SOL_SOCKET
);
3291 if (seq_cm
== NULL
) {
3292 /* unable to allocate a control mbuf */
3297 controlp
= &seq_cm
->m_next
;
3301 if (!(flags
& MSG_PEEK
)) {
3303 * We get here because m points to an mbuf following
3304 * any MT_SONAME or MT_CONTROL mbufs which have been
3305 * processed above. In any case, m should be pointing
3306 * to the head of the mbuf chain, and the nextrecord
3307 * should be either NULL or equal to m->m_nextpkt.
3308 * See comments above about SB_LOCK.
3310 if (m
!= so
->so_rcv
.sb_mb
||
3311 m
->m_nextpkt
!= nextrecord
) {
3312 panic("%s: post-control !sync so=%p m=%p "
3313 "nextrecord=%p\n", __func__
, so
, m
,
3317 if (nextrecord
== NULL
)
3318 so
->so_rcv
.sb_lastrecord
= m
;
3321 if (type
== MT_OOBDATA
)
3324 if (!(flags
& MSG_PEEK
)) {
3325 SB_EMPTY_FIXUP(&so
->so_rcv
);
3328 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 2");
3329 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 2");
3334 if (!(flags
& MSG_PEEK
) && uio_resid(uio
) > sorecvmincopy
)
3342 (uio_resid(uio
) - delayed_copy_len
) > 0 && error
== 0) {
3343 if (m
->m_type
== MT_OOBDATA
) {
3344 if (type
!= MT_OOBDATA
)
3346 } else if (type
== MT_OOBDATA
) {
			/*
			 * Make sure to always set the MSG_OOB event when
			 * getting out-of-band data inline.
			 */
3353 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
3354 (so
->so_options
& SO_OOBINLINE
) != 0 &&
3355 (so
->so_state
& SS_RCVATMARK
) != 0) {
3358 so
->so_state
&= ~SS_RCVATMARK
;
3359 len
= uio_resid(uio
) - delayed_copy_len
;
3360 if (so
->so_oobmark
&& len
> so
->so_oobmark
- offset
)
3361 len
= so
->so_oobmark
- offset
;
3362 if (len
> m
->m_len
- moff
)
3363 len
= m
->m_len
- moff
;
3365 * If mp is set, just pass back the mbufs.
3366 * Otherwise copy them out via the uio, then free.
3367 * Sockbuf must be consistent here (points to current mbuf,
3368 * it points to next record) when we drop priority;
3369 * we must note any additions to the sockbuf when we
3370 * block interrupts again.
3373 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive uiomove");
3374 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive uiomove");
3375 if (can_delay
&& len
== m
->m_len
) {
				/*
				 * Only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it worthwhile
				 * to drop and retake the lock... can_delay
				 * reflects the state of the two latter
				 * constraints; moff should always be zero.
				 */
3385 delayed_copy_len
+= len
;
3387 if (delayed_copy_len
) {
3388 error
= sodelayed_copy(so
, uio
,
3389 &free_list
, &delayed_copy_len
);
					/*
					 * We can only get here if MSG_PEEK is
					 * not set; therefore, m should point at
					 * the head of the rcv queue.  If it
					 * doesn't, something drastically
					 * changed while we were out from behind
					 * the lock in sodelayed_copy, perhaps
					 * a RST on the stream.  In any event,
					 * the stream has been interrupted.  It's
					 * probably best just to return whatever
					 * data we've moved and let the caller
					 * sort it out.
					 */
!= so
->so_rcv
.sb_mb
) {
3411 socket_unlock(so
, 0);
3412 error
= uiomove(mtod(m
, caddr_t
) + moff
,
3420 uio_setresid(uio
, (uio_resid(uio
) - len
));
3422 if (len
== m
->m_len
- moff
) {
3423 if (m
->m_flags
& M_EOR
)
3425 if (flags
& MSG_PEEK
) {
3429 nextrecord
= m
->m_nextpkt
;
3430 sbfree(&so
->so_rcv
, m
);
3431 m
->m_nextpkt
= NULL
;
3434 * If this packet is an unordered packet
3435 * (indicated by M_UNORDERED_DATA flag), remove
3436 * the additional bytes added to the
3437 * receive socket buffer size.
3439 if ((so
->so_flags
& SOF_ENABLE_MSGS
) &&
3441 (m
->m_flags
& M_UNORDERED_DATA
) &&
3442 sbreserve(&so
->so_rcv
,
3443 so
->so_rcv
.sb_hiwat
- m
->m_len
)) {
3444 if (so
->so_msg_state
->msg_uno_bytes
>
3447 msg_uno_bytes
-= m
->m_len
;
3452 m
->m_flags
&= ~M_UNORDERED_DATA
;
3458 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
3461 if (free_list
== NULL
)
3466 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
3470 m
->m_nextpkt
= nextrecord
;
3471 if (nextrecord
== NULL
)
3472 so
->so_rcv
.sb_lastrecord
= m
;
3474 so
->so_rcv
.sb_mb
= nextrecord
;
3475 SB_EMPTY_FIXUP(&so
->so_rcv
);
3477 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 3");
3478 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 3");
3481 if (flags
& MSG_PEEK
) {
3487 if (flags
& MSG_DONTWAIT
)
3488 copy_flag
= M_DONTWAIT
;
3491 *mp
= m_copym(m
, 0, len
, copy_flag
);
3493 * Failed to allocate an mbuf?
3494 * Adjust uio_resid back, it was
3495 * adjusted down by len bytes which
3496 * we didn't copy over.
3500 (uio_resid(uio
) + len
));
3506 so
->so_rcv
.sb_cc
-= len
;
3509 if (so
->so_oobmark
) {
3510 if ((flags
& MSG_PEEK
) == 0) {
3511 so
->so_oobmark
-= len
;
3512 if (so
->so_oobmark
== 0) {
3513 so
->so_state
|= SS_RCVATMARK
;
3515 * delay posting the actual event until
3516 * after any delayed copy processing
3524 if (offset
== so
->so_oobmark
)
3528 if (flags
& MSG_EOR
)
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
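		/*
		 * Illustrative sketch (editor's addition, not part of the
		 * original source): from user space, the semantics described
		 * above are what make a fixed-size read loop unnecessary on a
		 * stream socket:
		 *
		 *	char hdr[16];
		 *	ssize_t n = recv(s, hdr, sizeof (hdr), MSG_WAITALL);
		 *
		 *	if (n == sizeof (hdr))
		 *		parse_header(hdr);
		 *	else if (n > 0)
		 *		handle_short_read(n);
		 *
		 * A short positive count without an error is exactly the
		 * signal/timeout case the comment above describes.  "s",
		 * parse_header() and handle_short_read() are assumptions for
		 * the example.
		 */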
3538 while (flags
& (MSG_WAITALL
|MSG_WAITSTREAM
) && m
== NULL
&&
3539 (uio_resid(uio
) - delayed_copy_len
) > 0 &&
3540 !sosendallatonce(so
) && !nextrecord
) {
3541 if (so
->so_error
|| ((so
->so_state
& SS_CANTRCVMORE
)
3543 && cfil_sock_data_pending(&so
->so_rcv
) == 0
3544 #endif /* CONTENT_FILTER */
3549 * Depending on the protocol (e.g. TCP), the following
3550 * might cause the socket lock to be dropped and later
3551 * be reacquired, and more data could have arrived and
3552 * have been appended to the receive socket buffer by
3553 * the time it returns. Therefore, we only sleep in
3554 * sbwait() below if and only if the socket buffer is
3555 * empty, in order to avoid a false sleep.
3557 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
&&
3558 (((struct inpcb
*)so
->so_pcb
)->inp_state
!=
3560 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
3562 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 2");
3563 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 2");
3565 if (so
->so_rcv
.sb_mb
== NULL
&& sbwait(&so
->so_rcv
)) {
			/*
			 * We have to wait until after we get back from the
			 * sbwait to do the copy, because we will drop the lock
			 * if we have enough data that has been delayed... by
			 * dropping the lock we open up a window allowing the
			 * netisr thread to process the incoming packets and to
			 * change the state of this socket... we're issuing the
			 * sbwait because the socket is empty and we're
			 * expecting the netisr thread to wake us up when more
			 * packets arrive; if we allow that processing to happen
			 * and then sbwait we could stall forever with packets
			 * sitting in the socket if no further packets arrive
			 * from the remote side.
			 *
			 * We want to copy before we've collected all the data
			 * to satisfy this request, to allow the copy to overlap
			 * the incoming packet processing on an MP system.
			 */
3587 if (delayed_copy_len
> sorecvmincopy
&&
3588 (delayed_copy_len
> (so
->so_rcv
.sb_hiwat
/ 2))) {
3589 error
= sodelayed_copy(so
, uio
,
3590 &free_list
, &delayed_copy_len
);
3595 m
= so
->so_rcv
.sb_mb
;
3597 nextrecord
= m
->m_nextpkt
;
3599 SB_MB_CHECK(&so
->so_rcv
);
3602 #ifdef MORE_LOCKING_DEBUG
3603 if (so
->so_usecount
<= 1) {
3604 panic("%s: after big while so=%p ref=%d on socket\n",
3605 __func__
, so
, so
->so_usecount
);
3610 if (m
!= NULL
&& pr
->pr_flags
& PR_ATOMIC
) {
3611 if (so
->so_options
& SO_DONTTRUNC
) {
3612 flags
|= MSG_RCVMORE
;
3615 if ((flags
& MSG_PEEK
) == 0)
3616 (void) sbdroprecord(&so
->so_rcv
);
3621 * pru_rcvd below (for TCP) may cause more data to be received
3622 * if the socket lock is dropped prior to sending the ACK; some
3623 * legacy OpenTransport applications don't handle this well
3624 * (if it receives less data than requested while MSG_HAVEMORE
3625 * is set), and so we set the flag now based on what we know
3626 * prior to calling pru_rcvd.
3628 if ((so
->so_options
& SO_WANTMORE
) && so
->so_rcv
.sb_cc
> 0)
3629 flags
|= MSG_HAVEMORE
;
3631 if ((flags
& MSG_PEEK
) == 0) {
3633 so
->so_rcv
.sb_mb
= nextrecord
;
3635 * First part is an inline SB_EMPTY_FIXUP(). Second
3636 * part makes sure sb_lastrecord is up-to-date if
3637 * there is still data in the socket buffer.
3639 if (so
->so_rcv
.sb_mb
== NULL
) {
3640 so
->so_rcv
.sb_mbtail
= NULL
;
3641 so
->so_rcv
.sb_lastrecord
= NULL
;
3642 } else if (nextrecord
->m_nextpkt
== NULL
) {
3643 so
->so_rcv
.sb_lastrecord
= nextrecord
;
3645 SB_MB_CHECK(&so
->so_rcv
);
3647 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 4");
3648 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 4");
3649 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
)
3650 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
3653 if (delayed_copy_len
) {
3654 error
= sodelayed_copy(so
, uio
, &free_list
, &delayed_copy_len
);
3658 if (free_list
!= NULL
) {
3659 m_freem_list(free_list
);
3663 postevent(so
, 0, EV_OOB
);
3665 if (orig_resid
== uio_resid(uio
) && orig_resid
&&
3666 (flags
& MSG_EOR
) == 0 && (so
->so_state
& SS_CANTRCVMORE
) == 0) {
3667 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
3674 #ifdef MORE_LOCKING_DEBUG
3675 if (so
->so_usecount
<= 1) {
3676 panic("%s: release so=%p ref=%d on socket\n", __func__
,
3677 so
, so
->so_usecount
);
3681 if (delayed_copy_len
)
3682 error
= sodelayed_copy(so
, uio
, &free_list
, &delayed_copy_len
);
3684 if (free_list
!= NULL
)
3685 m_freem_list(free_list
);
3687 sbunlock(&so
->so_rcv
, FALSE
); /* will unlock socket */
3690 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3691 VM_KERNEL_ADDRPERM(so
),
3692 ((error
== EWOULDBLOCK
) ? kEnTrFlagNoWork
: 0),
3693 (int64_t)(orig_resid
- uio_resid(uio
)));
3695 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, so
, uio_resid(uio
),
3696 so
->so_rcv
.sb_cc
, 0, error
);
/*
 * Returns:	0		Success
 */
sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
    user_ssize_t *resid)

	socket_unlock(so, 0);

	while (m != NULL && error == 0) {
		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);

	m_freem_list(*free_list);
sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
    u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)

	struct mbuf *ml, *m;

	for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
	    ml = ml->m_nextpkt, i++) {
		auio = msgarray[i].uio;
		for (m = ml; m != NULL; m = m->m_next) {
			error = uiomove(mtod(m, caddr_t), m->m_len, auio);

	m_freem_list(*free_list);
3759 soreceive_list(struct socket
*so
, struct recv_msg_elem
*msgarray
, u_int uiocnt
,
3763 struct mbuf
*nextrecord
;
3764 struct mbuf
*ml
= NULL
, *free_list
= NULL
, *free_tail
= NULL
;
3766 user_ssize_t len
, pktlen
, delayed_copy_len
= 0;
3767 struct protosw
*pr
= so
->so_proto
;
3769 struct proc
*p
= current_proc();
3770 struct uio
*auio
= NULL
;
3773 struct sockaddr
**psa
= NULL
;
3774 struct mbuf
**controlp
= NULL
;
3777 struct mbuf
*free_others
= NULL
;
3779 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST
| DBG_FUNC_START
,
3781 so
->so_rcv
.sb_cc
, so
->so_rcv
.sb_lowat
, so
->so_rcv
.sb_hiwat
);
 /*
  * - Only supports don't-wait flags
  * - Only supports datagram sockets (could be extended to raw)
  * - Protocol must support packet chains
  * - The uio array is NULL (should we panic?)
  */
3795 if (flags
& ~(MSG_PEEK
| MSG_WAITALL
| MSG_DONTWAIT
| MSG_NEEDSA
|
3797 printf("%s invalid flags 0x%x\n", __func__
, flags
);
3801 if (so
->so_type
!= SOCK_DGRAM
) {
3805 if (sosendallatonce(so
) == 0) {
3809 if (so
->so_proto
->pr_usrreqs
->pru_send_list
== NULL
) {
3810 error
= EPROTONOSUPPORT
;
3813 if (msgarray
== NULL
) {
3814 printf("%s uioarray is NULL\n", __func__
);
3819 printf("%s uiocnt is 0\n", __func__
);
3824 * Sanity check on the length passed by caller as we are making 'int'
3827 resid
= recv_msg_array_resid(msgarray
, uiocnt
);
3828 if (resid
< 0 || resid
> INT_MAX
) {
3833 if (!(flags
& MSG_PEEK
) && sorecvmincopy
> 0)
3839 so_update_last_owner_locked(so
, p
);
3840 so_update_policy(so
);
3843 so_update_necp_policy(so
, NULL
, NULL
);
3847 * If a recv attempt is made on a previously-accepted socket
3848 * that has been marked as inactive (disconnected), reject
3851 if (so
->so_flags
& SOF_DEFUNCT
) {
3852 struct sockbuf
*sb
= &so
->so_rcv
;
3855 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
3856 __func__
, proc_pid(p
), (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
3857 SOCK_DOM(so
), SOCK_TYPE(so
), error
));
3859 * This socket should have been disconnected and flushed
3860 * prior to being returned from sodefunct(); there should
3861 * be no data on its receive list, so panic otherwise.
3863 if (so
->so_state
& SS_DEFUNCT
)
3864 sb_empty_assert(sb
, __func__
);
3870 * The uio may be empty
3872 if (npkts
>= uiocnt
) {
3878 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3879 * and if so just return to the caller. This could happen when
3880 * soreceive() is called by a socket upcall function during the
3881 * time the socket is freed. The socket buffer would have been
3882 * locked across the upcall, therefore we cannot put this thread
3883 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3884 * we may livelock), because the lock on the socket buffer will
3885 * only be released when the upcall routine returns to its caller.
3886 * Because the socket has been officially closed, there can be
3887 * no further read on it.
3889 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
3890 (SS_NOFDREF
| SS_CANTRCVMORE
)) {
3895 error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
));
3901 m
= so
->so_rcv
.sb_mb
;
3903 * Block awaiting more datagram if needed
3905 if (m
== NULL
|| (((flags
& MSG_DONTWAIT
) == 0 &&
3906 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
3907 ((flags
& MSG_WAITALL
) && npkts
< uiocnt
))))) {
3909 * Panic if we notice inconsistencies in the socket's
3910 * receive list; both sb_mb and sb_cc should correctly
3911 * reflect the contents of the list, otherwise we may
3912 * end up with false positives during select() or poll()
3913 * which could put the application in a bad state.
3915 SB_MB_CHECK(&so
->so_rcv
);
3918 error
= so
->so_error
;
3919 if ((flags
& MSG_PEEK
) == 0)
3923 if (so
->so_state
& SS_CANTRCVMORE
) {
3926 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
3927 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
3931 if ((so
->so_state
& SS_NBIO
) ||
3932 (flags
& (MSG_DONTWAIT
|MSG_NBIO
))) {
3933 error
= EWOULDBLOCK
;
3937 * Do not block if we got some data
3939 if (free_list
!= NULL
) {
3944 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
3945 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
3947 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
3950 error
= sbwait(&so
->so_rcv
);
3957 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgrcv
);
3958 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
3959 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
3962 * Consume the current uio index as we have a datagram
3964 auio
= msgarray
[npkts
].uio
;
3965 resid
= uio_resid(auio
);
3966 msgarray
[npkts
].which
|= SOCK_MSG_DATA
;
3967 psa
= (msgarray
[npkts
].which
& SOCK_MSG_SA
) ?
3968 &msgarray
[npkts
].psa
: NULL
;
3969 controlp
= (msgarray
[npkts
].which
& SOCK_MSG_CONTROL
) ?
3970 &msgarray
[npkts
].controlp
: NULL
;
3972 nextrecord
= m
->m_nextpkt
;
3974 if ((pr
->pr_flags
& PR_ADDR
) && m
->m_type
== MT_SONAME
) {
3975 error
= soreceive_addr(p
, so
, psa
, flags
, &m
, &nextrecord
, 1);
3976 if (error
== ERESTART
)
3978 else if (error
!= 0)
3982 if (m
!= NULL
&& m
->m_type
== MT_CONTROL
) {
3983 error
= soreceive_ctl(so
, controlp
, flags
, &m
, &nextrecord
);
3988 if (m
->m_pkthdr
.len
== 0) {
3989 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
3991 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
3992 (uint64_t)DEBUG_KERNEL_ADDRPERM(m
),
3997 * Loop to copy the mbufs of the current record
3998 * Support zero length packets
4002 while (m
!= NULL
&& (len
= resid
- pktlen
) >= 0 && error
== 0) {
4004 panic("%p m_len zero", m
);
4006 panic("%p m_type zero", m
);
4008 * Clip to the residual length
4014 * Copy the mbufs via the uio or delay the copy
4015 * Sockbuf must be consistent here (points to current mbuf,
4016 * it points to next record) when we drop priority;
4017 * we must note any additions to the sockbuf when we
4018 * block interrupts again.
4020 if (len
> 0 && can_delay
== 0) {
4021 socket_unlock(so
, 0);
4022 error
= uiomove(mtod(m
, caddr_t
), (int)len
, auio
);
4027 delayed_copy_len
+= len
;
4030 if (len
== m
->m_len
) {
4032 * m was entirely copied
4034 sbfree(&so
->so_rcv
, m
);
4035 nextrecord
= m
->m_nextpkt
;
4036 m
->m_nextpkt
= NULL
;
4039 * Set the first packet to the head of the free list
4041 if (free_list
== NULL
)
4044 * Link current packet to tail of free list
4047 if (free_tail
!= NULL
)
4048 free_tail
->m_nextpkt
= m
;
4052 * Link current mbuf to last mbuf of current packet
4059 * Move next buf to head of socket buffer
4061 so
->so_rcv
.sb_mb
= m
= ml
->m_next
;
4065 m
->m_nextpkt
= nextrecord
;
4066 if (nextrecord
== NULL
)
4067 so
->so_rcv
.sb_lastrecord
= m
;
4069 so
->so_rcv
.sb_mb
= nextrecord
;
4070 SB_EMPTY_FIXUP(&so
->so_rcv
);
4072 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 3");
4073 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 3");
4076 * Stop the loop on partial copy
4081 #ifdef MORE_LOCKING_DEBUG
4082 if (so
->so_usecount
<= 1) {
4083 panic("%s: after big while so=%llx ref=%d on socket\n",
4085 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
), so
->so_usecount
);
4090 * Tell the caller we made a partial copy
4093 if (so
->so_options
& SO_DONTTRUNC
) {
4095 * Copyout first the freelist then the partial mbuf
4097 socket_unlock(so
, 0);
4098 if (delayed_copy_len
)
4099 error
= sodelayed_copy_list(so
, msgarray
,
4100 uiocnt
, &free_list
, &delayed_copy_len
);
4103 error
= uiomove(mtod(m
, caddr_t
), (int)len
,
4112 so
->so_rcv
.sb_cc
-= len
;
4113 flags
|= MSG_RCVMORE
;
4115 (void) sbdroprecord(&so
->so_rcv
);
4116 nextrecord
= so
->so_rcv
.sb_mb
;
4123 so
->so_rcv
.sb_mb
= nextrecord
;
4125 * First part is an inline SB_EMPTY_FIXUP(). Second
4126 * part makes sure sb_lastrecord is up-to-date if
4127 * there is still data in the socket buffer.
4129 if (so
->so_rcv
.sb_mb
== NULL
) {
4130 so
->so_rcv
.sb_mbtail
= NULL
;
4131 so
->so_rcv
.sb_lastrecord
= NULL
;
4132 } else if (nextrecord
->m_nextpkt
== NULL
) {
4133 so
->so_rcv
.sb_lastrecord
= nextrecord
;
4135 SB_MB_CHECK(&so
->so_rcv
);
4137 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 4");
4138 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 4");
4141 * We can continue to the next packet as long as:
4142 * - We haven't exhausted the uio array
4143 * - There was no error
4144 * - A packet was not truncated
4145 * - We can still receive more data
4147 if (npkts
< uiocnt
&& error
== 0 &&
4148 (flags
& (MSG_RCVMORE
| MSG_TRUNC
)) == 0 &&
4149 (so
->so_state
& SS_CANTRCVMORE
) == 0) {
4150 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
4160 * pru_rcvd may cause more data to be received if the socket lock
4161 * is dropped so we set MSG_HAVEMORE now based on what we know.
4162 * That way the caller won't be surprised if it receives less data
4165 if ((so
->so_options
& SO_WANTMORE
) && so
->so_rcv
.sb_cc
> 0)
4166 flags
|= MSG_HAVEMORE
;
4168 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
)
4169 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
4172 sbunlock(&so
->so_rcv
, FALSE
); /* will unlock socket */
4174 socket_unlock(so
, 1);
4176 if (delayed_copy_len
)
4177 error
= sodelayed_copy_list(so
, msgarray
, uiocnt
,
4178 &free_list
, &delayed_copy_len
);
4181 * Amortize the cost of freeing the mbufs
4183 if (free_list
!= NULL
)
4184 m_freem_list(free_list
);
4185 if (free_others
!= NULL
)
4186 m_freem_list(free_others
);
4188 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST
| DBG_FUNC_END
, error
,
/*
 * Returns:	0		Success
 *
 *	<pru_shutdown>:EINVAL
 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:EHOSTUNREACH[TCP]
 *	<pru_shutdown>:ENETUNREACH[TCP]
 *	<pru_shutdown>:ENETDOWN[TCP]
 *	<pru_shutdown>:ENOMEM[TCP]
 *	<pru_shutdown>:EACCES[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:???[TCP]	[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_shutdown>:???	[other protocol families]
 */
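/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): the "how" argument below carries the user-space value passed
 * to shutdown(2), e.g.:
 *
 *	if (shutdown(s, SHUT_WR) == -1)
 *		perror("shutdown");
 *
 * SHUT_WR closes only the send side (a FIN on TCP) while leaving the
 * receive side open, so an error from <pru_shutdown> listed above
 * typically surfaces directly as the shutdown(2) return value.  "s" is
 * an assumed connected socket descriptor.
 */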
soshutdown(struct socket *so, int how)

	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);

	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
		error = soshutdownlock(so, how);
		socket_unlock(so, 1);

	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
soshutdownlock_final(struct socket *so, int how)

	struct protosw *pr = so->so_proto;

	sflt_notify(so, sock_evt_shutdown, &how);

	if (how != SHUT_WR) {
		if ((so->so_state & SS_CANTRCVMORE) != 0) {
			/* read already shut down */
		postevent(so, 0, EV_RCLOSED);
	if (how != SHUT_RD) {
		if ((so->so_state & SS_CANTSENDMORE) != 0) {
			/* write already shut down */
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
		postevent(so, 0, EV_WCLOSED);

	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
soshutdownlock(struct socket *so, int how)

	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		error = cfil_sock_shutdown(so, &how);
		if (error == EJUSTRETURN) {
		} else if (error != 0) {
#endif /* CONTENT_FILTER */

	error = soshutdownlock_final(so, how);
sowflush(struct socket *so)

	struct sockbuf *sb = &so->so_snd;
	lck_mtx_t *mutex_held;

	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
	sb->sb_flags |= SB_DROP;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;

	sbunlock(sb, TRUE);	/* keep socket locked */

	selthreadclear(&sb->sb_sel);
sorflush(struct socket *so)

	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	lck_mtx_t *mutex_held;

	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	sflt_notify(so, sock_evt_flush_read, NULL);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function.  In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof (asb));
	asb.sb_cc = sb->sb_cc;
	asb.sb_hiwat = sb->sb_hiwat;
	asb.sb_mbcnt = sb->sb_mbcnt;
	asb.sb_mbmax = sb->sb_mbmax;
	asb.sb_ctl = sb->sb_ctl;
	asb.sb_lowat = sb->sb_lowat;
	asb.sb_mb = sb->sb_mb;
	asb.sb_mbtail = sb->sb_mbtail;
	asb.sb_lastrecord = sb->sb_lastrecord;
	asb.sb_so = sb->sb_so;
	asb.sb_flags = sb->sb_flags;
	asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
	asb.sb_flags |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	sb->sb_timeo.tv_sec = 0;
	sb->sb_timeo.tv_usec = 0;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
	sb->sb_flags |= SB_DROP;

	sbunlock(sb, TRUE);	/* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Returns:	0		Success
 */
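/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): the usual calling pattern, as seen throughout this file and
 * in protocol pr_ctloutput() handlers, copies the option value into a
 * local of the expected size and range-checks it afterwards:
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof (optval),
 *	    sizeof (optval));
 *	if (error != 0)
 *		goto out;
 *	if (optval < 0) {
 *		error = EINVAL;
 *		goto out;
 *	}
 *
 * The "out" label and the particular range check are assumptions for
 * the example; only the sooptcopyin() call itself mirrors the code
 * below.
 */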
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return (EINVAL);
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_p != kernproc)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
/*
 * sooptcopyin_timeval
 *	Copy in a timeval value into tv_p, and take into account whether
 *	the calling process is 64-bit or 32-bit.  Moved the sanity checking
 *	code here so that we can verify the 64-bit tv_sec value before we
 *	lose the top 32 bits when assigning tv64.tv_sec to tv_p->tv_sec.
 */
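/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): this is the path taken when a process sets a socket timeout,
 * e.g.:
 *
 *	struct timeval tv;
 *
 *	tv.tv_sec = 5;
 *	tv.tv_usec = 0;
 *	if (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)) == -1)
 *		perror("setsockopt");
 *
 * A 64-bit process marshals a user64_timeval and a 32-bit process a
 * user32_timeval, which is why both layouts are handled below.  "s" is
 * an assumed socket descriptor.
 */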
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)

	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval tv64;

		if (sopt->sopt_valsize < sizeof (tv64))
			return (EINVAL);

		sopt->sopt_valsize = sizeof (tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
			if (error != 0)
				return (error);
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
			    sizeof (tv64));
		}
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
			return (EDOM);

		tv_p->tv_sec = tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		struct user32_timeval tv32;

		if (sopt->sopt_valsize < sizeof (tv32))
			return (EINVAL);

		sopt->sopt_valsize = sizeof (tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
			if (error != 0)
				return (error);
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
			    sizeof (tv32));
		}
		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
			return (EDOM);

		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
/*
 * Returns:	0		Success
 *
 *	sooptcopyin:EINVAL
 *	sooptcopyin:EFAULT
 *	sooptcopyin_timeval:EINVAL
 *	sooptcopyin_timeval:EFAULT
 *	sooptcopyin_timeval:EDOM
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	sflt_attach_private:???	[whatever a filter author chooses]
 *	<sf_setoption>:???	[whatever a filter author chooses]
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
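/*
 * Illustrative sketch (editor's addition, not part of the original
 * source): a representative caller of this path sets a socket-level
 * option such as SO_LINGER, which is the first case handled below:
 *
 *	struct linger l;
 *
 *	l.l_onoff = 1;
 *	l.l_linger = 10;
 *	if (setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof (l)) == -1)
 *		perror("setsockopt");
 *
 * "s" is an assumed connected socket descriptor; the struct linger
 * value is copied in via sooptcopyin() below.
 */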
4557 sosetoptlock(struct socket
*so
, struct sockopt
*sopt
, int dolock
)
4562 #if CONFIG_MACF_SOCKET
4564 #endif /* MAC_SOCKET */
4566 if (sopt
->sopt_dir
!= SOPT_SET
)
4567 sopt
->sopt_dir
= SOPT_SET
;
4572 if ((so
->so_state
& (SS_CANTRCVMORE
| SS_CANTSENDMORE
)) ==
4573 (SS_CANTRCVMORE
| SS_CANTSENDMORE
) &&
4574 (so
->so_flags
& SOF_NPX_SETOPTSHUT
) == 0) {
4575 /* the socket has been shutdown, no more sockopt's */
4580 error
= sflt_setsockopt(so
, sopt
);
4582 if (error
== EJUSTRETURN
)
4587 if (sopt
->sopt_level
!= SOL_SOCKET
) {
4588 if (so
->so_proto
!= NULL
&&
4589 so
->so_proto
->pr_ctloutput
!= NULL
) {
4590 error
= (*so
->so_proto
->pr_ctloutput
)(so
, sopt
);
4593 error
= ENOPROTOOPT
;
4596 * Allow socket-level (SOL_SOCKET) options to be filtered by
4597 * the protocol layer, if needed. A zero value returned from
4598 * the handler means use default socket-level processing as
4599 * done by the rest of this routine. Otherwise, any other
4600 * return value indicates that the option is unsupported.
4602 if (so
->so_proto
!= NULL
&& (error
= so
->so_proto
->pr_usrreqs
->
4603 pru_socheckopt(so
, sopt
)) != 0)
4607 switch (sopt
->sopt_name
) {
4610 error
= sooptcopyin(sopt
, &l
, sizeof (l
), sizeof (l
));
4614 so
->so_linger
= (sopt
->sopt_name
== SO_LINGER
) ?
4615 l
.l_linger
: l
.l_linger
* hz
;
4617 so
->so_options
|= SO_LINGER
;
4619 so
->so_options
&= ~SO_LINGER
;
4625 case SO_USELOOPBACK
:
4631 case SO_TIMESTAMP_MONOTONIC
:
4634 case SO_WANTOOBFLAG
:
4635 case SO_NOWAKEFROMSLEEP
:
4636 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4641 so
->so_options
|= sopt
->sopt_name
;
4643 so
->so_options
&= ~sopt
->sopt_name
;
4650 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
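			/*
			 * Illustrative sketch (editor's addition, not part of
			 * the original source): the check above rejects
			 * non-positive sizes coming from a call such as:
			 *
			 *	int bufsz = 256 * 1024;
			 *
			 *	if (setsockopt(s, SOL_SOCKET, SO_RCVBUF,
			 *	    &bufsz, sizeof (bufsz)) == -1)
			 *		perror("setsockopt");
			 *
			 * "s" is an assumed socket descriptor; sbreserve()
			 * below may still fail if the requested size cannot
			 * be honored.
			 */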
4664 switch (sopt
->sopt_name
) {
4667 struct sockbuf
*sb
=
4668 (sopt
->sopt_name
== SO_SNDBUF
) ?
4669 &so
->so_snd
: &so
->so_rcv
;
4670 if (sbreserve(sb
, (u_int32_t
)optval
) == 0) {
4674 sb
->sb_flags
|= SB_USRSIZE
;
4675 sb
->sb_flags
&= ~SB_AUTOSIZE
;
4676 sb
->sb_idealsize
= (u_int32_t
)optval
;
4680 * Make sure the low-water is never greater than
4684 int space
= sbspace(&so
->so_snd
);
4685 u_int32_t hiwat
= so
->so_snd
.sb_hiwat
;
4687 if (so
->so_snd
.sb_flags
& SB_UNIX
) {
4689 (struct unpcb
*)(so
->so_pcb
);
4691 unp
->unp_conn
!= NULL
) {
4692 hiwat
+= unp
->unp_conn
->unp_cc
;
4696 so
->so_snd
.sb_lowat
=
4700 if (space
>= so
->so_snd
.sb_lowat
) {
4707 so
->so_rcv
.sb_lowat
=
4708 (optval
> so
->so_rcv
.sb_hiwat
) ?
4709 so
->so_rcv
.sb_hiwat
: optval
;
4710 data_len
= so
->so_rcv
.sb_cc
4711 - so
->so_rcv
.sb_ctl
;
4712 if (data_len
>= so
->so_rcv
.sb_lowat
)
4721 error
= sooptcopyin_timeval(sopt
, &tv
);
4725 switch (sopt
->sopt_name
) {
4727 so
->so_snd
.sb_timeo
= tv
;
4730 so
->so_rcv
.sb_timeo
= tv
;
4738 error
= sooptcopyin(sopt
, &nke
, sizeof (nke
),
4743 error
= sflt_attach_internal(so
, nke
.nke_handle
);
4748 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4753 so
->so_flags
|= SOF_NOSIGPIPE
;
4755 so
->so_flags
&= ~SOF_NOSIGPIPE
;
4759 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4764 so
->so_flags
|= SOF_NOADDRAVAIL
;
4766 so
->so_flags
&= ~SOF_NOADDRAVAIL
;
4769 case SO_REUSESHAREUID
:
4770 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4775 so
->so_flags
|= SOF_REUSESHAREUID
;
4777 so
->so_flags
&= ~SOF_REUSESHAREUID
;
4780 case SO_NOTIFYCONFLICT
:
4781 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4785 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4790 so
->so_flags
|= SOF_NOTIFYCONFLICT
;
4792 so
->so_flags
&= ~SOF_NOTIFYCONFLICT
;
4795 case SO_RESTRICTIONS
:
4796 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4801 error
= so_set_restrictions(so
, optval
);
4804 case SO_AWDL_UNRESTRICTED
:
4805 if (SOCK_DOM(so
) != PF_INET
&&
4806 SOCK_DOM(so
) != PF_INET6
) {
4810 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
4815 kauth_cred_t cred
= NULL
;
4816 proc_t ep
= PROC_NULL
;
4818 if (so
->so_flags
& SOF_DELEGATED
) {
4819 ep
= proc_find(so
->e_pid
);
4821 cred
= kauth_cred_proc_ref(ep
);
4823 error
= priv_check_cred(
4824 cred
? cred
: so
->so_cred
,
4825 PRIV_NET_RESTRICTED_AWDL
, 0);
4827 inp_set_awdl_unrestricted(
4830 kauth_cred_unref(&cred
);
4831 if (ep
!= PROC_NULL
)
4834 inp_clear_awdl_unrestricted(sotoinpcb(so
));
4838 #if CONFIG_MACF_SOCKET
4839 if ((error
= sooptcopyin(sopt
, &extmac
, sizeof (extmac
),
4840 sizeof (extmac
))) != 0)
4843 error
= mac_setsockopt_label(proc_ucred(sopt
->sopt_p
),
4847 #endif /* MAC_SOCKET */
4850 case SO_UPCALLCLOSEWAIT
:
4851 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4856 so
->so_flags
|= SOF_UPCALLCLOSEWAIT
;
4858 so
->so_flags
&= ~SOF_UPCALLCLOSEWAIT
;
4862 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4867 so
->so_flags
|= SOF_BINDRANDOMPORT
;
4869 so
->so_flags
&= ~SOF_BINDRANDOMPORT
;
4872 case SO_NP_EXTENSIONS
: {
4873 struct so_np_extensions sonpx
;
4875 error
= sooptcopyin(sopt
, &sonpx
, sizeof (sonpx
),
4879 if (sonpx
.npx_mask
& ~SONPX_MASK_VALID
) {
4884 * Only one bit defined for now
4886 if ((sonpx
.npx_mask
& SONPX_SETOPTSHUT
)) {
4887 if ((sonpx
.npx_flags
& SONPX_SETOPTSHUT
))
4888 so
->so_flags
|= SOF_NPX_SETOPTSHUT
;
4890 so
->so_flags
&= ~SOF_NPX_SETOPTSHUT
;
4895 case SO_TRAFFIC_CLASS
: {
4896 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4900 error
= so_set_traffic_class(so
, optval
);
4906 case SO_RECV_TRAFFIC_CLASS
: {
4907 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4912 so
->so_flags
&= ~SOF_RECV_TRAFFIC_CLASS
;
4914 so
->so_flags
|= SOF_RECV_TRAFFIC_CLASS
;
4918 case SO_TRAFFIC_CLASS_DBG
: {
4919 struct so_tcdbg so_tcdbg
;
4921 error
= sooptcopyin(sopt
, &so_tcdbg
,
4922 sizeof (struct so_tcdbg
), sizeof (struct so_tcdbg
));
4925 error
= so_set_tcdbg(so
, &so_tcdbg
);
4931 case SO_PRIVILEGED_TRAFFIC_CLASS
:
4932 error
= priv_check_cred(kauth_cred_get(),
4933 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS
, 0);
4936 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4941 so
->so_flags
&= ~SOF_PRIVILEGED_TRAFFIC_CLASS
;
4943 so
->so_flags
|= SOF_PRIVILEGED_TRAFFIC_CLASS
;
4947 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4949 if (error
!= 0 || (so
->so_flags
& SOF_DEFUNCT
)) {
4955 * Any process can set SO_DEFUNCTOK (clear
4956 * SOF_NODEFUNCT), but only root can clear
4957 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
4960 kauth_cred_issuser(kauth_cred_get()) == 0) {
4965 so
->so_flags
&= ~SOF_NODEFUNCT
;
4967 so
->so_flags
|= SOF_NODEFUNCT
;
4969 if (SOCK_DOM(so
) == PF_INET
||
4970 SOCK_DOM(so
) == PF_INET6
) {
4971 char s
[MAX_IPv6_STR_LEN
];
4972 char d
[MAX_IPv6_STR_LEN
];
4973 struct inpcb
*inp
= sotoinpcb(so
);
4975 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
4976 "%s:%d] is now marked as %seligible for "
4977 "defunct\n", __func__
, proc_selfpid(),
4978 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
4979 (SOCK_TYPE(so
) == SOCK_STREAM
) ?
4980 "TCP" : "UDP", inet_ntop(SOCK_DOM(so
),
4981 ((SOCK_DOM(so
) == PF_INET
) ?
4982 (void *)&inp
->inp_laddr
.s_addr
:
4983 (void *)&inp
->in6p_laddr
), s
, sizeof (s
)),
4984 ntohs(inp
->in6p_lport
),
4985 inet_ntop(SOCK_DOM(so
),
4986 (SOCK_DOM(so
) == PF_INET
) ?
4987 (void *)&inp
->inp_faddr
.s_addr
:
4988 (void *)&inp
->in6p_faddr
, d
, sizeof (d
)),
4989 ntohs(inp
->in6p_fport
),
4990 (so
->so_flags
& SOF_NODEFUNCT
) ?
4993 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
4994 "now marked as %seligible for defunct\n",
4995 __func__
, proc_selfpid(),
4996 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
4997 SOCK_DOM(so
), SOCK_TYPE(so
),
4998 (so
->so_flags
& SOF_NODEFUNCT
) ?
5004 /* This option is not settable */
5008 case SO_OPPORTUNISTIC
:
5009 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5012 error
= so_set_opportunistic(so
, optval
);
5016 /* This option is handled by lower layer(s) */
5021 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5024 error
= so_set_recv_anyif(so
, optval
);
5027 case SO_TRAFFIC_MGT_BACKGROUND
: {
5028 /* This option is handled by lower layer(s) */
5034 case SO_FLOW_DIVERT_TOKEN
:
5035 error
= flow_divert_token_set(so
, sopt
);
5037 #endif /* FLOW_DIVERT */
5041 if ((error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5042 sizeof (optval
))) != 0)
5045 error
= so_set_effective_pid(so
, optval
, sopt
->sopt_p
);
5048 case SO_DELEGATED_UUID
: {
5051 if ((error
= sooptcopyin(sopt
, &euuid
, sizeof (euuid
),
5052 sizeof (euuid
))) != 0)
5055 error
= so_set_effective_uuid(so
, euuid
, sopt
->sopt_p
);
5060 case SO_NECP_ATTRIBUTES
:
5061 error
= necp_set_socket_attributes(so
, sopt
);
5066 case SO_MPTCP_FASTJOIN
:
5067 if (!((so
->so_flags
& SOF_MP_SUBFLOW
) ||
5068 ((SOCK_CHECK_DOM(so
, PF_MULTIPATH
)) &&
5069 (SOCK_CHECK_PROTO(so
, IPPROTO_TCP
))))) {
5070 error
= ENOPROTOOPT
;
5074 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5079 so
->so_flags
&= ~SOF_MPTCP_FASTJOIN
;
5081 so
->so_flags
|= SOF_MPTCP_FASTJOIN
;
5085 case SO_EXTENDED_BK_IDLE
:
5086 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5089 error
= so_set_extended_bk_idle(so
, optval
);
5092 case SO_MARK_CELLFALLBACK
:
5093 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5102 so
->so_flags1
&= ~SOF1_CELLFALLBACK
;
5104 so
->so_flags1
|= SOF1_CELLFALLBACK
;
5107 error
= ENOPROTOOPT
;
5110 if (error
== 0 && so
->so_proto
!= NULL
&&
5111 so
->so_proto
->pr_ctloutput
!= NULL
) {
5112 (void) so
->so_proto
->pr_ctloutput(so
, sopt
);
5117 socket_unlock(so
, 1);
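/*
 * Illustrative userspace sketch (not part of this file's build): how the
 * socket-level options handled above arrive via setsockopt(2). The option
 * names are standard Darwin/BSD ones; error handling is minimal on purpose.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>
#include <stdio.h>

static int
set_basic_socket_options(int s)
{
    int bufsz = 64 * 1024;        /* lands in the SO_SNDBUF/SO_RCVBUF path (sbreserve) */
    int nosigpipe = 1;            /* sets SOF_NOSIGPIPE on the socket */
    struct timeval tv = { 5, 0 }; /* copied in via sooptcopyin_timeval() */

    if (setsockopt(s, SOL_SOCKET, SO_SNDBUF, &bufsz, sizeof (bufsz)) == -1 ||
        setsockopt(s, SOL_SOCKET, SO_RCVBUF, &bufsz, sizeof (bufsz)) == -1 ||
        setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &nosigpipe, sizeof (nosigpipe)) == -1 ||
        setsockopt(s, SOL_SOCKET, SO_SNDTIMEO, &tv, sizeof (tv)) == -1) {
        perror("setsockopt");
        return (-1);
    }
    return (0);
}
#endif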
/* Helper routines for getsockopt */
sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
    /*
     * Documented get behavior is that we always return a value,
     * possibly truncated to fit in the user's buffer.
     * Traditional behavior is that we always tell the user
     * precisely how much we copied, rather than something useful
     * like the total amount we had available for her.
     * Note that this interface is not idempotent; the entire answer must
     * be generated ahead of time.
     */
    valsize = min(len, sopt->sopt_valsize);
    sopt->sopt_valsize = valsize;
    if (sopt->sopt_val != USER_ADDR_NULL) {
        if (sopt->sopt_p != kernproc)
            error = copyout(buf, sopt->sopt_val, valsize);
            bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);

sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
    struct user64_timeval tv64;
    struct user32_timeval tv32;
    if (proc_is64bit(sopt->sopt_p)) {
        len = sizeof (tv64);
        tv64.tv_sec = tv_p->tv_sec;
        tv64.tv_usec = tv_p->tv_usec;
        len = sizeof (tv32);
        tv32.tv_sec = tv_p->tv_sec;
        tv32.tv_usec = tv_p->tv_usec;
    valsize = min(len, sopt->sopt_valsize);
    sopt->sopt_valsize = valsize;
    if (sopt->sopt_val != USER_ADDR_NULL) {
        if (sopt->sopt_p != kernproc)
            error = copyout(val, sopt->sopt_val, valsize);
            bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
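/*
 * Minimal sketch of how a getsockopt handler typically uses sooptcopyout():
 * build the full answer first, then let sooptcopyout() truncate it to the
 * caller's buffer. The option name and flag below are hypothetical and only
 * illustrate the calling pattern.
 */
#if 0
    case SO_EXAMPLE_OPTION:                             /* hypothetical option */
        optval = (so->so_flags & SOF_EXAMPLE_FLAG) ? 1 : 0; /* hypothetical flag */
        error = sooptcopyout(sopt, &optval, sizeof (optval));
        break;
#endif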
 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 * <pr_ctloutput>:???
 * <sf_getoption>:???
 */
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
#if CONFIG_MACF_SOCKET
#endif /* MAC_SOCKET */
    if (sopt->sopt_dir != SOPT_GET)
        sopt->sopt_dir = SOPT_GET;
    error = sflt_getsockopt(so, sopt);
        if (error == EJUSTRETURN)
    if (sopt->sopt_level != SOL_SOCKET) {
        if (so->so_proto != NULL &&
            so->so_proto->pr_ctloutput != NULL) {
            error = (*so->so_proto->pr_ctloutput)(so, sopt);
        error = ENOPROTOOPT;
        /*
         * Allow socket-level (SOL_SOCKET) options to be filtered by
         * the protocol layer, if needed.  A zero value returned from
         * the handler means use default socket-level processing as
         * done by the rest of this routine.  Otherwise, any other
         * return value indicates that the option is unsupported.
         */
        if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
            pru_socheckopt(so, sopt)) != 0)
        switch (sopt->sopt_name) {
            l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
            l.l_linger = (sopt->sopt_name == SO_LINGER) ?
                so->so_linger : so->so_linger / hz;
            error = sooptcopyout(sopt, &l, sizeof (l));
        case SO_USELOOPBACK:
        case SO_TIMESTAMP_MONOTONIC:
        case SO_WANTOOBFLAG:
        case SO_NOWAKEFROMSLEEP:
            optval = so->so_options & sopt->sopt_name;
            error = sooptcopyout(sopt, &optval, sizeof (optval));
            optval = so->so_type;
            if (so->so_proto->pr_flags & PR_ATOMIC) {
                m1 = so->so_rcv.sb_mb;
                while (m1 != NULL) {
                    if (m1->m_type == MT_DATA ||
                        m1->m_type == MT_HEADER ||
                        m1->m_type == MT_OOBDATA)
                        pkt_total += m1->m_len;
                optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
            if (so->so_proto->pr_flags & PR_ATOMIC) {
                m1 = so->so_rcv.sb_mb;
                while (m1 != NULL) {
                    if (m1->m_type == MT_DATA ||
                        m1->m_type == MT_HEADER ||
                        m1->m_type == MT_OOBDATA)
            optval = so->so_snd.sb_cc;
            optval = so->so_error;
            u_int32_t hiwat = so->so_snd.sb_hiwat;
            if (so->so_snd.sb_flags & SB_UNIX) {
                    (struct unpcb *)(so->so_pcb);
                if (unp != NULL && unp->unp_conn != NULL) {
                    hiwat += unp->unp_conn->unp_cc;
            optval = so->so_rcv.sb_hiwat;
            optval = so->so_snd.sb_lowat;
            optval = so->so_rcv.sb_lowat;
            tv = (sopt->sopt_name == SO_SNDTIMEO ?
                so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
            error = sooptcopyout_timeval(sopt, &tv);
            optval = (so->so_flags & SOF_NOSIGPIPE);
            optval = (so->so_flags & SOF_NOADDRAVAIL);
        case SO_REUSESHAREUID:
            optval = (so->so_flags & SOF_REUSESHAREUID);
        case SO_NOTIFYCONFLICT:
            optval = (so->so_flags & SOF_NOTIFYCONFLICT);
        case SO_RESTRICTIONS:
            optval = so_get_restrictions(so);
        case SO_AWDL_UNRESTRICTED:
            if (SOCK_DOM(so) == PF_INET ||
                SOCK_DOM(so) == PF_INET6) {
                optval = inp_get_awdl_unrestricted(
#if CONFIG_MACF_SOCKET
            if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
                sizeof (extmac))) != 0 ||
                (error = mac_socket_label_get(proc_ucred(
                sopt->sopt_p), so, &extmac)) != 0)
            error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#endif /* MAC_SOCKET */
#if CONFIG_MACF_SOCKET
            if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
                sizeof (extmac))) != 0 ||
                (error = mac_socketpeer_label_get(proc_ucred(
                sopt->sopt_p), so, &extmac)) != 0)
            error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#endif /* MAC_SOCKET */
#ifdef __APPLE_API_PRIVATE
        case SO_UPCALLCLOSEWAIT:
            optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
            optval = (so->so_flags & SOF_BINDRANDOMPORT);
        case SO_NP_EXTENSIONS: {
            struct so_np_extensions sonpx;
            sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
                SONPX_SETOPTSHUT : 0;
            sonpx.npx_mask = SONPX_MASK_VALID;
            error = sooptcopyout(sopt, &sonpx,
                sizeof (struct so_np_extensions));
        case SO_TRAFFIC_CLASS:
            optval = so->so_traffic_class;
        case SO_RECV_TRAFFIC_CLASS:
            optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
        case SO_TRAFFIC_CLASS_STATS:
            error = sooptcopyout(sopt, &so->so_tc_stats,
                sizeof (so->so_tc_stats));
        case SO_TRAFFIC_CLASS_DBG:
            error = sogetopt_tcdbg(so, sopt);
        case SO_PRIVILEGED_TRAFFIC_CLASS:
            optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
            optval = !(so->so_flags & SOF_NODEFUNCT);
            optval = (so->so_flags & SOF_DEFUNCT);
        case SO_OPPORTUNISTIC:
            optval = so_get_opportunistic(so);
            /* This option is not gettable */
            optval = so_get_recv_anyif(so);
        case SO_TRAFFIC_MGT_BACKGROUND:
            /* This option is handled by lower layer(s) */
            if (so->so_proto != NULL &&
                so->so_proto->pr_ctloutput != NULL) {
                (void) so->so_proto->pr_ctloutput(so, sopt);
        case SO_FLOW_DIVERT_TOKEN:
            error = flow_divert_token_get(so, sopt);
#endif /* FLOW_DIVERT */
        case SO_NECP_ATTRIBUTES:
            error = necp_get_socket_attributes(so, sopt);
        case SO_CFIL_SOCK_ID: {
            cfil_sock_id_t sock_id;
            sock_id = cfil_sock_id_from_socket(so);
            error = sooptcopyout(sopt, &sock_id,
                sizeof (cfil_sock_id_t));
#endif /* CONTENT_FILTER */
        case SO_MPTCP_FASTJOIN:
            if (!((so->so_flags & SOF_MP_SUBFLOW) ||
                ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
                (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
                error = ENOPROTOOPT;
            optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
            /* Fixed along with rdar://19391339 */
        case SO_EXTENDED_BK_IDLE:
            optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
        case SO_MARK_CELLFALLBACK:
            optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
            error = ENOPROTOOPT;
    socket_unlock(so, 1);
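/*
 * Illustrative userspace counterpart (not compiled here): reading values
 * produced by the SOL_SOCKET cases above via getsockopt(2). SO_NREAD and
 * SO_ERROR are standard Darwin options.
 */
#if 0
#include <sys/socket.h>
#include <stdio.h>

static void
dump_socket_state(int s)
{
    int nread = 0, soerror = 0;
    socklen_t len = sizeof (nread);

    if (getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len) == 0)
        printf("bytes of protocol data available: %d\n", nread);
    len = sizeof (soerror);
    if (getsockopt(s, SOL_SOCKET, SO_ERROR, &soerror, &len) == 0)
        printf("pending so_error: %d\n", soerror);
}
#endif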
/*
 * The size limits on our soopt_getm are different from those on FreeBSD.
 * We limit the size of options to MCLBYTES. This will have to change
 * if we need to define options that need more space than MCLBYTES.
 */
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
    struct mbuf *m, *m_prev;
    int sopt_size = sopt->sopt_valsize;
    if (sopt_size <= 0 || sopt_size > MCLBYTES)
    how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
    MGET(m, how, MT_DATA);
    if (sopt_size > MLEN) {
        if ((m->m_flags & M_EXT) == 0) {
        m->m_len = min(MCLBYTES, sopt_size);
        m->m_len = min(MLEN, sopt_size);
    sopt_size -= m->m_len;
    while (sopt_size > 0) {
        MGET(m, how, MT_DATA);
        if (sopt_size > MLEN) {
            if ((m->m_flags & M_EXT) == 0) {
            m->m_len = min(MCLBYTES, sopt_size);
            m->m_len = min(MLEN, sopt_size);
        sopt_size -= m->m_len;

/* copyin sopt data into mbuf chain */
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
    struct mbuf *m0 = m;
    if (sopt->sopt_val == USER_ADDR_NULL)
    while (m != NULL && sopt->sopt_valsize >= m->m_len) {
        if (sopt->sopt_p != kernproc) {
            error = copyin(sopt->sopt_val, mtod(m, char *),
            bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
                mtod(m, char *), m->m_len);
        sopt->sopt_valsize -= m->m_len;
        sopt->sopt_val += m->m_len;
    /* should have been allocated with enough space at ip6_sooptmcopyin() */
    panic("soopt_mcopyin");

/* copyout mbuf chain data into soopt */
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
    struct mbuf *m0 = m;
    if (sopt->sopt_val == USER_ADDR_NULL)
    while (m != NULL && sopt->sopt_valsize >= m->m_len) {
        if (sopt->sopt_p != kernproc) {
            error = copyout(mtod(m, char *), sopt->sopt_val,
            bcopy(mtod(m, char *),
                CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
        sopt->sopt_valsize -= m->m_len;
        sopt->sopt_val += m->m_len;
        valsize += m->m_len;
    /* enough soopt buffer should be given from user-land */
    sopt->sopt_valsize = valsize;
sohasoutofband(struct socket *so)
    if (so->so_pgid < 0)
        gsignal(-so->so_pgid, SIGURG);
    else if (so->so_pgid > 0)
        proc_signal(so->so_pgid, SIGURG);
    selwakeup(&so->so_rcv.sb_sel);

sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
#pragma unused(cred)
    struct proc *p = current_proc();
    so_update_last_owner_locked(so, PROC_NULL);
    so_update_policy(so);
    if (events & (POLLIN | POLLRDNORM))
            revents |= events & (POLLIN | POLLRDNORM);
    if (events & (POLLOUT | POLLWRNORM))
        if (sowriteable(so))
            revents |= events & (POLLOUT | POLLWRNORM);
    if (events & (POLLPRI | POLLRDBAND))
        if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
            revents |= events & (POLLPRI | POLLRDBAND);
    if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
        /*
         * Darwin sets the flag first,
         * BSD calls selrecord first
         */
        so->so_rcv.sb_flags |= SB_SEL;
        selrecord(p, &so->so_rcv.sb_sel, wql);
    if (events & (POLLOUT | POLLWRNORM)) {
        /*
         * Darwin sets the flag first,
         * BSD calls selrecord first
         */
        so->so_snd.sb_flags |= SB_SEL;
        selrecord(p, &so->so_snd.sb_sel, wql);
    socket_unlock(so, 1);
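/*
 * Illustrative userspace view of sopoll() (not compiled here): poll(2) with
 * POLLIN/POLLOUT/POLLPRI maps onto the readable/writeable/out-of-band checks
 * performed above.
 */
#if 0
#include <poll.h>
#include <stdio.h>

static void
wait_for_socket(int s)
{
    struct pollfd pfd = { .fd = s, .events = POLLIN | POLLOUT | POLLPRI };

    if (poll(&pfd, 1, 1000 /* ms */) > 0) {
        if (pfd.revents & POLLIN)
            printf("readable\n");
        if (pfd.revents & POLLOUT)
            printf("writeable\n");
        if (pfd.revents & POLLPRI)
            printf("out-of-band data at mark\n");
    }
}
#endif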
soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
#if !CONFIG_MACF_SOCKET
#endif /* MAC_SOCKET */
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    so_update_last_owner_locked(so, PROC_NULL);
    so_update_policy(so);
#if CONFIG_MACF_SOCKET
    if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
        socket_unlock(so, 1);
#endif /* MAC_SOCKET */
    switch (kn->kn_filter) {
        kn->kn_fop = &soread_filtops;
        /*
         * If the caller explicitly asked for OOB results (e.g. poll()),
         * save that off in the hookid field and reserve the kn_flags
         * EV_OOBAND bit for output only.
         */
        if (kn->kn_flags & EV_OOBAND) {
            kn->kn_flags &= ~EV_OOBAND;
            kn->kn_hookid = EV_OOBAND;
        skl = &so->so_rcv.sb_sel.si_note;
        kn->kn_fop = &sowrite_filtops;
        skl = &so->so_snd.sb_sel.si_note;
        kn->kn_fop = &sock_filtops;
        skl = &so->so_klist;
        kn->kn_status |= KN_TOUCH;
        socket_unlock(so, 1);
    if (KNOTE_ATTACH(skl, kn)) {
        switch (kn->kn_filter) {
            so->so_rcv.sb_flags |= SB_KNOTE;
            so->so_snd.sb_flags |= SB_KNOTE;
            so->so_flags |= SOF_KNOTE;
        socket_unlock(so, 1);
    socket_unlock(so, 1);

filt_sordetach(struct knote *kn)
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    if (so->so_rcv.sb_flags & SB_KNOTE)
        if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
            so->so_rcv.sb_flags &= ~SB_KNOTE;
    socket_unlock(so, 1);
filt_soread(struct knote *kn, long hint)
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    if ((hint & SO_FILT_HINT_LOCKED) == 0)
    if (so->so_options & SO_ACCEPTCONN) {
        /*
         * Radar 6615193 handle the listen case dynamically
         * for kqueue read filter. This allows listen() to be
         * called after registering the kqueue EVFILT_READ.
         */
        kn->kn_data = so->so_qlen;
        isempty = ! TAILQ_EMPTY(&so->so_comp);
        if ((hint & SO_FILT_HINT_LOCKED) == 0)
            socket_unlock(so, 1);
    /* socket isn't a listener */
    /*
     * NOTE_LOWAT specifies new low water mark in data, i.e.
     * the bytes of protocol data. We therefore exclude any
     */
    kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
    /*
     * Clear out EV_OOBAND that filt_soread may have set in the
     */
    kn->kn_flags &= ~EV_OOBAND;
    if ((so->so_oobmark) || (so->so_state & SS_RCVATMARK)) {
        kn->kn_flags |= EV_OOBAND;
        /*
         * If caller registered explicit interest in OOB data,
         * return immediately (data == amount beyond mark, for
         * legacy reasons - that should be changed later).
         */
        if (kn->kn_hookid == EV_OOBAND) {
            /*
             * When so_state is SS_RCVATMARK, so_oobmark
             */
            kn->kn_data -= so->so_oobmark;
            if ((hint & SO_FILT_HINT_LOCKED) == 0)
                socket_unlock(so, 1);
    if ((so->so_state & SS_CANTRCVMORE)
        && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
        kn->kn_flags |= EV_EOF;
        kn->kn_fflags = so->so_error;
        if ((hint & SO_FILT_HINT_LOCKED) == 0)
            socket_unlock(so, 1);
    if (so->so_error) {    /* temporary udp error */
        if ((hint & SO_FILT_HINT_LOCKED) == 0)
            socket_unlock(so, 1);
    int64_t lowwat = so->so_rcv.sb_lowat;
    /*
     * Ensure that when NOTE_LOWAT is used, the derived
     * low water mark is bounded by socket's rcv buf's
     * high and low water mark values.
     */
    if (kn->kn_sfflags & NOTE_LOWAT) {
        if (kn->kn_sdata > so->so_rcv.sb_hiwat)
            lowwat = so->so_rcv.sb_hiwat;
        else if (kn->kn_sdata > lowwat)
            lowwat = kn->kn_sdata;
    if ((hint & SO_FILT_HINT_LOCKED) == 0)
        socket_unlock(so, 1);
    /*
     * The order below is important. Since NOTE_LOWAT
     * overrides sb_lowat, check for NOTE_LOWAT case
     */
    if (kn->kn_sfflags & NOTE_LOWAT)
        return (kn->kn_data >= lowwat);
    return (so->so_rcv.sb_cc >= lowwat);
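/*
 * Illustrative userspace use of the read filter above (not compiled here):
 * EVFILT_READ with NOTE_LOWAT, which filt_soread() clamps to the receive
 * buffer's high/low water marks before comparing against kn_data.
 */
#if 0
#include <sys/event.h>
#include <sys/time.h>
#include <unistd.h>

static int
wait_for_bytes(int s, long lowat)
{
    int kq = kqueue();
    struct kevent kev;

    if (kq == -1)
        return (-1);
    EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
    if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1 ||
        kevent(kq, NULL, 0, &kev, 1, NULL) == -1) {
        close(kq);
        return (-1);
    }
    close(kq);
    return ((int)kev.data);    /* bytes of protocol data available */
}
#endif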
filt_sowdetach(struct knote *kn)
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    if (so->so_snd.sb_flags & SB_KNOTE)
        if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
            so->so_snd.sb_flags &= ~SB_KNOTE;
    socket_unlock(so, 1);

so_wait_for_if_feedback(struct socket *so)
    if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
        (so->so_state & SS_ISCONNECTED)) {
        struct inpcb *inp = sotoinpcb(so);
        if (INP_WAIT_FOR_IF_FEEDBACK(inp))

filt_sowrite(struct knote *kn, long hint)
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    if ((hint & SO_FILT_HINT_LOCKED) == 0)
    kn->kn_data = sbspace(&so->so_snd);
    if (so->so_state & SS_CANTSENDMORE) {
        kn->kn_flags |= EV_EOF;
        kn->kn_fflags = so->so_error;
    if (so->so_error) {    /* temporary udp error */
    if (!socanwrite(so)) {
    if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
    int64_t lowwat = so->so_snd.sb_lowat;
    if (kn->kn_sfflags & NOTE_LOWAT) {
        if (kn->kn_sdata > so->so_snd.sb_hiwat)
            lowwat = so->so_snd.sb_hiwat;
        else if (kn->kn_sdata > lowwat)
            lowwat = kn->kn_sdata;
    if (kn->kn_data >= lowwat) {
        if (so->so_flags & SOF_NOTSENT_LOWAT) {
            if ((SOCK_DOM(so) == PF_INET ||
                SOCK_DOM(so) == PF_INET6) &&
                so->so_type == SOCK_STREAM) {
                ret = tcp_notsent_lowat_check(so);
            else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
                (SOCK_PROTO(so) == IPPROTO_TCP)) {
                ret = mptcp_notsent_lowat_check(so);
    if (so_wait_for_if_feedback(so))
    if ((hint & SO_FILT_HINT_LOCKED) == 0)
        socket_unlock(so, 1);
filt_sockdetach(struct knote *kn)
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    if ((so->so_flags & SOF_KNOTE) != 0)
        if (KNOTE_DETACH(&so->so_klist, kn))
            so->so_flags &= ~SOF_KNOTE;
    socket_unlock(so, 1);

filt_sockev(struct knote *kn, long hint)
    int ret = 0, locked = 0;
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    long ev_hint = (hint & SO_FILT_HINT_EV);
    uint32_t level_trigger = 0;
    if ((hint & SO_FILT_HINT_LOCKED) == 0) {
    if (ev_hint & SO_FILT_HINT_CONNRESET) {
        kn->kn_fflags |= NOTE_CONNRESET;
    if (ev_hint & SO_FILT_HINT_TIMEOUT) {
        kn->kn_fflags |= NOTE_TIMEOUT;
    if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
        kn->kn_fflags |= NOTE_NOSRCADDR;
    if (ev_hint & SO_FILT_HINT_IFDENIED) {
        kn->kn_fflags |= NOTE_IFDENIED;
    if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
        kn->kn_fflags |= NOTE_KEEPALIVE;
    if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
        kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
    if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
        kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
    if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
        (so->so_state & SS_ISCONNECTED)) {
        kn->kn_fflags |= NOTE_CONNECTED;
        level_trigger |= NOTE_CONNECTED;
    if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
        (so->so_state & SS_ISDISCONNECTED)) {
        kn->kn_fflags |= NOTE_DISCONNECTED;
        level_trigger |= NOTE_DISCONNECTED;
    if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
        if (so->so_proto != NULL &&
            (so->so_proto->pr_flags & PR_EVCONNINFO))
            kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
    if ((so->so_state & SS_CANTRCVMORE)
        && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
        kn->kn_fflags |= NOTE_READCLOSED;
        level_trigger |= NOTE_READCLOSED;
    if (so->so_state & SS_CANTSENDMORE) {
        kn->kn_fflags |= NOTE_WRITECLOSED;
        level_trigger |= NOTE_WRITECLOSED;
    if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
        (so->so_flags & SOF_SUSPENDED)) {
        kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
        /* If resume event was delivered before, reset it */
        kn->kn_hookid &= ~NOTE_RESUME;
        kn->kn_fflags |= NOTE_SUSPEND;
        level_trigger |= NOTE_SUSPEND;
    if ((ev_hint & SO_FILT_HINT_RESUME) ||
        (so->so_flags & SOF_SUSPENDED) == 0) {
        kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
        /* If suspend event was delivered before, reset it */
        kn->kn_hookid &= ~NOTE_SUSPEND;
        kn->kn_fflags |= NOTE_RESUME;
        level_trigger |= NOTE_RESUME;
    if (so->so_error != 0) {
        kn->kn_data = so->so_error;
        kn->kn_flags |= EV_EOF;
        get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
    /* Reset any events that are not requested on this knote */
    kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
    level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
    /* Find the level triggered events that are already delivered */
    level_trigger &= kn->kn_hookid;
    level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
    /* Do not deliver level triggered events more than once */
    if ((kn->kn_fflags & ~level_trigger) != 0)
    socket_unlock(so, 1);

filt_socktouch(struct knote *kn, struct kevent_internal_s *kev, long type)
    case EVENT_REGISTER:
        uint32_t changed_flags;
        changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
        /*
         * Since we keep track of events that are already
         * delivered, if any of those events are not requested
         * anymore the state related to them can be reset
         */
            ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
        /*
         * Store the state of the events being delivered. This
         * state can be used to deliver level triggered events
         * at least once and still avoid waking up the application
         * multiple times as long as the event is active.
         */
        if (kn->kn_fflags != 0)
            kn->kn_hookid |= (kn->kn_fflags &
                EVFILT_SOCK_LEVEL_TRIGGER_MASK);
        /*
         * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
         * only one of them and remember the last one that was
         */
        if (kn->kn_fflags & NOTE_SUSPEND)
            kn->kn_hookid &= ~NOTE_RESUME;
        if (kn->kn_fflags & NOTE_RESUME)
            kn->kn_hookid &= ~NOTE_SUSPEND;

get_sockev_state(struct socket *so, u_int32_t *statep)
    u_int32_t state = *(statep);
    if (so->so_state & SS_ISCONNECTED)
        state |= SOCKEV_CONNECTED;
        state &= ~(SOCKEV_CONNECTED);
    state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
#define SO_LOCK_HISTORY_STR_LEN \
    (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

__private_extern__ const char *
solockhistory_nr(struct socket *so)
    static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
    bzero(lock_history_str, sizeof (lock_history_str));
    for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
        n += snprintf(lock_history_str + n,
            SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
            so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
            so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
    return (lock_history_str);

socket_lock(struct socket *so, int refcount)
    lr_saved = __builtin_return_address(0);
    if (so->so_proto->pr_lock) {
        error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
#ifdef MORE_LOCKING_DEBUG
        lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
            LCK_MTX_ASSERT_NOTOWNED);
        lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
        so->lock_lr[so->next_lock_lr] = lr_saved;
        so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;

socket_unlock(struct socket *so, int refcount)
    lck_mtx_t *mutex_held;
    lr_saved = __builtin_return_address(0);
    if (so->so_proto == NULL) {
        panic("%s: null so_proto so=%p\n", __func__, so);
    if (so && so->so_proto->pr_unlock) {
        error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
        mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
        lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
        so->unlock_lr[so->next_unlock_lr] = lr_saved;
        so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
        if (so->so_usecount <= 0) {
            panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
                "lrh=%s", __func__, so->so_usecount, so,
                SOCK_DOM(so), so->so_type,
                SOCK_PROTO(so), solockhistory_nr(so));
        if (so->so_usecount == 0)
            sofreelastref(so, 1);
        lck_mtx_unlock(mutex_held);

/* Called with socket locked, will unlock socket */
sofree(struct socket *so)
    lck_mtx_t *mutex_held;
    if (so->so_proto->pr_getlock != NULL)
        mutex_held = (*so->so_proto->pr_getlock)(so, 0);
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
    sofreelastref(so, 0);

soreference(struct socket *so)
    socket_lock(so, 1);    /* locks & take one reference on socket */
    socket_unlock(so, 0);    /* unlock only */

sodereference(struct socket *so)
    socket_unlock(so, 1);
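/*
 * Minimal sketch of the lock/refcount convention used above (illustrative
 * only): take a use count while locking and drop it when unlocking, so the
 * socket cannot be freed out from under the caller. This mirrors
 * soreference()/sodereference().
 */
#if 0
static void
example_with_socket_held(struct socket *so)
{
    socket_lock(so, 1);    /* lock and take a use count */
    /* ... operate on the socket under its protocol lock ... */
    socket_unlock(so, 1);  /* unlock and drop the use count; the last
                              reference ends up in sofreelastref() */
}
#endif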
/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters. Caller must ensure to hold
 */
somultipages(struct socket *so, boolean_t set)
        so->so_flags |= SOF_MULTIPAGES;
        so->so_flags &= ~SOF_MULTIPAGES;

soif2kcl(struct socket *so, boolean_t set)
        so->so_flags1 |= SOF1_IF_2KCL;
        so->so_flags1 &= ~SOF1_IF_2KCL;

so_isdstlocal(struct socket *so) {
    struct inpcb *inp = (struct inpcb *)so->so_pcb;
    if (SOCK_DOM(so) == PF_INET)
        return (inaddr_local(inp->inp_faddr));
    else if (SOCK_DOM(so) == PF_INET6)
        return (in6addr_local(&inp->in6p_faddr));
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
    struct sockbuf *rcv, *snd;
    int err = 0, defunct;
    defunct = (so->so_flags & SOF_DEFUNCT);
    if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
        panic("%s: SB_DROP not set", __func__);
    if (so->so_flags & SOF_NODEFUNCT) {
            SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
                "so 0x%llx [%d,%d] is not eligible for defunct "
                "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
                level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                SOCK_DOM(so), SOCK_TYPE(so), err));
        so->so_flags &= ~SOF_NODEFUNCT;
        SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
            "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
            proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so)));
    } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
        struct inpcb *inp = (struct inpcb *)so->so_pcb;
        struct ifnet *ifp = inp->inp_last_outifp;
        if (ifp && IFNET_IS_CELLULAR(ifp)) {
            OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
        } else if (so->so_flags & SOF_DELEGATED) {
            OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
        } else if (soextbkidlestat.so_xbkidle_time == 0) {
            OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
        } else if (noforce) {
            OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
            so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
            so->so_extended_bk_start = net_uptime();
            OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
            inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
            SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
                "so 0x%llx rcv hw %d cc %d\n",
                __func__, proc_selfpid(), proc_pid(p),
                level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                so->so_rcv.sb_hiwat, so->so_rcv.sb_cc));
            OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
    so->so_flags |= SOF_DEFUNCT;
    /* Prevent further data from being appended to the socket buffers */
    snd->sb_flags |= SB_DROP;
    rcv->sb_flags |= SB_DROP;
    /* Flush any existing data in the socket buffers */
    if (rcv->sb_cc != 0) {
        rcv->sb_flags &= ~SB_SEL;
        selthreadclear(&rcv->sb_sel);
    if (snd->sb_cc != 0) {
        snd->sb_flags &= ~SB_SEL;
        selthreadclear(&snd->sb_sel);
    SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
        "defunct%s\n", __func__, proc_selfpid(), proc_pid(p), level,
        (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
        defunct ? "is already" : "marked as",
        (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : ""));

sodefunct(struct proc *p, struct socket *so, int level)
    struct sockbuf *rcv, *snd;
    if (!(so->so_flags & SOF_DEFUNCT)) {
        panic("%s improperly called", __func__);
    if (so->so_state & SS_DEFUNCT)
    if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
        char s[MAX_IPv6_STR_LEN];
        char d[MAX_IPv6_STR_LEN];
        struct inpcb *inp = sotoinpcb(so);
        SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
            "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
            "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
            proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
            inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
            (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
            s, sizeof (s)), ntohs(inp->in6p_lport),
            inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
            (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
            d, sizeof (d)), ntohs(inp->in6p_fport),
            (uint32_t)rcv->sb_sel.si_flags,
            (uint32_t)snd->sb_sel.si_flags,
            rcv->sb_flags, snd->sb_flags));
        SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
            "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
            "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
            proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
            (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
    /*
     * Unwedge threads blocked on sbwait() and sb_lock().
     */
    so->so_flags1 |= SOF1_DEFUNCTINPROG;
    if (rcv->sb_flags & SB_LOCK)
        sbunlock(rcv, TRUE);    /* keep socket locked */
    if (snd->sb_flags & SB_LOCK)
        sbunlock(snd, TRUE);    /* keep socket locked */
    /*
     * Flush the buffers and disconnect. We explicitly call shutdown
     * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
     * states are set for the socket. This would also flush out data
     * hanging off the receive list of this socket.
     */
    (void) soshutdownlock_final(so, SHUT_RD);
    (void) soshutdownlock_final(so, SHUT_WR);
    (void) sodisconnectlocked(so);
    /*
     * Explicitly handle connectionless-protocol disconnection
     * and release any remaining data in the socket buffers.
     */
    if (!(so->so_flags & SS_ISDISCONNECTED))
        (void) soisdisconnected(so);
    if (so->so_error == 0)
        so->so_error = EBADF;
    if (rcv->sb_cc != 0) {
        rcv->sb_flags &= ~SB_SEL;
        selthreadclear(&rcv->sb_sel);
    if (snd->sb_cc != 0) {
        snd->sb_flags &= ~SB_SEL;
        selthreadclear(&snd->sb_sel);
    so->so_state |= SS_DEFUNCT;
soresume(struct proc *p, struct socket *so, int locked)
    if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
        SODEFUNCTLOG(("%s[%d]: (target pid %d) so 0x%llx [%d,%d] "
            "resumed from bk idle\n",
            __func__, proc_selfpid(), proc_pid(p),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so)));
        so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
        so->so_extended_bk_start = 0;
        OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
        OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
        OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
        VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
        socket_unlock(so, 1);

/*
 * Does not attempt to account for sockets that are delegated from
 * the current process
 */
so_set_extended_bk_idle(struct socket *so, int optval)
    if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
        SOCK_PROTO(so) != IPPROTO_TCP) {
        OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
    } else if (optval == 0) {
        so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
        soresume(current_proc(), so, 1);
        struct proc *p = current_proc();
        struct filedesc *fdp;
        for (i = 0; i < fdp->fd_nfiles; i++) {
            struct fileproc *fp = fdp->fd_ofiles[i];
                (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
                FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
            so2 = (struct socket *)fp->f_fglob->fg_data;
                so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
            if (count >= soextbkidlestat.so_xbkidle_maxperproc)
        if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
            OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
        } else if (so->so_flags & SOF_DELEGATED) {
            OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
            so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
            OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
        SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] "
            "%s marked for extended bk idle\n",
            __func__, proc_selfpid(),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so),
            (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?

so_stop_extended_bk_idle(struct socket *so)
    so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
    so->so_extended_bk_start = 0;
    OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
    VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
    sosetdefunct(current_proc(), so,
        SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
    if (so->so_flags & SOF_DEFUNCT) {
        sodefunct(current_proc(), so,
            SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);

so_drain_extended_bk_idle(struct socket *so)
    if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
        /*
         * Only penalize sockets that have outstanding data
         */
        if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
            so_stop_extended_bk_idle(so);
            OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);

/*
 * Return value tells if socket is still in extended background idle
 */
so_check_extended_bk_idle_time(struct socket *so)
    if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
        SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d]\n",
            __func__, proc_selfpid(),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so)));
        if (net_uptime() - so->so_extended_bk_start >
            soextbkidlestat.so_xbkidle_time) {
            so_stop_extended_bk_idle(so);
            OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
            struct inpcb *inp = (struct inpcb *)so->so_pcb;
            inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
            OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);

resume_proc_sockets(proc_t p)
    if (p->p_ladvflag & P_LXBKIDLEINPROG) {
        struct filedesc *fdp;
        for (i = 0; i < fdp->fd_nfiles; i++) {
            struct fileproc *fp;
            fp = fdp->fd_ofiles[i];
                (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
                FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
            so = (struct socket *)fp->f_fglob->fg_data;
            (void) soresume(p, so, 0);
        OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
    if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
        if (SOCK_DOM(so) == PF_INET) {
            sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
            sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;

__private_extern__ int
so_get_recv_anyif(struct socket *so)
    if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
        if (SOCK_DOM(so) == PF_INET) {
            ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;

so_set_restrictions(struct socket *so, uint32_t vals)
    int nocell_old, nocell_new;
    int noexpensive_old, noexpensive_new;
    /*
     * Deny-type restrictions are trapdoors; once set they cannot be
     * unset for the lifetime of the socket.  This allows them to be
     * issued by a framework on behalf of the application without
     * having to worry that they can be undone.
     *
     * Note here that socket-level restrictions override any protocol
     * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
     * socket restriction issued on the socket has a higher precedence
     * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
     * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
     * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
     */
    nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
    noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
    so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
        SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
        SO_RESTRICT_DENY_EXPENSIVE));
    nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
    noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
    /* we can only set, not clear restrictions */
    if ((nocell_new - nocell_old) == 0 &&
        (noexpensive_new - noexpensive_old) == 0)
    if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
        if (SOCK_DOM(so) == PF_INET) {
        if (nocell_new - nocell_old != 0) {
            /*
             * if deny cellular is now set, do what's needed
             */
            inp_set_nocellular(sotoinpcb(so));
        if (noexpensive_new - noexpensive_old != 0) {
            inp_set_noexpensive(sotoinpcb(so));

so_get_restrictions(struct socket *so)
    return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
        SO_RESTRICT_DENY_OUT |
        SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
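/*
 * Sketch of the "trapdoor" semantics described above (illustrative only):
 * once a deny-type restriction is set via so_set_restrictions(), later calls
 * can add further restrictions but never remove the ones already in place.
 */
#if 0
static void
example_restrict_cellular(struct socket *so)
{
    (void) so_set_restrictions(so, SO_RESTRICT_DENY_CELLULAR);
    /*
     * Passing 0 later is a no-op; so_get_restrictions() still reports
     * SO_RESTRICT_DENY_CELLULAR for the lifetime of the socket.
     */
    (void) so_set_restrictions(so, 0);
}
#endif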
struct sockaddr_entry *
sockaddrentry_alloc(int how)
    struct sockaddr_entry *se;
    se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
        bzero(se, se_zone_size);

sockaddrentry_free(struct sockaddr_entry *se)
    if (se->se_addr != NULL) {
        FREE(se->se_addr, M_SONAME);

struct sockaddr_entry *
sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
    struct sockaddr_entry *dst_se;
    dst_se = sockaddrentry_alloc(how);
    if (dst_se != NULL) {
        int len = src_se->se_addr->sa_len;
        MALLOC(dst_se->se_addr, struct sockaddr *,
            len, M_SONAME, how | M_ZERO);
        if (dst_se->se_addr != NULL) {
            bcopy(src_se->se_addr, dst_se->se_addr, len);
            sockaddrentry_free(dst_se);

struct sockaddr_list *
sockaddrlist_alloc(int how)
    struct sockaddr_list *sl;
    sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
        bzero(sl, sl_zone_size);
        TAILQ_INIT(&sl->sl_head);

sockaddrlist_free(struct sockaddr_list *sl)
    struct sockaddr_entry *se, *tse;
    TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
        sockaddrlist_remove(sl, se);
        sockaddrentry_free(se);
    VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));

sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
    VERIFY(!(se->se_flags & SEF_ATTACHED));
    se->se_flags |= SEF_ATTACHED;
    TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
    VERIFY(sl->sl_cnt != 0);

sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
    VERIFY(se->se_flags & SEF_ATTACHED);
    se->se_flags &= ~SEF_ATTACHED;
    VERIFY(sl->sl_cnt != 0);
    TAILQ_REMOVE(&sl->sl_head, se, se_link);

struct sockaddr_list *
sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
    struct sockaddr_entry *src_se, *tse;
    struct sockaddr_list *dst_sl;
    dst_sl = sockaddrlist_alloc(how);
    TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
        struct sockaddr_entry *dst_se;
        if (src_se->se_addr == NULL)
        dst_se = sockaddrentry_dup(src_se, how);
        if (dst_se == NULL) {
            sockaddrlist_free(dst_sl);
        sockaddrlist_insert(dst_sl, dst_se);
    VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
so_set_effective_pid(struct socket *so, int epid, struct proc *p)
    struct proc *ep = PROC_NULL;
    /* pid 0 is reserved for kernel */
    /*
     * If this is an in-kernel socket, prevent its delegate
     * association from changing unless the socket option is
     * coming from within the kernel itself.
     */
    if (so->last_pid == 0 && p != kernproc) {
    /*
     * If this is issued by a process that's recorded as the
     * real owner of the socket, or if the pid is the same as
     * the process's own pid, then proceed.  Otherwise ensure
     * that the issuing process has the necessary privileges.
     */
    if (epid != so->last_pid || epid != proc_pid(p)) {
        if ((error = priv_check_cred(kauth_cred_get(),
            PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
    /* Find the process that corresponds to the effective pid */
    if ((ep = proc_find(epid)) == PROC_NULL) {
    /*
     * If a process tries to delegate the socket to itself, then
     * there's really nothing to do; treat it as a way for the
     * delegate association to be cleared.  Note that we check
     * the passed-in proc rather than calling proc_selfpid(),
     * as we need to check the process issuing the socket option
     * which could be kernproc.  Given that we don't allow 0 for
     * effective pid, it means that a delegated in-kernel socket
     * stays delegated during its lifetime (which is probably OK.)
     */
    if (epid == proc_pid(p)) {
        so->so_flags &= ~SOF_DELEGATED;
        uuid_clear(so->e_uuid);
        so->so_flags |= SOF_DELEGATED;
        so->e_upid = proc_uniqueid(ep);
        so->e_pid = proc_pid(ep);
        proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
    if (error == 0 && net_io_policy_log) {
        uuid_unparse(so->e_uuid, buf);
        log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
            "euuid %s%s\n", __func__, proc_name_address(p),
            proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so),
            so->e_pid, proc_name_address(ep), buf,
            ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
    } else if (error != 0 && net_io_policy_log) {
        log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
            "ERROR (%d)\n", __func__, proc_name_address(p),
            proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so),
            epid, (ep == PROC_NULL) ? "PROC_NULL" :
            proc_name_address(ep), error);
    /* Update this socket's policy upon success */
        so->so_policy_gencnt *= -1;
        so_update_policy(so);
        so_update_necp_policy(so, NULL, NULL);
    if (ep != PROC_NULL)

so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
    /* UUID must not be all-zeroes (reserved for kernel) */
    if (uuid_is_null(euuid)) {
    /*
     * If this is an in-kernel socket, prevent its delegate
     * association from changing unless the socket option is
     * coming from within the kernel itself.
     */
    if (so->last_pid == 0 && p != kernproc) {
    /* Get the UUID of the issuing process */
    proc_getexecutableuuid(p, uuid, sizeof (uuid));
    /*
     * If this is issued by a process that's recorded as the
     * real owner of the socket, or if the uuid is the same as
     * the process's own uuid, then proceed.  Otherwise ensure
     * that the issuing process has the necessary privileges.
     */
    if (uuid_compare(euuid, so->last_uuid) != 0 ||
        uuid_compare(euuid, uuid) != 0) {
        if ((error = priv_check_cred(kauth_cred_get(),
            PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
    /*
     * If a process tries to delegate the socket to itself, then
     * there's really nothing to do; treat it as a way for the
     * delegate association to be cleared.  Note that we check
     * the uuid of the passed-in proc rather than that of the
     * current process, as we need to check the process issuing
     * the socket option which could be kernproc itself.  Given
     * that we don't allow 0 for effective uuid, it means that
     * a delegated in-kernel socket stays delegated during its
     * lifetime (which is okay.)
     */
    if (uuid_compare(euuid, uuid) == 0) {
        so->so_flags &= ~SOF_DELEGATED;
        uuid_clear(so->e_uuid);
        so->so_flags |= SOF_DELEGATED;
        /*
         * Unlike so_set_effective_pid(), we only have the UUID
         * here and the process ID is not known.  Inherit the
         * real {pid,upid} of the socket.
         */
        so->e_upid = so->last_upid;
        so->e_pid = so->last_pid;
        uuid_copy(so->e_uuid, euuid);
    if (error == 0 && net_io_policy_log) {
        uuid_unparse(so->e_uuid, buf);
        log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
            "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
            SOCK_TYPE(so), so->e_pid, buf,
            ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
    } else if (error != 0 && net_io_policy_log) {
        uuid_unparse(euuid, buf);
        log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
            "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
            SOCK_TYPE(so), buf, error);
    /* Update this socket's policy upon success */
        so->so_policy_gencnt *= -1;
        so_update_policy(so);
        so_update_necp_policy(so, NULL, NULL);
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
    struct kev_msg ev_msg;
    /*
     * A netpolicy event always starts with a netpolicy_event_data
     * structure, but the caller can provide for a longer event
     * structure to post, depending on the event code.
     */
    VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
    bzero(&ev_msg, sizeof (ev_msg));
    ev_msg.vendor_code = KEV_VENDOR_APPLE;
    ev_msg.kev_class = KEV_NETWORK_CLASS;
    ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
    ev_msg.event_code = ev_code;
    ev_msg.dv[0].data_ptr = ev_data;
    ev_msg.dv[0].data_length = ev_datalen;
    kev_post_msg(&ev_msg);

socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
    struct kev_msg ev_msg;
    bzero(&ev_msg, sizeof (ev_msg));
    ev_msg.vendor_code = KEV_VENDOR_APPLE;
    ev_msg.kev_class = KEV_NETWORK_CLASS;
    ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
    ev_msg.event_code = ev_code;
    ev_msg.dv[0].data_ptr = ev_data;
    ev_msg.dv[0].data_length = ev_datalen;
    kev_post_msg(&ev_msg);

socket_post_kev_msg_closed(struct socket *so)
    struct kev_socket_closed ev;
    struct sockaddr *socksa = NULL, *peersa = NULL;
    bzero(&ev, sizeof (ev));
    err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
        err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
            memcpy(&ev.ev_data.kev_sockname, socksa,
                sizeof (ev.ev_data.kev_sockname)));
            memcpy(&ev.ev_data.kev_peername, peersa,
                sizeof (ev.ev_data.kev_peername)));
            socket_post_kev_msg(KEV_SOCKET_CLOSED,
                &ev.ev_data, sizeof (ev));
    FREE(socksa, M_SONAME);
    FREE(peersa, M_SONAME);