/*
 * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio_internal.h>
#include <sys/kdebug.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>

#include <security/mac.h>
#include <security/mac_framework.h>

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */
#define	ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
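/*
 * Added note: ROUNDUP() rounds "a" up to the next multiple of "b" by masking,
 * so "b" must be a power of two; e.g. ROUNDUP(10, 8) == 16 and
 * ROUNDUP(16, 8) == 16.
 */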
#if DEBUG || DEVELOPMENT
#define	DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define	DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif
/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);
extern char *proc_best_name(proc_t);
static u_int32_t	so_cache_hw;		/* High water mark for socache */
static u_int32_t	so_cache_timeouts;	/* number of timeouts */
static u_int32_t	so_cache_max_freed;	/* max freed per timeout */
static u_int32_t	cached_sock_count = 0;
STAILQ_HEAD(, socket)	so_cache_head;
int	max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t	so_cache_time;
static int		socketinit_done;
static struct zone	*so_cache_zone;

static lck_grp_t	*so_cache_mtx_grp;
static lck_attr_t	*so_cache_mtx_attr;
static lck_grp_attr_t	*so_cache_mtx_grp_attr;
static lck_mtx_t	*so_cache_mtx;
#include <machine/limits.h>
static int	filt_sorattach(struct knote *kn);
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static int	filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sorprocess(struct knote *kn, struct filt_process_s *data,
		    struct kevent_internal_s *kev);

static int	filt_sowattach(struct knote *kn);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sowprocess(struct knote *kn, struct filt_process_s *data,
		    struct kevent_internal_s *kev);

static int	filt_sockattach(struct knote *kn);
static void	filt_sockdetach(struct knote *kn);
static int	filt_sockev(struct knote *kn, long hint);
static int	filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sockprocess(struct knote *kn, struct filt_process_s *data,
		    struct kevent_internal_s *kev);

static int	sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int	sooptcopyout_timeval(struct sockopt *, const struct timeval *);
struct filterops soread_filtops = {
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

struct filterops sowrite_filtops = {
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

struct filterops sock_filtops = {
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

struct filterops soexcept_filtops = {
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};
SYSCTL_DECL(_kern_ipc);

#define	EVEN_MORE_LOCKING_DEBUG	0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");
static int socket_zone = M_SOCKET;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define	DBG_FNC_SOSEND_LIST	NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define	DBG_FNC_SORECEIVE_LIST	NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)
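/*
 * Added note: with 2 KB mbuf clusters (MCLBYTES == 2048), this limit works
 * out to 256 KiB.
 */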
int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger than 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */
extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

static unsigned int sl_zone_size;	/* size of sockaddr_list */
static struct zone *sl_zone;		/* zone for sockaddr_list */

static unsigned int se_zone_size;	/* size of sockaddr_entry */
static struct zone *se_zone;		/* zone for sockaddr_entry */

vm_size_t	so_cache_zone_element_size;
static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);
/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */
#define	SO_IDLE_BK_IDLE_MAX_PER_PROC	1
#define	SO_IDLE_BK_IDLE_TIME		600
#define	SO_IDLE_BK_IDLE_RCV_HIWAT	131072
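/*
 * Added note: the defaults above allow one such socket per process, keep it
 * for 600 seconds (10 minutes), and cap its receive high-water mark at
 * 131072 bytes (128 KiB).
 */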
struct soextbkidlestat soextbkidlestat;
SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");
int so_set_extended_bk_idle(struct socket *, int);
/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");
void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) ==
	    sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) ==
	    offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) ==
	    offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) ==
	    offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) ==
	    offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) ==
	    offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) ==
	    sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) ==
	    offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) ==
	    offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) ==
	    offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) ==
	    offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) ==
	    offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif /* __LP64__ */
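	/*
	 * Added note: the offset assertions above record that struct
	 * sa_endpoints has the same layout as its user32/user64 counterparts,
	 * so the endpoint arguments can be copied in without per-field
	 * translation.
	 */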
	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof (socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
	if (so_cache_mtx == NULL) {
		panic("%s: unable to allocate so_cache_mtx\n", __func__);
		/* NOTREACHED */
	}
	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());
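	/*
	 * Added note: each cache element is sized to hold the struct socket
	 * plus an inpcb and a TCP pcb, with 4 bytes of slack before each
	 * embedded pcb so that cached_sock_alloc() can align them with
	 * ALIGN() below.
	 */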
	so_cache_zone = zinit(so_cache_zone_element_size,
	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

	sl_zone_size = sizeof (struct sockaddr_list);
	if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
	    "sockaddr_list")) == NULL) {
		panic("%s: unable to allocate sockaddr_list zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(sl_zone, Z_CALLERACCT, FALSE);
	zone_change(sl_zone, Z_EXPAND, TRUE);

	se_zone_size = sizeof (struct sockaddr_entry);
	if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
	    "sockaddr_entry")) == NULL) {
		panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(se_zone, Z_CALLERACCT, FALSE);
	zone_change(se_zone, Z_EXPAND, TRUE);
	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	socket_tclass_init();

#if MULTIPATH
	mp_pcbinit();
#endif /* MULTIPATH */
}
static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof (struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(so_cache_mtx);

		if (waitok)
			*so = (struct socket *)zalloc(so_cache_zone);
		else
			*so = (struct socket *)zalloc_noblock(so_cache_zone);

		if (*so == NULL)
			return;

		bzero((caddr_t)*so, sizeof (struct socket));

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof (struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
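/*
 * Added note: the SOF1_CACHED_IN_SOCK_LAYER flag set above is what
 * sodealloc() later checks to decide whether the memory should be returned
 * through cached_sock_free() instead of FREE_ZONE().
 */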
static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count)
			so_cache_hw = cached_sock_count;

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(so_cache_mtx);
	}
}
void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that
		 */
		if (self == PROC_NULL)
			self = current_proc();

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof (so->last_uuid));
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}
void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
		(void) inp_update_policy(sotoinpcb(so));
}
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
}
static boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT)
			break;

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head))
		rc = TRUE;

	lck_mtx_unlock(so_cache_mtx);

	return (rc);
}
/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
static struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
		    M_WAITOK);
		if (so != NULL)
			bzero(so, sizeof (*so));
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
		so->so_zone = socket_zone;
#if CONFIG_MACF_SOCKET
		/* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return (NULL);
		}
#endif /* MAC_SOCKET */
	}

	return (so);
}
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	if (proto != 0)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL)
				return (EPROTOTYPE);
		}
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(1, dom, type);
	if (so == NULL)
		return (ENOBUFS);

	if (flags & SOCF_ASYNC)
		so->so_state |= SS_NBIO;
#if MULTIPATH
	if (flags & SOCF_MP_SUBFLOW) {
		/*
		 * A multipath subflow socket is used internally in the kernel,
		 * therefore it does not have a file descriptor associated by
		 * default.
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_MP_SUBFLOW;
	}
#endif /* MULTIPATH */

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);

	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL))
		so->so_state |= SS_PRIV;

	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully
		 */
		so->so_state |= SS_NOFDREF;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return (error);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2)
		so->so_options |= SO_DEBUG;
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (proc_get_effective_thread_policy(current_thread(),
	    TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain, system or multipath sockets as
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
	case PF_MULTIPATH:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * subsystem.
	 */

	*aso = so;

	return (0);
}
/*
 * Returns:	0			Success
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
	    PROC_NULL));
}
int
socreate_delegate(int dom, struct socket **aso, int type, int proto,
    pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK
	 */
done:
	if (ep != PROC_NULL)
		proc_rele(ep);

	return (error);
}
/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock)
		socket_lock(so, 1);
	VERIFY(so->so_usecount > 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
out:
	if (dolock)
		socket_unlock(so, 1);

	if (error == EJUSTRETURN)
		error = 0;

	return (error);
}
static void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

#if CONTENT_FILTER
	cfil_sock_detach(so);
#endif /* CONTENT_FILTER */

	/* Delete the state allocated for msg queues on a socket */
	if (so->so_flags & SOF_ENABLE_MSGS) {
		FREE(so->so_msg_state, M_TEMP);
		so->so_msg_state = NULL;
	}
	VERIFY(so->so_msg_state == NULL);

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

#if CONFIG_MACF_SOCKET
	mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		FREE_ZONE(so, sizeof (*so), so->so_zone);
	}
}
/*
 * Returns:	0			Success
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);

	if (error) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn)
		backlog = somaxconn;
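	/*
	 * Illustrative note (not in the original): with the default
	 * kern.ipc.somaxconn of 128, both listen(fd, -1) and
	 * listen(fd, 10000) end up with a queue limit of 128 here.
	 */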
	so->so_qlimit = backlog;
out:
	socket_unlock(so, 1);
	return (error);
}
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL)
			socket_lock(head, 1);

		if (so->so_state & SS_INCOMP) {
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
			so->so_event = sonullevent;
			if (head->so_proto->pr_getlock != NULL)
				socket_unlock(head, 1);
			return;
		} else {
			panic("sofree: not queued");
		}
		if (head->so_proto->pr_getlock != NULL)
			socket_unlock(head, 1);
	}
#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc)
		sodealloc(so);
}
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
		return;
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	lck_mtx_t *mutex_held;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0\n", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount)
		soclose_wait_locked(so);

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp;

		/*
		 * We do not want new connection to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		while ((sp = TAILQ_FIRST(&so->so_incomp)) != NULL) {
			/*
			 * skip sockets thrown away by tcpdropdropblreq;
			 * they will get cleaned up by the garbage collection.
			 * otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW)
				continue;

			if (so->so_proto->pr_getlock != NULL) {
				/*
				 * Lock ordering for consistency with the
				 * rest of the stack, we lock the socket
				 * first and then grab the head.
				 */
				socket_unlock(so, 0);
				socket_lock(sp, 1);
				socket_lock(so, 0);
			}
			/*
			 * The extra reference for the list insure the
			 * validity of the socket pointer when we perform the
			 * unlock of the head above
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			}
			socket_unlock(sp, 1);
		}

		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (so->so_proto->pr_getlock != NULL) {
				/*
				 * Lock ordering for consistency with the
				 * rest of the stack, we lock the socket
				 * first and then grab the head.
				 */
				socket_unlock(so, 0);
				socket_lock(sp, 1);
				socket_lock(so, 0);
			}
			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);
			}
			socket_unlock(sp, 1);
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			if (so->so_proto->pr_getlock != NULL)
				mutex_held = (*so->so_proto->pr_getlock)(so, 0);
			else
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			while (so->so_state & SS_ISCONNECTED) {
				ts.tv_sec = (so->so_linger/100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the time fires,
					 * don't report an error
					 */
					if (error == EWOULDBLOCK)
						error = 0;
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if (so->so_flags & SOF_MP_SUBFLOW)
		so->so_flags &= ~SOF_MP_SUBFLOW;

	if ((so->so_flags & SOF_KNOTE) != 0)
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);

	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);

	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return (error);
}
int
soclose(struct socket *so)
{
	int error = 0;

	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * if the FD is going away, but socket is
		 * retained in kernel remove its reference
		 */
		so->so_usecount--;
		if (so->so_usecount < 2)
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
	}
	socket_unlock(so, 1);
	return (error);
}
/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error = 0;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	}

	return (error);
}
int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock)
		socket_lock(so, 1);

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock)
		socket_unlock(so, 1);
	return (error);
}
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return (soacceptlock(so, nam, 1));
}
int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s). For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_COMP;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);

	return (error);
}
/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();

	if (dolock)
		socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock)
			socket_unlock(so, 1);
		return (error);
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock)
			socket_unlock(so, 1);
		return (EPERM);
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			if (error == EJUSTRETURN)
				error = 0;
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
		}
	}
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return (soconnectlock(so, nam, 1));
}
/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_lock(so2, 1);

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_unlock(so2, 1);
	return (error);
}
int
soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return (error);
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
		return (EPERM);

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set.  Otherwise, if connected,
	 * try to disconnect first.  This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectxout(so, dst_sl);
		if (error != 0) {
			/* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			if (error == EJUSTRETURN)
				error = 0;
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src_sl, dst_sl, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
		}
	}

	return (error);
}
int
sodisconnectlocked(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}

	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	if (error == 0)
		sflt_notify(so, sock_evt_disconnected, NULL);

bad:
	return (error);
}
/* Locking version */
int
sodisconnect(struct socket *so)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectlocked(so);
	socket_unlock(so, 1);
	return (error);
}
int
sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
{
	int error;

	/*
	 * Call the protocol disconnectx handler; let it handle all
	 * matters related to the connection state of this session.
	 */
	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
	if (error == 0) {
		/*
		 * The event applies only for the session, not for
		 * the disconnection of individual subflows.
		 */
		if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
			sflt_notify(so, sock_evt_disconnected, NULL);
	}
	return (error);
}
int
sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectxlocked(so, aid, cid);
	socket_unlock(so, 1);
	return (error);
}
int
sopeelofflocked(struct socket *so, sae_associd_t aid, struct socket **psop)
{
	return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
}
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
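/*
 * Added note: SBLOCKWAIT() turns MSG_DONTWAIT into a non-waiting sblock()
 * request, so a non-blocking sender can get EWOULDBLOCK instead of sleeping
 * for the send buffer lock.
 */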
/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * Returns:	0			Success
 *	sblock:EWOULDBLOCK
 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked,
    struct mbuf *control)
{
	int error = 0;
	int32_t space;
	int assumelock = 0;

restart:
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue. Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				if (so->so_flags & SOF_DEFUNCT)
					goto defunct;
				return (error);
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		error = EPIPE;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		return (error);
	}

	if (so->so_state & SS_CANTSENDMORE) {
#if CONTENT_FILTER
		/*
		 * Can re-inject data of half closed connections
		 */
		if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
		    so->so_snd.sb_cfil_thread == current_thread() &&
		    cfil_sock_data_pending(&so->so_snd) != 0)
			CFIL_LOG(LOG_INFO,
			    "so %llx ignore SS_CANTSENDMORE",
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
		else
#endif /* CONTENT_FILTER */
			return (EPIPE);
	}
	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return (error);
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			if (((so->so_state & SS_ISCONFIRMING) == 0) &&
			    (resid != 0 || clen == 0) &&
			    !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
				/*
				 * MPTCP Fast Join sends data before the
				 * socket is truly connected.
				 */
				if ((so->so_flags & (SOF_MP_SUBFLOW |
				    SOF_MPTCP_FASTJOIN)) !=
				    (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
					return (ENOTCONN);
			}
		} else if (addr == 0 && !(flags&MSG_HOLD)) {
			return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			    ENOTCONN : EDESTADDRREQ);
		}
	}

	if (so->so_flags & SOF_ENABLE_MSGS)
		space = msgq_sbspace(so, control);
	else
		space = sbspace(&so->so_snd);

	if (flags & MSG_OOB)
		space += 1024;
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat)
		return (EMSGSIZE);

	if ((space < resid + clen &&
	    (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
	    space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		/*
		 * don't block the connectx call when there's more data
		 * than can be copied.
		 */
		if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
			if (space == 0) {
				return (EWOULDBLOCK);
			}
			if (space < (int32_t)so->so_snd.sb_lowat) {
				return (EWOULDBLOCK);
			}
		}
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return (EWOULDBLOCK);
		}
		sbunlock(&so->so_snd, TRUE);	/* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT)
				goto defunct;
			return (error);
		}
		goto restart;
	}
	return (0);
}
/*
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not). Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 *
 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
 *	point at the mbuf chain being constructed and go from there.
 *
 * Returns:	0			Success
 *	sosendcheck:EWOULDBLOCK
 *	sosendcheck:???			[value from so_error]
 *	<pru_send>:ECONNRESET[TCP]
 *	<pru_send>:EINVAL[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:EADDRINUSE[TCP]
 *	<pru_send>:EADDRNOTAVAIL[TCP]
 *	<pru_send>:EAFNOSUPPORT[TCP]
 *	<pru_send>:EACCES[TCP]
 *	<pru_send>:EAGAIN[TCP]
 *	<pru_send>:EPERM[TCP]
 *	<pru_send>:EMSGSIZE[TCP]
 *	<pru_send>:EHOSTUNREACH[TCP]
 *	<pru_send>:ENETUNREACH[TCP]
 *	<pru_send>:ENETDOWN[TCP]
 *	<pru_send>:ENOMEM[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_send>:EINVAL[AF_UNIX]
 *	<pru_send>:EOPNOTSUPP[AF_UNIX]
 *	<pru_send>:EPIPE[AF_UNIX]
 *	<pru_send>:ENOTCONN[AF_UNIX]
 *	<pru_send>:EISCONN[AF_UNIX]
 *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
 *	<sf_data_out>:???		[whatever a filter author chooses]
 *
 * Notes:	Other <pru_send> returns depend on the protocol family; all
 *		<sf_data_out> returns depend on what the filter author causes
 *		their filter to return.
 */
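/*
 * Illustrative sketch (added, not part of the original file): one way a
 * kernel client could hand a plain buffer to sosend() on an already
 * connected socket.  The helper name, the UIO_SYSSPACE space type and the
 * error handling are assumptions for the example only; in-kernel callers
 * normally go through the socket KPI instead.
 */
#if 0
static int
example_sosend_buf(struct socket *so, void *buf, size_t len)
{
	uio_t auio;
	int error;

	/* one iovec, kernel address space, write direction */
	auio = uio_create(1, 0, UIO_SYSSPACE, UIO_WRITE);
	if (auio == NULL)
		return (ENOMEM);
	uio_addiov(auio, CAST_USER_ADDR_T(buf), len);

	/* no destination address, no preformed mbuf chain, no control, no flags */
	error = sosend(so, NULL, auio, NULL, NULL, 0);

	uio_free(auio);
	return (error);
}
#endif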
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct mbuf *m, *freelist = NULL;
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, mlen, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	struct mbuf *control_copy = NULL;
	uint16_t headroom = 0;
	boolean_t en_tracing = FALSE;

	if (uio != NULL)
		resid = uio_resid(uio);
	else
		resid = top->m_pkthdr.len;
	orig_resid = resid;

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
	socket_lock(so, 1);

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
		}
	}
	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		socket_unlock(so, 1);
		goto out;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 * But it will be used by sockets doing message delivery.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
	    !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
		error = EINVAL;
		socket_unlock(so, 1);
		goto out;
	}
= (flags
& MSG_DONTROUTE
) &&
2019 (so
->so_options
& SO_DONTROUTE
) == 0 &&
2020 (so
->so_proto
->pr_flags
& PR_ATOMIC
);
2021 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgsnd
);
2023 if (control
!= NULL
)
2024 clen
= control
->m_len
;
2026 if (soreserveheadroom
!= 0)
2027 headroom
= so
->so_pktheadroom
;
2030 error
= sosendcheck(so
, addr
, resid
, clen
, atomic
, flags
,
2031 &sblocked
, control
);
		if (so->so_flags & SOF_ENABLE_MSGS)
			space = msgq_sbspace(so, control);
		else
			space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		if (top != NULL) {
			/*
			 * Data is prepackaged in "top".
			 */
			if (flags & MSG_EOR)
				top->m_flags |= M_EOR;
		} else {
			int chainlength;
			int bytes_to_copy;
			boolean_t jumbocl;
			boolean_t bigcl;
			int bytes_to_alloc;
			bytes_to_copy = imin(resid, space);

			bytes_to_alloc = bytes_to_copy;
			if (soreserveheadroom != 0)
				bytes_to_alloc += headroom;

			if (sosendminchain > 0)
				chainlength = 0;
			else
				chainlength = sosendmaxchain;

			/*
			 * Use big 4 KB cluster when the outgoing interface
			 * does not prefer 2 KB clusters
			 */
			bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
			    sosendbigcl_ignore_capab;

			/*
			 * Attempt to use larger than system page-size
			 * clusters for large writes only if there is
			 * a jumbo cluster pool and if the socket is
			 * marked accordingly.
			 */
			jumbocl = sosendjcl && njcl > 0 &&
			    ((so->so_flags & SOF_MULTIPAGES) ||
			    sosendjcl_ignore_capab) &&
			    bigcl;

			socket_unlock(so, 0);

			do {
				int num_needed;
				int hdrs_needed = (top == NULL) ? 1 : 0;
				/*
				 * try to maintain a local cache of mbuf
				 * clusters needed to complete this
				 * write the list is further limited to
				 * the number that are currently needed
				 * to fill the socket this mechanism
				 * allows a large number of mbufs/
				 * clusters to be grabbed under a single
				 * mbuf lock... if we can't get any
				 * clusters, then fall back to trying
				 * for mbufs if we fail early (or
				 * miscalculate the number needed) make
				 * sure to release any clusters we
				 * haven't yet consumed.
				 */
				if (freelist == NULL &&
				    bytes_to_alloc > MBIGCLBYTES &&
				    jumbocl) {
					num_needed =
					    bytes_to_alloc / M16KCLBYTES;

					if ((bytes_to_alloc -
					    (num_needed * M16KCLBYTES))
					    >= MINCLSIZE)
						num_needed++;

					freelist =
					    m_getpackets_internal(
					    (unsigned int *)&num_needed,
					    hdrs_needed, M_WAIT, 0,
					    M16KCLBYTES);
					/*
					 * Fall back to 4K cluster size
					 * if allocation failed
					 */
				}

				if (freelist == NULL &&
				    bytes_to_alloc > MCLBYTES &&
				    bigcl) {
					num_needed =
					    bytes_to_alloc / MBIGCLBYTES;

					if ((bytes_to_alloc -
					    (num_needed * MBIGCLBYTES)) >=
					    MINCLSIZE)
						num_needed++;

					freelist =
					    m_getpackets_internal(
					    (unsigned int *)&num_needed,
					    hdrs_needed, M_WAIT, 0,
					    MBIGCLBYTES);
					/*
					 * Fall back to cluster size
					 * if allocation failed
					 */
				}
				/*
				 * Allocate a cluster as we want to
				 * avoid splitting the data into more
				 * than one segment; using MINCLSIZE
				 * would lead us to allocate two mbufs
				 */
				if (soreserveheadroom != 0 &&
				    freelist == NULL &&
				    ((top == NULL &&
				    bytes_to_alloc > _MHLEN) ||
				    bytes_to_alloc > _MLEN)) {
					num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
					    MCLBYTES;
					freelist =
					    m_getpackets_internal(
					    (unsigned int *)&num_needed,
					    hdrs_needed, M_WAIT, 0,
					    MCLBYTES);
					/*
					 * Fall back to a single mbuf
					 * if allocation failed
					 */
				} else if (freelist == NULL &&
				    bytes_to_alloc > MINCLSIZE) {
					num_needed =
					    bytes_to_alloc / MCLBYTES;

					if ((bytes_to_alloc -
					    (num_needed * MCLBYTES)) >=
					    MINCLSIZE)
						num_needed++;

					freelist =
					    m_getpackets_internal(
					    (unsigned int *)&num_needed,
					    hdrs_needed, M_WAIT, 0,
					    MCLBYTES);
					/*
					 * Fall back to a single mbuf
					 * if allocation failed
					 */
				}
				/*
				 * For datagram protocols, leave
				 * headroom for protocol headers
				 * in the first cluster of the chain
				 */
				if (freelist != NULL && atomic &&
				    top == NULL && headroom > 0) {
					freelist->m_data += headroom;
				}

				/*
				 * Fall back to regular mbufs without
				 * reserving the socket headroom
				 */
				if (freelist == NULL) {
					if (top == NULL)
						MGETHDR(freelist,
						    M_WAIT, MT_DATA);
					else
						MGET(freelist,
						    M_WAIT, MT_DATA);

					if (freelist == NULL) {
						error = ENOBUFS;
						socket_lock(so, 0);
						goto release;
					}
					/*
					 * For datagram protocols, leave
					 * room for protocol headers
					 * in first mbuf.
					 */
					if (atomic && top == NULL &&
					    bytes_to_copy < MHLEN) {
						MH_ALIGN(freelist,
						    bytes_to_copy);
					}
				}
				m = freelist;
				freelist = m->m_next;
2235 if ((m
->m_flags
& M_EXT
))
2236 mlen
= m
->m_ext
.ext_size
-
2238 else if ((m
->m_flags
& M_PKTHDR
))
2240 MHLEN
- m_leadingspace(m
);
2242 mlen
= MLEN
- m_leadingspace(m
);
2243 len
= imin(mlen
, bytes_to_copy
);
2249 error
= uiomove(mtod(m
, caddr_t
),
2252 resid
= uio_resid(uio
);
2256 top
->m_pkthdr
.len
+= len
;
2261 if (flags
& MSG_EOR
)
2262 top
->m_flags
|= M_EOR
;
2265 bytes_to_copy
= min(resid
, space
);
2267 } while (space
> 0 &&
2268 (chainlength
< sosendmaxchain
|| atomic
||
2269 resid
< MINCLSIZE
));
2277 if (flags
& (MSG_HOLD
|MSG_SEND
)) {
2278 /* Enqueue for later, go away if HOLD */
2280 if (so
->so_temp
&& (flags
& MSG_FLUSH
)) {
2281 m_freem(so
->so_temp
);
2285 so
->so_tail
->m_next
= top
;
2292 if (flags
& MSG_HOLD
) {
2299 so
->so_options
|= SO_DONTROUTE
;
2302 * Compute flags here, for pru_send and NKEs
2304 * If the user set MSG_EOF, the protocol
2305 * understands this flag and nothing left to
2306 * send then use PRU_SEND_EOF instead of PRU_SEND.
2308 sendflags
= (flags
& MSG_OOB
) ? PRUS_OOB
:
2309 ((flags
& MSG_EOF
) &&
2310 (so
->so_proto
->pr_flags
& PR_IMPLOPCL
) &&
2311 (resid
<= 0)) ? PRUS_EOF
:
2312 /* If there is more to send set PRUS_MORETOCOME */
2313 (resid
> 0 && space
> 0) ? PRUS_MORETOCOME
: 0;
2315 if ((flags
& MSG_SKIPCFIL
) == 0) {
2317 * Socket filter processing
2319 error
= sflt_data_out(so
, addr
, &top
,
2320 &control
, (sendflags
& MSG_OOB
) ?
2321 sock_data_filt_flag_oob
: 0);
2323 if (error
== EJUSTRETURN
) {
2333 * Content filter processing
2335 error
= cfil_sock_data_out(so
, addr
, top
,
2336 control
, (sendflags
& MSG_OOB
) ?
2337 sock_data_filt_flag_oob
: 0);
2339 if (error
== EJUSTRETURN
) {
2347 #endif /* CONTENT_FILTER */
2349 if (so
->so_flags
& SOF_ENABLE_MSGS
) {
2351 * Make a copy of control mbuf,
2352 * so that msg priority can be
2353 * passed to subsequent mbufs.
2355 control_copy
= m_dup(control
, M_NOWAIT
);
2357 error
= (*so
->so_proto
->pr_usrreqs
->pru_send
)
2358 (so
, sendflags
, top
, addr
, control
, p
);
2360 if (flags
& MSG_SEND
)
2364 so
->so_options
&= ~SO_DONTROUTE
;
2367 control
= control_copy
;
2368 control_copy
= NULL
;
2373 } while (resid
&& space
> 0);
2378 sbunlock(&so
->so_snd
, FALSE
); /* will unlock socket */
2380 socket_unlock(so
, 1);
2384 if (control
!= NULL
)
2386 if (freelist
!= NULL
)
2387 m_freem_list(freelist
);
2388 if (control_copy
!= NULL
)
2389 m_freem(control_copy
);
2392 * One write has been done. This was enough. Get back to "normal"
2395 if (so
->so_flags1
& SOF1_PRECONNECT_DATA
)
2396 so
->so_flags1
&= ~SOF1_PRECONNECT_DATA
;
2399 /* resid passed here is the bytes left in uio */
2400 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite
, DBG_FUNC_END
,
2401 VM_KERNEL_ADDRPERM(so
),
2402 ((error
== EWOULDBLOCK
) ? kEnTrFlagNoWork
: 0),
2403 (int64_t)(orig_resid
- resid
));
2405 KERNEL_DEBUG(DBG_FNC_SOSEND
| DBG_FUNC_END
, so
, resid
,
2406 so
->so_snd
.sb_cc
, space
, error
);
2412 * Supported only connected sockets (no address) without ancillary data
2413 * (control mbuf) for atomic protocols
2416 sosend_list(struct socket
*so
, struct uio
**uioarray
, u_int uiocnt
, int flags
)
2418 struct mbuf
*m
, *freelist
= NULL
;
2419 user_ssize_t len
, resid
;
2420 int error
, dontroute
, mlen
;
2421 int atomic
= sosendallatonce(so
);
2423 struct proc
*p
= current_proc();
2426 struct mbuf
*top
= NULL
;
2427 uint16_t headroom
= 0;
2430 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST
| DBG_FUNC_START
), so
, uiocnt
,
2431 so
->so_snd
.sb_cc
, so
->so_snd
.sb_lowat
, so
->so_snd
.sb_hiwat
);
2433 if (so
->so_type
!= SOCK_DGRAM
) {
2441 if (so
->so_proto
->pr_usrreqs
->pru_send_list
== NULL
) {
2442 error
= EPROTONOSUPPORT
;
2445 if (flags
& ~(MSG_DONTWAIT
| MSG_NBIO
)) {
2449 resid
= uio_array_resid(uioarray
, uiocnt
);
2452 * In theory resid should be unsigned.
2453 * However, space must be signed, as it might be less than 0
2454 * if we over-committed, and we must use a signed comparison
2455 * of space and resid. On the other hand, a negative resid
2456 * causes us to loop sending 0-length segments to the protocol.
2458 * Note: We limit resid to be a positive int value as we use
2459 * imin() to set bytes_to_copy -- radr://14558484
2461 if (resid
< 0 || resid
> INT_MAX
) {
2467 so_update_last_owner_locked(so
, p
);
2468 so_update_policy(so
);
2471 so_update_necp_policy(so
, NULL
, NULL
);
2474 dontroute
= (flags
& MSG_DONTROUTE
) &&
2475 (so
->so_options
& SO_DONTROUTE
) == 0 &&
2476 (so
->so_proto
->pr_flags
& PR_ATOMIC
);
2477 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgsnd
);
2479 error
= sosendcheck(so
, NULL
, resid
, 0, atomic
, flags
,
2485 * Use big 4 KB clusters when the outgoing interface does not prefer
2488 bigcl
= !(so
->so_flags1
& SOF1_IF_2KCL
) || sosendbigcl_ignore_capab
;
2490 if (soreserveheadroom
!= 0)
2491 headroom
= so
->so_pktheadroom
;
2497 size_t maxpktlen
= 0;
2500 if (sosendminchain
> 0)
2503 chainlength
= sosendmaxchain
;
2505 socket_unlock(so
, 0);
2508 * Find a set of uio that fit in a reasonable number
2511 for (i
= uiofirst
; i
< uiocnt
; i
++) {
2512 struct uio
*auio
= uioarray
[i
];
2514 len
= uio_resid(auio
);
2516 /* Do nothing for empty messages */
2523 if (len
> maxpktlen
)
2527 if (chainlength
> sosendmaxchain
)
2531 * Nothing left to send
2533 if (num_needed
== 0) {
2538 * Allocate buffer large enough to include headroom space for
2539 * network and link header
2542 bytes_to_alloc
= maxpktlen
+ headroom
;
2545 * Allocate a single contiguous buffer of the smallest available
2546 * size when possible
2548 if (bytes_to_alloc
> MCLBYTES
&&
2549 bytes_to_alloc
<= MBIGCLBYTES
&& bigcl
) {
2550 freelist
= m_getpackets_internal(
2551 (unsigned int *)&num_needed
,
2552 num_needed
, M_WAIT
, 1,
2554 } else if (bytes_to_alloc
> _MHLEN
&&
2555 bytes_to_alloc
<= MCLBYTES
) {
2556 freelist
= m_getpackets_internal(
2557 (unsigned int *)&num_needed
,
2558 num_needed
, M_WAIT
, 1,
2561 freelist
= m_allocpacket_internal(
2562 (unsigned int *)&num_needed
,
2563 bytes_to_alloc
, NULL
, M_WAIT
, 1, 0);
2566 if (freelist
== NULL
) {
2572 * Copy each uio of the set into its own mbuf packet
2574 for (i
= uiofirst
, m
= freelist
;
2575 i
< uiolast
&& m
!= NULL
;
2579 struct uio
*auio
= uioarray
[i
];
2581 bytes_to_copy
= uio_resid(auio
);
2583 /* Do nothing for empty messages */
2584 if (bytes_to_copy
== 0)
2587 * Leave headroom for protocol headers
2588 * in the first mbuf of the chain
2590 m
->m_data
+= headroom
;
2592 for (n
= m
; n
!= NULL
; n
= n
->m_next
) {
2593 if ((m
->m_flags
& M_EXT
))
2594 mlen
= m
->m_ext
.ext_size
-
2596 else if ((m
->m_flags
& M_PKTHDR
))
2598 MHLEN
- m_leadingspace(m
);
2600 mlen
= MLEN
- m_leadingspace(m
);
2601 len
= imin(mlen
, bytes_to_copy
);
2604 * Note: uiomove() decrements the iovec
2607 error
= uiomove(mtod(n
, caddr_t
),
2612 m
->m_pkthdr
.len
+= len
;
2614 VERIFY(m
->m_pkthdr
.len
<= maxpktlen
);
2616 bytes_to_copy
-= len
;
2619 if (m
->m_pkthdr
.len
== 0) {
2621 "%s:%d so %llx pkt %llx type %u len null\n",
2623 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
2624 (uint64_t)DEBUG_KERNEL_ADDRPERM(m
),
2640 so
->so_options
|= SO_DONTROUTE
;
2642 if ((flags
& MSG_SKIPCFIL
) == 0) {
2643 struct mbuf
**prevnextp
= NULL
;
2645 for (i
= uiofirst
, m
= top
;
2646 i
< uiolast
&& m
!= NULL
;
2648 struct mbuf
*nextpkt
= m
->m_nextpkt
;
2651 * Socket filter processing
2653 error
= sflt_data_out(so
, NULL
, &m
,
2655 if (error
!= 0 && error
!= EJUSTRETURN
)
2661 * Content filter processing
2663 error
= cfil_sock_data_out(so
, NULL
, m
,
2665 if (error
!= 0 && error
!= EJUSTRETURN
)
2668 #endif /* CONTENT_FILTER */
2670 * Remove packet from the list when
2671 * swallowed by a filter
2673 if (error
== EJUSTRETURN
) {
2675 if (prevnextp
!= NULL
)
2676 *prevnextp
= nextpkt
;
2683 prevnextp
= &m
->m_nextpkt
;
2687 error
= (*so
->so_proto
->pr_usrreqs
->pru_send_list
)
2688 (so
, 0, top
, NULL
, NULL
, p
);
2691 so
->so_options
&= ~SO_DONTROUTE
;
2695 } while (resid
> 0 && error
== 0);
2698 sbunlock(&so
->so_snd
, FALSE
); /* will unlock socket */
2700 socket_unlock(so
, 1);
2704 if (freelist
!= NULL
)
2705 m_freem_list(freelist
);
2707 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST
| DBG_FUNC_END
, so
, resid
,
2708 so
->so_snd
.sb_cc
, 0, error
);
2714 * May return ERESTART when packet is dropped by MAC policy check
2717 soreceive_addr(struct proc
*p
, struct socket
*so
, struct sockaddr
**psa
,
2718 int flags
, struct mbuf
**mp
, struct mbuf
**nextrecordp
, int canwait
)
2721 struct mbuf
*m
= *mp
;
2722 struct mbuf
*nextrecord
= *nextrecordp
;
2724 KASSERT(m
->m_type
== MT_SONAME
, ("receive 1a"));
2725 #if CONFIG_MACF_SOCKET_SUBSET
2727 * Call the MAC framework for policy checking if we're in
2728 * the user process context and the socket isn't connected.
2730 if (p
!= kernproc
&& !(so
->so_state
& SS_ISCONNECTED
)) {
2731 struct mbuf
*m0
= m
;
2733 * Dequeue this record (temporarily) from the receive
2734 * list since we're about to drop the socket's lock
2735 * where a new record may arrive and be appended to
2736 * the list. Upon MAC policy failure, the record
2737 * will be freed. Otherwise, we'll add it back to
2738 * the head of the list. We cannot rely on SB_LOCK
2739 * because append operation uses the socket's lock.
2742 m
->m_nextpkt
= NULL
;
2743 sbfree(&so
->so_rcv
, m
);
2745 } while (m
!= NULL
);
2747 so
->so_rcv
.sb_mb
= nextrecord
;
2748 SB_EMPTY_FIXUP(&so
->so_rcv
);
2749 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1a");
2750 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1a");
2751 socket_unlock(so
, 0);
2753 if (mac_socket_check_received(proc_ucred(p
), so
,
2754 mtod(m
, struct sockaddr
*)) != 0) {
2756 * MAC policy failure; free this record and
2757 * process the next record (or block until
2758 * one is available). We have adjusted sb_cc
2759 * and sb_mbcnt above so there is no need to
2760 * call sbfree() again.
2764 * Clear SB_LOCK but don't unlock the socket.
2765 * Process the next record or wait for one.
2768 sbunlock(&so
->so_rcv
, TRUE
); /* stay locked */
2774 * If the socket has been defunct'd, drop it.
2776 if (so
->so_flags
& SOF_DEFUNCT
) {
2782 * Re-adjust the socket receive list and re-enqueue
2783 * the record in front of any packets which may have
2784 * been appended while we dropped the lock.
2786 for (m
= m0
; m
->m_next
!= NULL
; m
= m
->m_next
)
2787 sballoc(&so
->so_rcv
, m
);
2788 sballoc(&so
->so_rcv
, m
);
2789 if (so
->so_rcv
.sb_mb
== NULL
) {
2790 so
->so_rcv
.sb_lastrecord
= m0
;
2791 so
->so_rcv
.sb_mbtail
= m
;
2794 nextrecord
= m
->m_nextpkt
= so
->so_rcv
.sb_mb
;
2795 so
->so_rcv
.sb_mb
= m
;
2796 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1b");
2797 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1b");
2799 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2801 *psa
= dup_sockaddr(mtod(m
, struct sockaddr
*), canwait
);
2802 if ((*psa
== NULL
) && (flags
& MSG_NEEDSA
)) {
2803 error
= EWOULDBLOCK
;
2807 if (flags
& MSG_PEEK
) {
2810 sbfree(&so
->so_rcv
, m
);
2811 if (m
->m_next
== NULL
&& so
->so_rcv
.sb_cc
!= 0) {
2812 panic("%s: about to create invalid socketbuf",
2816 MFREE(m
, so
->so_rcv
.sb_mb
);
2817 m
= so
->so_rcv
.sb_mb
;
2819 m
->m_nextpkt
= nextrecord
;
2821 so
->so_rcv
.sb_mb
= nextrecord
;
2822 SB_EMPTY_FIXUP(&so
->so_rcv
);
2827 *nextrecordp
= nextrecord
;
2833 * Process one or more MT_CONTROL mbufs present before any data mbufs
2834 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2835 * just copy the data; if !MSG_PEEK, we call into the protocol to
2836 * perform externalization.
2839 soreceive_ctl(struct socket
*so
, struct mbuf
**controlp
, int flags
,
2840 struct mbuf
**mp
, struct mbuf
**nextrecordp
)
2843 struct mbuf
*cm
= NULL
, *cmn
;
2844 struct mbuf
**cme
= &cm
;
2845 struct sockbuf
*sb_rcv
= &so
->so_rcv
;
2846 struct mbuf
**msgpcm
= NULL
;
2847 struct mbuf
*m
= *mp
;
2848 struct mbuf
*nextrecord
= *nextrecordp
;
2849 struct protosw
*pr
= so
->so_proto
;
2852 * Externalizing the control messages would require us to
2853 * drop the socket's lock below. Once we re-acquire the
2854 * lock, the mbuf chain might change. In order to preserve
2855 * consistency, we unlink all control messages from the
2856 * first mbuf chain in one shot and link them separately
2857 * onto a different chain.
2860 if (flags
& MSG_PEEK
) {
2861 if (controlp
!= NULL
) {
2862 if (*controlp
== NULL
) {
2865 *controlp
= m_copy(m
, 0, m
->m_len
);
2868 * If we failed to allocate an mbuf,
2869 * release any previously allocated
2870 * mbufs for control data. Return
2871 * an error. Keep the mbufs in the
2872 * socket as this is using
2875 if (*controlp
== NULL
) {
2880 controlp
= &(*controlp
)->m_next
;
2884 m
->m_nextpkt
= NULL
;
2886 sb_rcv
->sb_mb
= m
->m_next
;
2889 cme
= &(*cme
)->m_next
;
2892 } while (m
!= NULL
&& m
->m_type
== MT_CONTROL
);
2894 if (!(flags
& MSG_PEEK
)) {
2895 if (sb_rcv
->sb_mb
!= NULL
) {
2896 sb_rcv
->sb_mb
->m_nextpkt
= nextrecord
;
2898 sb_rcv
->sb_mb
= nextrecord
;
2899 SB_EMPTY_FIXUP(sb_rcv
);
2901 if (nextrecord
== NULL
)
2902 sb_rcv
->sb_lastrecord
= m
;
2905 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive ctl");
2906 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive ctl");
2908 while (cm
!= NULL
) {
2913 cmsg_type
= mtod(cm
, struct cmsghdr
*)->cmsg_type
;
2916 * Call the protocol to externalize SCM_RIGHTS message
2917 * and return the modified message to the caller upon
2918 * success. Otherwise, all other control messages are
2919 * returned unmodified to the caller. Note that we
2920 * only get into this loop if MSG_PEEK is not set.
2922 if (pr
->pr_domain
->dom_externalize
!= NULL
&&
2923 cmsg_type
== SCM_RIGHTS
) {
2925 * Release socket lock: see 3903171. This
2926 * would also allow more records to be appended
2927 * to the socket buffer. We still have SB_LOCK
2928 * set on it, so we can be sure that the head
2929 * of the mbuf chain won't change.
2931 socket_unlock(so
, 0);
2932 error
= (*pr
->pr_domain
->dom_externalize
)(cm
);
2938 if (controlp
!= NULL
&& error
== 0) {
2940 controlp
= &(*controlp
)->m_next
;
2947 * Update the value of nextrecord in case we received new
2948 * records when the socket was unlocked above for
2949 * externalizing SCM_RIGHTS.
2952 nextrecord
= sb_rcv
->sb_mb
->m_nextpkt
;
2954 nextrecord
= sb_rcv
->sb_mb
;
2958 *nextrecordp
= nextrecord
;
2964 * Implement receive operations on a socket.
2965 * We depend on the way that records are added to the sockbuf
2966 * by sbappend*. In particular, each record (mbufs linked through m_next)
2967 * must begin with an address if the protocol so specifies,
2968 * followed by an optional mbuf or mbufs containing ancillary data,
2969 * and then zero or more mbufs of data.
2970 * In order to avoid blocking network interrupts for the entire time here,
2971 * we splx() while doing the actual copy to user space.
2972 * Although the sockbuf is locked, new data may still be appended,
2973 * and thus we must maintain consistency of the sockbuf during that time.
2975 * The caller may receive the data as a single mbuf chain by supplying
2976 * an mbuf **mp0 for use in returning the chain. The uio is then used
2977 * only for the count in uio_resid.
2979 * Returns: 0 Success
2984 * sblock:EWOULDBLOCK
2988 * sodelayed_copy:EFAULT
2989 * <pru_rcvoob>:EINVAL[TCP]
2990 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2992 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2993 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2994 * <pr_domain->dom_externalize>:???
2996 * Notes: Additional return values from calls through <pru_rcvoob> and
2997 * <pr_domain->dom_externalize> depend on protocols other than
2998 * TCP or AF_UNIX, which are documented above.
3001 soreceive(struct socket
*so
, struct sockaddr
**psa
, struct uio
*uio
,
3002 struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
3004 struct mbuf
*m
, **mp
, *ml
= NULL
;
3005 struct mbuf
*nextrecord
, *free_list
;
3006 int flags
, error
, offset
;
3008 struct protosw
*pr
= so
->so_proto
;
3010 user_ssize_t orig_resid
= uio_resid(uio
);
3011 user_ssize_t delayed_copy_len
;
3014 struct proc
*p
= current_proc();
3015 boolean_t en_tracing
= FALSE
;
3018 * Sanity check on the length passed by caller as we are making 'int'
3021 if (orig_resid
< 0 || orig_resid
> INT_MAX
)
3024 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_START
, so
,
3025 uio_resid(uio
), so
->so_rcv
.sb_cc
, so
->so_rcv
.sb_lowat
,
3026 so
->so_rcv
.sb_hiwat
);
3029 so_update_last_owner_locked(so
, p
);
3030 so_update_policy(so
);
3032 #ifdef MORE_LOCKING_DEBUG
3033 if (so
->so_usecount
== 1) {
3034 panic("%s: so=%x no other reference on socket\n", __func__
, so
);
3041 if (controlp
!= NULL
)
3044 flags
= *flagsp
&~ MSG_EOR
;
3049 * If a recv attempt is made on a previously-accepted socket
3050 * that has been marked as inactive (disconnected), reject
3053 if (so
->so_flags
& SOF_DEFUNCT
) {
3054 struct sockbuf
*sb
= &so
->so_rcv
;
3057 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3058 __func__
, proc_pid(p
), proc_best_name(p
),
3059 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
3060 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
3062 * This socket should have been disconnected and flushed
3063 * prior to being returned from sodefunct(); there should
3064 * be no data on its receive list, so panic otherwise.
3066 if (so
->so_state
& SS_DEFUNCT
)
3067 sb_empty_assert(sb
, __func__
);
3068 socket_unlock(so
, 1);
3072 if ((so
->so_flags1
& SOF1_PRECONNECT_DATA
) &&
3073 pr
->pr_usrreqs
->pru_preconnect
) {
3075 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3076 * calling write() right after this. *If* the app calls a read
3077 * we do not want to block this read indefinetely. Thus,
3078 * we trigger a connect so that the session gets initiated.
3080 error
= (*pr
->pr_usrreqs
->pru_preconnect
)(so
);
3083 socket_unlock(so
, 1);
3088 if (ENTR_SHOULDTRACE
&&
3089 (SOCK_CHECK_DOM(so
, AF_INET
) || SOCK_CHECK_DOM(so
, AF_INET6
))) {
3091 * enable energy tracing for inet sockets that go over
3092 * non-loopback interfaces only.
3094 struct inpcb
*inp
= sotoinpcb(so
);
3095 if (inp
->inp_last_outifp
!= NULL
&&
3096 !(inp
->inp_last_outifp
->if_flags
& IFF_LOOPBACK
)) {
3098 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_START
,
3099 VM_KERNEL_ADDRPERM(so
),
3100 ((so
->so_state
& SS_NBIO
) ?
3101 kEnTrFlagNonBlocking
: 0),
3102 (int64_t)orig_resid
);
3107 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3108 * regardless of the flags argument. Here is the case were
3109 * out-of-band data is not inline.
3111 if ((flags
& MSG_OOB
) ||
3112 ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
3113 (so
->so_options
& SO_OOBINLINE
) == 0 &&
3114 (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
)))) {
3115 m
= m_get(M_WAIT
, MT_DATA
);
3117 socket_unlock(so
, 1);
3118 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
,
3119 ENOBUFS
, 0, 0, 0, 0);
3122 error
= (*pr
->pr_usrreqs
->pru_rcvoob
)(so
, m
, flags
& MSG_PEEK
);
3125 socket_unlock(so
, 0);
3127 error
= uiomove(mtod(m
, caddr_t
),
3128 imin(uio_resid(uio
), m
->m_len
), uio
);
3130 } while (uio_resid(uio
) && error
== 0 && m
!= NULL
);
3136 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0) {
3137 if (error
== EWOULDBLOCK
|| error
== EINVAL
) {
3139 * Let's try to get normal data:
3140 * EWOULDBLOCK: out-of-band data not
3141 * receive yet. EINVAL: out-of-band data
3146 } else if (error
== 0 && flagsp
!= NULL
) {
3150 socket_unlock(so
, 1);
3152 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3153 VM_KERNEL_ADDRPERM(so
), 0,
3154 (int64_t)(orig_resid
- uio_resid(uio
)));
3156 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
3165 if (so
->so_state
& SS_ISCONFIRMING
&& uio_resid(uio
)) {
3166 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, 0);
3170 delayed_copy_len
= 0;
3172 #ifdef MORE_LOCKING_DEBUG
3173 if (so
->so_usecount
<= 1)
3174 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3175 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
), so
->so_usecount
);
3178 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3179 * and if so just return to the caller. This could happen when
3180 * soreceive() is called by a socket upcall function during the
3181 * time the socket is freed. The socket buffer would have been
3182 * locked across the upcall, therefore we cannot put this thread
3183 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3184 * we may livelock), because the lock on the socket buffer will
3185 * only be released when the upcall routine returns to its caller.
3186 * Because the socket has been officially closed, there can be
3187 * no further read on it.
3189 * A multipath subflow socket would have its SS_NOFDREF set by
3190 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3191 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3193 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
3194 (SS_NOFDREF
| SS_CANTRCVMORE
) && !(so
->so_flags
& SOF_MP_SUBFLOW
)) {
3195 socket_unlock(so
, 1);
3199 error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
));
3201 socket_unlock(so
, 1);
3202 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
3205 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3206 VM_KERNEL_ADDRPERM(so
), 0,
3207 (int64_t)(orig_resid
- uio_resid(uio
)));
3212 m
= so
->so_rcv
.sb_mb
;
3214 * If we have less data than requested, block awaiting more
3215 * (subject to any timeout) if:
3216 * 1. the current count is less than the low water mark, or
3217 * 2. MSG_WAITALL is set, and it is possible to do the entire
3218 * receive operation at once if we block (resid <= hiwat).
3219 * 3. MSG_DONTWAIT is not set
3220 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3221 * we have to do the receive in sections, and thus risk returning
3222 * a short count if a timeout or signal occurs after we start.
3224 if (m
== NULL
|| (((flags
& MSG_DONTWAIT
) == 0 &&
3225 so
->so_rcv
.sb_cc
< uio_resid(uio
)) &&
3226 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
3227 ((flags
& MSG_WAITALL
) && uio_resid(uio
) <= so
->so_rcv
.sb_hiwat
)) &&
3228 m
->m_nextpkt
== NULL
&& (pr
->pr_flags
& PR_ATOMIC
) == 0)) {
3230 * Panic if we notice inconsistencies in the socket's
3231 * receive list; both sb_mb and sb_cc should correctly
3232 * reflect the contents of the list, otherwise we may
3233 * end up with false positives during select() or poll()
3234 * which could put the application in a bad state.
3236 SB_MB_CHECK(&so
->so_rcv
);
3241 error
= so
->so_error
;
3242 if ((flags
& MSG_PEEK
) == 0)
3246 if (so
->so_state
& SS_CANTRCVMORE
) {
3249 * Deal with half closed connections
3251 if ((so
->so_state
& SS_ISDISCONNECTED
) == 0 &&
3252 cfil_sock_data_pending(&so
->so_rcv
) != 0)
3254 "so %llx ignore SS_CANTRCVMORE",
3255 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
));
3257 #endif /* CONTENT_FILTER */
3263 for (; m
!= NULL
; m
= m
->m_next
)
3264 if (m
->m_type
== MT_OOBDATA
|| (m
->m_flags
& M_EOR
)) {
3265 m
= so
->so_rcv
.sb_mb
;
3268 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
3269 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
3273 if (uio_resid(uio
) == 0)
3276 if ((so
->so_state
& SS_NBIO
) ||
3277 (flags
& (MSG_DONTWAIT
|MSG_NBIO
))) {
3278 error
= EWOULDBLOCK
;
3281 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
3282 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
3283 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
3284 #if EVEN_MORE_LOCKING_DEBUG
3286 printf("Waiting for socket data\n");
3289 error
= sbwait(&so
->so_rcv
);
3290 #if EVEN_MORE_LOCKING_DEBUG
3292 printf("SORECEIVE - sbwait returned %d\n", error
);
3294 if (so
->so_usecount
< 1) {
3295 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3296 __func__
, so
, so
->so_usecount
);
3300 socket_unlock(so
, 1);
3301 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
3304 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3305 VM_KERNEL_ADDRPERM(so
), 0,
3306 (int64_t)(orig_resid
- uio_resid(uio
)));
3313 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgrcv
);
3314 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
3315 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
3316 nextrecord
= m
->m_nextpkt
;
3318 if ((pr
->pr_flags
& PR_ADDR
) && m
->m_type
== MT_SONAME
) {
3319 error
= soreceive_addr(p
, so
, psa
, flags
, &m
, &nextrecord
,
3321 if (error
== ERESTART
)
3323 else if (error
!= 0)
3329 * Process one or more MT_CONTROL mbufs present before any data mbufs
3330 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3331 * just copy the data; if !MSG_PEEK, we call into the protocol to
3332 * perform externalization.
3334 if (m
!= NULL
&& m
->m_type
== MT_CONTROL
) {
3335 error
= soreceive_ctl(so
, controlp
, flags
, &m
, &nextrecord
);
3342 * If the socket is a TCP socket with message delivery
3343 * enabled, then create a control msg to deliver the
3344 * relative TCP sequence number for this data. Waiting
3345 * until this point will protect against failures to
3346 * allocate an mbuf for control msgs.
3348 if (so
->so_type
== SOCK_STREAM
&& SOCK_PROTO(so
) == IPPROTO_TCP
&&
3349 (so
->so_flags
& SOF_ENABLE_MSGS
) && controlp
!= NULL
) {
3350 struct mbuf
*seq_cm
;
3352 seq_cm
= sbcreatecontrol((caddr_t
)&m
->m_pkthdr
.msg_seq
,
3353 sizeof (uint32_t), SCM_SEQNUM
, SOL_SOCKET
);
3354 if (seq_cm
== NULL
) {
3355 /* unable to allocate a control mbuf */
3360 controlp
= &seq_cm
->m_next
;
3364 if (!(flags
& MSG_PEEK
)) {
3366 * We get here because m points to an mbuf following
3367 * any MT_SONAME or MT_CONTROL mbufs which have been
3368 * processed above. In any case, m should be pointing
3369 * to the head of the mbuf chain, and the nextrecord
3370 * should be either NULL or equal to m->m_nextpkt.
3371 * See comments above about SB_LOCK.
3373 if (m
!= so
->so_rcv
.sb_mb
||
3374 m
->m_nextpkt
!= nextrecord
) {
3375 panic("%s: post-control !sync so=%p m=%p "
3376 "nextrecord=%p\n", __func__
, so
, m
,
3380 if (nextrecord
== NULL
)
3381 so
->so_rcv
.sb_lastrecord
= m
;
3384 if (type
== MT_OOBDATA
)
3387 if (!(flags
& MSG_PEEK
)) {
3388 SB_EMPTY_FIXUP(&so
->so_rcv
);
3391 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 2");
3392 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 2");
3397 if (!(flags
& MSG_PEEK
) && uio_resid(uio
) > sorecvmincopy
)
3405 (uio_resid(uio
) - delayed_copy_len
) > 0 && error
== 0) {
3406 if (m
->m_type
== MT_OOBDATA
) {
3407 if (type
!= MT_OOBDATA
)
3409 } else if (type
== MT_OOBDATA
) {
3413 * Make sure to allways set MSG_OOB event when getting
3414 * out of band data inline.
3416 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
3417 (so
->so_options
& SO_OOBINLINE
) != 0 &&
3418 (so
->so_state
& SS_RCVATMARK
) != 0) {
3421 so
->so_state
&= ~SS_RCVATMARK
;
3422 len
= uio_resid(uio
) - delayed_copy_len
;
3423 if (so
->so_oobmark
&& len
> so
->so_oobmark
- offset
)
3424 len
= so
->so_oobmark
- offset
;
3425 if (len
> m
->m_len
- moff
)
3426 len
= m
->m_len
- moff
;
3428 * If mp is set, just pass back the mbufs.
3429 * Otherwise copy them out via the uio, then free.
3430 * Sockbuf must be consistent here (points to current mbuf,
3431 * it points to next record) when we drop priority;
3432 * we must note any additions to the sockbuf when we
3433 * block interrupts again.
3436 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive uiomove");
3437 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive uiomove");
3438 if (can_delay
&& len
== m
->m_len
) {
3440 * only delay the copy if we're consuming the
3441 * mbuf and we're NOT in MSG_PEEK mode
3442 * and we have enough data to make it worthwile
3443 * to drop and retake the lock... can_delay
3444 * reflects the state of the 2 latter
3445 * constraints moff should always be zero
3448 delayed_copy_len
+= len
;
3450 if (delayed_copy_len
) {
3451 error
= sodelayed_copy(so
, uio
,
3452 &free_list
, &delayed_copy_len
);
3458 * can only get here if MSG_PEEK is not
3459 * set therefore, m should point at the
3460 * head of the rcv queue; if it doesn't,
3461 * it means something drastically
3462 * changed while we were out from behind
3463 * the lock in sodelayed_copy. perhaps
3464 * a RST on the stream. in any event,
3465 * the stream has been interrupted. it's
3466 * probably best just to return whatever
3467 * data we've moved and let the caller
3470 if (m
!= so
->so_rcv
.sb_mb
) {
3474 socket_unlock(so
, 0);
3475 error
= uiomove(mtod(m
, caddr_t
) + moff
,
3483 uio_setresid(uio
, (uio_resid(uio
) - len
));
3485 if (len
== m
->m_len
- moff
) {
3486 if (m
->m_flags
& M_EOR
)
3488 if (flags
& MSG_PEEK
) {
3492 nextrecord
= m
->m_nextpkt
;
3493 sbfree(&so
->so_rcv
, m
);
3494 m
->m_nextpkt
= NULL
;
3497 * If this packet is an unordered packet
3498 * (indicated by M_UNORDERED_DATA flag), remove
3499 * the additional bytes added to the
3500 * receive socket buffer size.
3502 if ((so
->so_flags
& SOF_ENABLE_MSGS
) &&
3504 (m
->m_flags
& M_UNORDERED_DATA
) &&
3505 sbreserve(&so
->so_rcv
,
3506 so
->so_rcv
.sb_hiwat
- m
->m_len
)) {
3507 if (so
->so_msg_state
->msg_uno_bytes
>
3510 msg_uno_bytes
-= m
->m_len
;
3515 m
->m_flags
&= ~M_UNORDERED_DATA
;
3521 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
3524 if (free_list
== NULL
)
3529 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
3533 m
->m_nextpkt
= nextrecord
;
3534 if (nextrecord
== NULL
)
3535 so
->so_rcv
.sb_lastrecord
= m
;
3537 so
->so_rcv
.sb_mb
= nextrecord
;
3538 SB_EMPTY_FIXUP(&so
->so_rcv
);
3540 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 3");
3541 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 3");
3544 if (flags
& MSG_PEEK
) {
3550 if (flags
& MSG_DONTWAIT
)
3551 copy_flag
= M_DONTWAIT
;
3554 *mp
= m_copym(m
, 0, len
, copy_flag
);
3556 * Failed to allocate an mbuf?
3557 * Adjust uio_resid back, it was
3558 * adjusted down by len bytes which
3559 * we didn't copy over.
3563 (uio_resid(uio
) + len
));
3569 so
->so_rcv
.sb_cc
-= len
;
3572 if (so
->so_oobmark
) {
3573 if ((flags
& MSG_PEEK
) == 0) {
3574 so
->so_oobmark
-= len
;
3575 if (so
->so_oobmark
== 0) {
3576 so
->so_state
|= SS_RCVATMARK
;
3578 * delay posting the actual event until
3579 * after any delayed copy processing
3587 if (offset
== so
->so_oobmark
)
3591 if (flags
& MSG_EOR
)
3594 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3595 * (for non-atomic socket), we must not quit until
3596 * "uio->uio_resid == 0" or an error termination.
3597 * If a signal/timeout occurs, return with a short
3598 * count but without error. Keep sockbuf locked
3599 * against other readers.
3601 while (flags
& (MSG_WAITALL
|MSG_WAITSTREAM
) && m
== NULL
&&
3602 (uio_resid(uio
) - delayed_copy_len
) > 0 &&
3603 !sosendallatonce(so
) && !nextrecord
) {
3604 if (so
->so_error
|| ((so
->so_state
& SS_CANTRCVMORE
)
3606 && cfil_sock_data_pending(&so
->so_rcv
) == 0
3607 #endif /* CONTENT_FILTER */
3612 * Depending on the protocol (e.g. TCP), the following
3613 * might cause the socket lock to be dropped and later
3614 * be reacquired, and more data could have arrived and
3615 * have been appended to the receive socket buffer by
3616 * the time it returns. Therefore, we only sleep in
3617 * sbwait() below if and only if the socket buffer is
3618 * empty, in order to avoid a false sleep.
3620 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
&&
3621 (((struct inpcb
*)so
->so_pcb
)->inp_state
!=
3623 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
3625 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 2");
3626 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 2");
3628 if (so
->so_rcv
.sb_mb
== NULL
&& sbwait(&so
->so_rcv
)) {
3633 * have to wait until after we get back from the sbwait
3634 * to do the copy because we will drop the lock if we
3635 * have enough data that has been delayed... by dropping
3636 * the lock we open up a window allowing the netisr
3637 * thread to process the incoming packets and to change
3638 * the state of this socket... we're issuing the sbwait
3639 * because the socket is empty and we're expecting the
3640 * netisr thread to wake us up when more packets arrive;
3641 * if we allow that processing to happen and then sbwait
3642 * we could stall forever with packets sitting in the
3643 * socket if no further packets arrive from the remote
3646 * we want to copy before we've collected all the data
3647 * to satisfy this request to allow the copy to overlap
3648 * the incoming packet processing on an MP system
3650 if (delayed_copy_len
> sorecvmincopy
&&
3651 (delayed_copy_len
> (so
->so_rcv
.sb_hiwat
/ 2))) {
3652 error
= sodelayed_copy(so
, uio
,
3653 &free_list
, &delayed_copy_len
);
3658 m
= so
->so_rcv
.sb_mb
;
3660 nextrecord
= m
->m_nextpkt
;
3662 SB_MB_CHECK(&so
->so_rcv
);
3665 #ifdef MORE_LOCKING_DEBUG
3666 if (so
->so_usecount
<= 1) {
3667 panic("%s: after big while so=%p ref=%d on socket\n",
3668 __func__
, so
, so
->so_usecount
);
3673 if (m
!= NULL
&& pr
->pr_flags
& PR_ATOMIC
) {
3674 if (so
->so_options
& SO_DONTTRUNC
) {
3675 flags
|= MSG_RCVMORE
;
3678 if ((flags
& MSG_PEEK
) == 0)
3679 (void) sbdroprecord(&so
->so_rcv
);
3684 * pru_rcvd below (for TCP) may cause more data to be received
3685 * if the socket lock is dropped prior to sending the ACK; some
3686 * legacy OpenTransport applications don't handle this well
3687 * (if it receives less data than requested while MSG_HAVEMORE
3688 * is set), and so we set the flag now based on what we know
3689 * prior to calling pru_rcvd.
3691 if ((so
->so_options
& SO_WANTMORE
) && so
->so_rcv
.sb_cc
> 0)
3692 flags
|= MSG_HAVEMORE
;
3694 if ((flags
& MSG_PEEK
) == 0) {
3696 so
->so_rcv
.sb_mb
= nextrecord
;
3698 * First part is an inline SB_EMPTY_FIXUP(). Second
3699 * part makes sure sb_lastrecord is up-to-date if
3700 * there is still data in the socket buffer.
3702 if (so
->so_rcv
.sb_mb
== NULL
) {
3703 so
->so_rcv
.sb_mbtail
= NULL
;
3704 so
->so_rcv
.sb_lastrecord
= NULL
;
3705 } else if (nextrecord
->m_nextpkt
== NULL
) {
3706 so
->so_rcv
.sb_lastrecord
= nextrecord
;
3708 SB_MB_CHECK(&so
->so_rcv
);
3710 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 4");
3711 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 4");
3712 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
)
3713 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
3716 if (delayed_copy_len
) {
3717 error
= sodelayed_copy(so
, uio
, &free_list
, &delayed_copy_len
);
3721 if (free_list
!= NULL
) {
3722 m_freem_list(free_list
);
3726 postevent(so
, 0, EV_OOB
);
3728 if (orig_resid
== uio_resid(uio
) && orig_resid
&&
3729 (flags
& MSG_EOR
) == 0 && (so
->so_state
& SS_CANTRCVMORE
) == 0) {
3730 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
3737 #ifdef MORE_LOCKING_DEBUG
3738 if (so
->so_usecount
<= 1) {
3739 panic("%s: release so=%p ref=%d on socket\n", __func__
,
3740 so
, so
->so_usecount
);
3744 if (delayed_copy_len
)
3745 error
= sodelayed_copy(so
, uio
, &free_list
, &delayed_copy_len
);
3747 if (free_list
!= NULL
)
3748 m_freem_list(free_list
);
3750 sbunlock(&so
->so_rcv
, FALSE
); /* will unlock socket */
3753 KERNEL_ENERGYTRACE(kEnTrActKernSockRead
, DBG_FUNC_END
,
3754 VM_KERNEL_ADDRPERM(so
),
3755 ((error
== EWOULDBLOCK
) ? kEnTrFlagNoWork
: 0),
3756 (int64_t)(orig_resid
- uio_resid(uio
)));
3758 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, so
, uio_resid(uio
),
3759 so
->so_rcv
.sb_cc
, 0, error
);
3765 * Returns: 0 Success
3769 sodelayed_copy(struct socket
*so
, struct uio
*uio
, struct mbuf
**free_list
,
3770 user_ssize_t
*resid
)
3777 socket_unlock(so
, 0);
3779 while (m
!= NULL
&& error
== 0) {
3780 error
= uiomove(mtod(m
, caddr_t
), (int)m
->m_len
, uio
);
3783 m_freem_list(*free_list
);
3794 sodelayed_copy_list(struct socket
*so
, struct recv_msg_elem
*msgarray
,
3795 u_int uiocnt
, struct mbuf
**free_list
, user_ssize_t
*resid
)
3799 struct mbuf
*ml
, *m
;
3803 for (ml
= *free_list
, i
= 0; ml
!= NULL
&& i
< uiocnt
;
3804 ml
= ml
->m_nextpkt
, i
++) {
3805 auio
= msgarray
[i
].uio
;
3806 for (m
= ml
; m
!= NULL
; m
= m
->m_next
) {
3807 error
= uiomove(mtod(m
, caddr_t
), m
->m_len
, auio
);
3813 m_freem_list(*free_list
);
3822 soreceive_list(struct socket
*so
, struct recv_msg_elem
*msgarray
, u_int uiocnt
,
3826 struct mbuf
*nextrecord
;
3827 struct mbuf
*ml
= NULL
, *free_list
= NULL
, *free_tail
= NULL
;
3829 user_ssize_t len
, pktlen
, delayed_copy_len
= 0;
3830 struct protosw
*pr
= so
->so_proto
;
3832 struct proc
*p
= current_proc();
3833 struct uio
*auio
= NULL
;
3836 struct sockaddr
**psa
= NULL
;
3837 struct mbuf
**controlp
= NULL
;
3840 struct mbuf
*free_others
= NULL
;
3842 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST
| DBG_FUNC_START
,
3844 so
->so_rcv
.sb_cc
, so
->so_rcv
.sb_lowat
, so
->so_rcv
.sb_hiwat
);
3848 * - Only supports don't wait flags
3849 * - Only support datagram sockets (could be extended to raw)
3851 * - Protocol must support packet chains
3852 * - The uio array is NULL (should we panic?)
3858 if (flags
& ~(MSG_PEEK
| MSG_WAITALL
| MSG_DONTWAIT
| MSG_NEEDSA
|
3860 printf("%s invalid flags 0x%x\n", __func__
, flags
);
3864 if (so
->so_type
!= SOCK_DGRAM
) {
3868 if (sosendallatonce(so
) == 0) {
3872 if (so
->so_proto
->pr_usrreqs
->pru_send_list
== NULL
) {
3873 error
= EPROTONOSUPPORT
;
3876 if (msgarray
== NULL
) {
3877 printf("%s uioarray is NULL\n", __func__
);
3882 printf("%s uiocnt is 0\n", __func__
);
3887 * Sanity check on the length passed by caller as we are making 'int'
3890 resid
= recv_msg_array_resid(msgarray
, uiocnt
);
3891 if (resid
< 0 || resid
> INT_MAX
) {
3896 if (!(flags
& MSG_PEEK
) && sorecvmincopy
> 0)
3902 so_update_last_owner_locked(so
, p
);
3903 so_update_policy(so
);
3906 so_update_necp_policy(so
, NULL
, NULL
);
3910 * If a recv attempt is made on a previously-accepted socket
3911 * that has been marked as inactive (disconnected), reject
3914 if (so
->so_flags
& SOF_DEFUNCT
) {
3915 struct sockbuf
*sb
= &so
->so_rcv
;
3918 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3919 __func__
, proc_pid(p
), proc_best_name(p
),
3920 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
3921 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
3923 * This socket should have been disconnected and flushed
3924 * prior to being returned from sodefunct(); there should
3925 * be no data on its receive list, so panic otherwise.
3927 if (so
->so_state
& SS_DEFUNCT
)
3928 sb_empty_assert(sb
, __func__
);
3934 * The uio may be empty
3936 if (npkts
>= uiocnt
) {
3942 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3943 * and if so just return to the caller. This could happen when
3944 * soreceive() is called by a socket upcall function during the
3945 * time the socket is freed. The socket buffer would have been
3946 * locked across the upcall, therefore we cannot put this thread
3947 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3948 * we may livelock), because the lock on the socket buffer will
3949 * only be released when the upcall routine returns to its caller.
3950 * Because the socket has been officially closed, there can be
3951 * no further read on it.
3953 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
3954 (SS_NOFDREF
| SS_CANTRCVMORE
)) {
3959 error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
));
3965 m
= so
->so_rcv
.sb_mb
;
3967 * Block awaiting more datagram if needed
3969 if (m
== NULL
|| (((flags
& MSG_DONTWAIT
) == 0 &&
3970 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
3971 ((flags
& MSG_WAITALL
) && npkts
< uiocnt
))))) {
3973 * Panic if we notice inconsistencies in the socket's
3974 * receive list; both sb_mb and sb_cc should correctly
3975 * reflect the contents of the list, otherwise we may
3976 * end up with false positives during select() or poll()
3977 * which could put the application in a bad state.
3979 SB_MB_CHECK(&so
->so_rcv
);
3982 error
= so
->so_error
;
3983 if ((flags
& MSG_PEEK
) == 0)
3987 if (so
->so_state
& SS_CANTRCVMORE
) {
3990 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
3991 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
3995 if ((so
->so_state
& SS_NBIO
) ||
3996 (flags
& (MSG_DONTWAIT
|MSG_NBIO
))) {
3997 error
= EWOULDBLOCK
;
4001 * Do not block if we got some data
4003 if (free_list
!= NULL
) {
4008 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
4009 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
4011 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
4014 error
= sbwait(&so
->so_rcv
);
4021 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgrcv
);
4022 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
4023 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
4026 * Consume the current uio index as we have a datagram
4028 auio
= msgarray
[npkts
].uio
;
4029 resid
= uio_resid(auio
);
4030 msgarray
[npkts
].which
|= SOCK_MSG_DATA
;
4031 psa
= (msgarray
[npkts
].which
& SOCK_MSG_SA
) ?
4032 &msgarray
[npkts
].psa
: NULL
;
4033 controlp
= (msgarray
[npkts
].which
& SOCK_MSG_CONTROL
) ?
4034 &msgarray
[npkts
].controlp
: NULL
;
4036 nextrecord
= m
->m_nextpkt
;
4038 if ((pr
->pr_flags
& PR_ADDR
) && m
->m_type
== MT_SONAME
) {
4039 error
= soreceive_addr(p
, so
, psa
, flags
, &m
, &nextrecord
, 1);
4040 if (error
== ERESTART
)
4042 else if (error
!= 0)
4046 if (m
!= NULL
&& m
->m_type
== MT_CONTROL
) {
4047 error
= soreceive_ctl(so
, controlp
, flags
, &m
, &nextrecord
);
4052 if (m
->m_pkthdr
.len
== 0) {
4053 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4055 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
4056 (uint64_t)DEBUG_KERNEL_ADDRPERM(m
),
4061 * Loop to copy the mbufs of the current record
4062 * Support zero length packets
4066 while (m
!= NULL
&& (len
= resid
- pktlen
) >= 0 && error
== 0) {
4068 panic("%p m_len zero", m
);
4070 panic("%p m_type zero", m
);
4072 * Clip to the residual length
4078 * Copy the mbufs via the uio or delay the copy
4079 * Sockbuf must be consistent here (points to current mbuf,
4080 * it points to next record) when we drop priority;
4081 * we must note any additions to the sockbuf when we
4082 * block interrupts again.
4084 if (len
> 0 && can_delay
== 0) {
4085 socket_unlock(so
, 0);
4086 error
= uiomove(mtod(m
, caddr_t
), (int)len
, auio
);
4091 delayed_copy_len
+= len
;
4094 if (len
== m
->m_len
) {
4096 * m was entirely copied
4098 sbfree(&so
->so_rcv
, m
);
4099 nextrecord
= m
->m_nextpkt
;
4100 m
->m_nextpkt
= NULL
;
4103 * Set the first packet to the head of the free list
4105 if (free_list
== NULL
)
4108 * Link current packet to tail of free list
4111 if (free_tail
!= NULL
)
4112 free_tail
->m_nextpkt
= m
;
4116 * Link current mbuf to last mbuf of current packet
4123 * Move next buf to head of socket buffer
4125 so
->so_rcv
.sb_mb
= m
= ml
->m_next
;
4129 m
->m_nextpkt
= nextrecord
;
4130 if (nextrecord
== NULL
)
4131 so
->so_rcv
.sb_lastrecord
= m
;
4133 so
->so_rcv
.sb_mb
= nextrecord
;
4134 SB_EMPTY_FIXUP(&so
->so_rcv
);
4136 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 3");
4137 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 3");
4140 * Stop the loop on partial copy
4145 #ifdef MORE_LOCKING_DEBUG
4146 if (so
->so_usecount
<= 1) {
4147 panic("%s: after big while so=%llx ref=%d on socket\n",
4149 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
), so
->so_usecount
);
4154 * Tell the caller we made a partial copy
4157 if (so
->so_options
& SO_DONTTRUNC
) {
4159 * Copyout first the freelist then the partial mbuf
4161 socket_unlock(so
, 0);
4162 if (delayed_copy_len
)
4163 error
= sodelayed_copy_list(so
, msgarray
,
4164 uiocnt
, &free_list
, &delayed_copy_len
);
4167 error
= uiomove(mtod(m
, caddr_t
), (int)len
,
4176 so
->so_rcv
.sb_cc
-= len
;
4177 flags
|= MSG_RCVMORE
;
4179 (void) sbdroprecord(&so
->so_rcv
);
4180 nextrecord
= so
->so_rcv
.sb_mb
;
4187 so
->so_rcv
.sb_mb
= nextrecord
;
4189 * First part is an inline SB_EMPTY_FIXUP(). Second
4190 * part makes sure sb_lastrecord is up-to-date if
4191 * there is still data in the socket buffer.
4193 if (so
->so_rcv
.sb_mb
== NULL
) {
4194 so
->so_rcv
.sb_mbtail
= NULL
;
4195 so
->so_rcv
.sb_lastrecord
= NULL
;
4196 } else if (nextrecord
->m_nextpkt
== NULL
) {
4197 so
->so_rcv
.sb_lastrecord
= nextrecord
;
4199 SB_MB_CHECK(&so
->so_rcv
);
4201 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 4");
4202 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 4");
4205 * We can continue to the next packet as long as:
4206 * - We haven't exhausted the uio array
4207 * - There was no error
4208 * - A packet was not truncated
4209 * - We can still receive more data
4211 if (npkts
< uiocnt
&& error
== 0 &&
4212 (flags
& (MSG_RCVMORE
| MSG_TRUNC
)) == 0 &&
4213 (so
->so_state
& SS_CANTRCVMORE
) == 0) {
4214 sbunlock(&so
->so_rcv
, TRUE
); /* keep socket locked */
4224 * pru_rcvd may cause more data to be received if the socket lock
4225 * is dropped so we set MSG_HAVEMORE now based on what we know.
4226 * That way the caller won't be surprised if it receives less data
4229 if ((so
->so_options
& SO_WANTMORE
) && so
->so_rcv
.sb_cc
> 0)
4230 flags
|= MSG_HAVEMORE
;
4232 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
)
4233 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
4236 sbunlock(&so
->so_rcv
, FALSE
); /* will unlock socket */
4238 socket_unlock(so
, 1);
4240 if (delayed_copy_len
)
4241 error
= sodelayed_copy_list(so
, msgarray
, uiocnt
,
4242 &free_list
, &delayed_copy_len
);
4245 * Amortize the cost of freeing the mbufs
4247 if (free_list
!= NULL
)
4248 m_freem_list(free_list
);
4249 if (free_others
!= NULL
)
4250 m_freem_list(free_others
);
4252 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST
| DBG_FUNC_END
, error
,
4258 * Returns: 0 Success
4261 * <pru_shutdown>:EINVAL
4262 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4263 * <pru_shutdown>:ENOBUFS[TCP]
4264 * <pru_shutdown>:EMSGSIZE[TCP]
4265 * <pru_shutdown>:EHOSTUNREACH[TCP]
4266 * <pru_shutdown>:ENETUNREACH[TCP]
4267 * <pru_shutdown>:ENETDOWN[TCP]
4268 * <pru_shutdown>:ENOMEM[TCP]
4269 * <pru_shutdown>:EACCES[TCP]
4270 * <pru_shutdown>:EMSGSIZE[TCP]
4271 * <pru_shutdown>:ENOBUFS[TCP]
4272 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4273 * <pru_shutdown>:??? [other protocol families]
4276 soshutdown(struct socket
*so
, int how
)
4280 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN
| DBG_FUNC_START
, how
, 0, 0, 0, 0);
4288 (SS_ISCONNECTED
|SS_ISCONNECTING
|SS_ISDISCONNECTING
)) == 0) {
4291 error
= soshutdownlock(so
, how
);
4293 socket_unlock(so
, 1);
4300 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN
| DBG_FUNC_END
, how
, error
, 0, 0, 0);
4306 soshutdownlock_final(struct socket
*so
, int how
)
4308 struct protosw
*pr
= so
->so_proto
;
4311 sflt_notify(so
, sock_evt_shutdown
, &how
);
4313 if (how
!= SHUT_WR
) {
4314 if ((so
->so_state
& SS_CANTRCVMORE
) != 0) {
4315 /* read already shut down */
4320 postevent(so
, 0, EV_RCLOSED
);
4322 if (how
!= SHUT_RD
) {
4323 if ((so
->so_state
& SS_CANTSENDMORE
) != 0) {
4324 /* write already shut down */
4328 error
= (*pr
->pr_usrreqs
->pru_shutdown
)(so
);
4329 postevent(so
, 0, EV_WCLOSED
);
4332 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN
, how
, 1, 0, 0, 0);
4337 soshutdownlock(struct socket
*so
, int how
)
4343 * A content filter may delay the actual shutdown until it
4344 * has processed the pending data
4346 if (so
->so_flags
& SOF_CONTENT_FILTER
) {
4347 error
= cfil_sock_shutdown(so
, &how
);
4348 if (error
== EJUSTRETURN
) {
4351 } else if (error
!= 0) {
4355 #endif /* CONTENT_FILTER */
4357 error
= soshutdownlock_final(so
, how
);
4364 sowflush(struct socket
*so
)
4366 struct sockbuf
*sb
= &so
->so_snd
;
4369 * Obtain lock on the socket buffer (SB_LOCK). This is required
4370 * to prevent the socket buffer from being unexpectedly altered
4371 * while it is used by another thread in socket send/receive.
4373 * sblock() must not fail here, hence the assertion.
4375 (void) sblock(sb
, SBL_WAIT
| SBL_NOINTR
| SBL_IGNDEFUNCT
);
4376 VERIFY(sb
->sb_flags
& SB_LOCK
);
4378 sb
->sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
4379 sb
->sb_flags
|= SB_DROP
;
4380 sb
->sb_upcall
= NULL
;
4381 sb
->sb_upcallarg
= NULL
;
4383 sbunlock(sb
, TRUE
); /* keep socket locked */
4385 selthreadclear(&sb
->sb_sel
);
4390 sorflush(struct socket
*so
)
4392 struct sockbuf
*sb
= &so
->so_rcv
;
4393 struct protosw
*pr
= so
->so_proto
;
4396 lck_mtx_t
*mutex_held
;
4398 * XXX: This code is currently commented out, because we may get here
4399 * as part of sofreelastref(), and at that time, pr_getlock() may no
4400 * longer be able to return us the lock; this will be fixed in future.
4402 if (so
->so_proto
->pr_getlock
!= NULL
)
4403 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
4405 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
4407 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
4410 sflt_notify(so
, sock_evt_flush_read
, NULL
);
4415 * Obtain lock on the socket buffer (SB_LOCK). This is required
4416 * to prevent the socket buffer from being unexpectedly altered
4417 * while it is used by another thread in socket send/receive.
4419 * sblock() must not fail here, hence the assertion.
4421 (void) sblock(sb
, SBL_WAIT
| SBL_NOINTR
| SBL_IGNDEFUNCT
);
4422 VERIFY(sb
->sb_flags
& SB_LOCK
);
4425 * Copy only the relevant fields from "sb" to "asb" which we
4426 * need for sbrelease() to function. In particular, skip
4427 * sb_sel as it contains the wait queue linkage, which would
4428 * wreak havoc if we were to issue selthreadclear() on "asb".
4429 * Make sure to not carry over SB_LOCK in "asb", as we need
4430 * to acquire it later as part of sbrelease().
4432 bzero(&asb
, sizeof (asb
));
4433 asb
.sb_cc
= sb
->sb_cc
;
4434 asb
.sb_hiwat
= sb
->sb_hiwat
;
4435 asb
.sb_mbcnt
= sb
->sb_mbcnt
;
4436 asb
.sb_mbmax
= sb
->sb_mbmax
;
4437 asb
.sb_ctl
= sb
->sb_ctl
;
4438 asb
.sb_lowat
= sb
->sb_lowat
;
4439 asb
.sb_mb
= sb
->sb_mb
;
4440 asb
.sb_mbtail
= sb
->sb_mbtail
;
4441 asb
.sb_lastrecord
= sb
->sb_lastrecord
;
4442 asb
.sb_so
= sb
->sb_so
;
4443 asb
.sb_flags
= sb
->sb_flags
;
4444 asb
.sb_flags
&= ~(SB_LOCK
|SB_SEL
|SB_KNOTE
|SB_UPCALL
);
4445 asb
.sb_flags
|= SB_DROP
;
4448 * Ideally we'd bzero() these and preserve the ones we need;
4449 * but to do that we'd need to shuffle things around in the
4450 * sockbuf, and we can't do it now because there are KEXTS
4451 * that are directly referring to the socket structure.
4453 * Setting SB_DROP acts as a barrier to prevent further appends.
4454 * Clearing SB_SEL is done for selthreadclear() below.
4463 sb
->sb_mbtail
= NULL
;
4464 sb
->sb_lastrecord
= NULL
;
4465 sb
->sb_timeo
.tv_sec
= 0;
4466 sb
->sb_timeo
.tv_usec
= 0;
4467 sb
->sb_upcall
= NULL
;
4468 sb
->sb_upcallarg
= NULL
;
4469 sb
->sb_flags
&= ~(SB_SEL
|SB_UPCALL
);
4470 sb
->sb_flags
|= SB_DROP
;
4472 sbunlock(sb
, TRUE
); /* keep socket locked */
4475 * Note that selthreadclear() is called on the original "sb" and
4476 * not the local "asb" because of the way wait queue linkage is
4477 * implemented. Given that selwakeup() may be triggered, SB_SEL
4478 * should no longer be set (cleared above.)
4480 selthreadclear(&sb
->sb_sel
);
4482 if ((pr
->pr_flags
& PR_RIGHTS
) && pr
->pr_domain
->dom_dispose
)
4483 (*pr
->pr_domain
->dom_dispose
)(asb
.sb_mb
);
4489 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4490 * an additional variant to handle the case where the option value needs
4491 * to be some kind of integer, but not a specific size.
4492 * In addition to their use here, these functions are also called by the
4493 * protocol-level pr_ctloutput() routines.
4495 * Returns: 0 Success
4500 sooptcopyin(struct sockopt
*sopt
, void *buf
, size_t len
, size_t minlen
)
4505 * If the user gives us more than we wanted, we ignore it,
4506 * but if we don't get the minimum length the caller
4507 * wants, we return EINVAL. On success, sopt->sopt_valsize
4508 * is set to however much we actually retrieved.
4510 if ((valsize
= sopt
->sopt_valsize
) < minlen
)
4513 sopt
->sopt_valsize
= valsize
= len
;
4515 if (sopt
->sopt_p
!= kernproc
)
4516 return (copyin(sopt
->sopt_val
, buf
, valsize
));
4518 bcopy(CAST_DOWN(caddr_t
, sopt
->sopt_val
), buf
, valsize
);
4523 * sooptcopyin_timeval
4524 * Copy in a timeval value into tv_p, and take into account whether the
4525 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4526 * code here so that we can verify the 64-bit tv_sec value before we lose
4527 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4530 sooptcopyin_timeval(struct sockopt
*sopt
, struct timeval
*tv_p
)
4534 if (proc_is64bit(sopt
->sopt_p
)) {
4535 struct user64_timeval tv64
;
4537 if (sopt
->sopt_valsize
< sizeof (tv64
))
4540 sopt
->sopt_valsize
= sizeof (tv64
);
4541 if (sopt
->sopt_p
!= kernproc
) {
4542 error
= copyin(sopt
->sopt_val
, &tv64
, sizeof (tv64
));
4546 bcopy(CAST_DOWN(caddr_t
, sopt
->sopt_val
), &tv64
,
4549 if (tv64
.tv_sec
< 0 || tv64
.tv_sec
> LONG_MAX
||
4550 tv64
.tv_usec
< 0 || tv64
.tv_usec
>= 1000000)
4553 tv_p
->tv_sec
= tv64
.tv_sec
;
4554 tv_p
->tv_usec
= tv64
.tv_usec
;
4556 struct user32_timeval tv32
;
4558 if (sopt
->sopt_valsize
< sizeof (tv32
))
4561 sopt
->sopt_valsize
= sizeof (tv32
);
4562 if (sopt
->sopt_p
!= kernproc
) {
4563 error
= copyin(sopt
->sopt_val
, &tv32
, sizeof (tv32
));
4568 bcopy(CAST_DOWN(caddr_t
, sopt
->sopt_val
), &tv32
,
4573 * K64todo "comparison is always false due to
4574 * limited range of data type"
4576 if (tv32
.tv_sec
< 0 || tv32
.tv_sec
> LONG_MAX
||
4577 tv32
.tv_usec
< 0 || tv32
.tv_usec
>= 1000000)
4580 tv_p
->tv_sec
= tv32
.tv_sec
;
4581 tv_p
->tv_usec
= tv32
.tv_usec
;
4587 soopt_cred_check(struct socket
*so
, int priv
)
4589 kauth_cred_t cred
= NULL
;
4590 proc_t ep
= PROC_NULL
;
4593 if (so
->so_flags
& SOF_DELEGATED
) {
4594 ep
= proc_find(so
->e_pid
);
4596 cred
= kauth_cred_proc_ref(ep
);
4598 error
= priv_check_cred(cred
? cred
: so
->so_cred
, priv
, 0);
4600 kauth_cred_unref(&cred
);
4601 if (ep
!= PROC_NULL
)
4608 * Returns: 0 Success
4613 * sooptcopyin:EINVAL
4614 * sooptcopyin:EFAULT
4615 * sooptcopyin_timeval:EINVAL
4616 * sooptcopyin_timeval:EFAULT
4617 * sooptcopyin_timeval:EDOM
4618 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4619 * <pr_ctloutput>:???w
4620 * sflt_attach_private:??? [whatever a filter author chooses]
4621 * <sf_setoption>:??? [whatever a filter author chooses]
4623 * Notes: Other <pru_listen> returns depend on the protocol family; all
4624 * <sf_listen> returns depend on what the filter author causes
4625 * their filter to return.
4628 sosetoptlock(struct socket
*so
, struct sockopt
*sopt
, int dolock
)
4633 #if CONFIG_MACF_SOCKET
4635 #endif /* MAC_SOCKET */
4637 if (sopt
->sopt_dir
!= SOPT_SET
)
4638 sopt
->sopt_dir
= SOPT_SET
;
4643 if ((so
->so_state
& (SS_CANTRCVMORE
| SS_CANTSENDMORE
)) ==
4644 (SS_CANTRCVMORE
| SS_CANTSENDMORE
) &&
4645 (so
->so_flags
& SOF_NPX_SETOPTSHUT
) == 0) {
4646 /* the socket has been shutdown, no more sockopt's */
4651 error
= sflt_setsockopt(so
, sopt
);
4653 if (error
== EJUSTRETURN
)
4658 if (sopt
->sopt_level
!= SOL_SOCKET
) {
4659 if (so
->so_proto
!= NULL
&&
4660 so
->so_proto
->pr_ctloutput
!= NULL
) {
4661 error
= (*so
->so_proto
->pr_ctloutput
)(so
, sopt
);
4664 error
= ENOPROTOOPT
;
4667 * Allow socket-level (SOL_SOCKET) options to be filtered by
4668 * the protocol layer, if needed. A zero value returned from
4669 * the handler means use default socket-level processing as
4670 * done by the rest of this routine. Otherwise, any other
4671 * return value indicates that the option is unsupported.
4673 if (so
->so_proto
!= NULL
&& (error
= so
->so_proto
->pr_usrreqs
->
4674 pru_socheckopt(so
, sopt
)) != 0)
4678 switch (sopt
->sopt_name
) {
4681 error
= sooptcopyin(sopt
, &l
, sizeof (l
), sizeof (l
));
4685 so
->so_linger
= (sopt
->sopt_name
== SO_LINGER
) ?
4686 l
.l_linger
: l
.l_linger
* hz
;
4688 so
->so_options
|= SO_LINGER
;
4690 so
->so_options
&= ~SO_LINGER
;
4696 case SO_USELOOPBACK
:
4702 case SO_TIMESTAMP_MONOTONIC
:
4705 case SO_WANTOOBFLAG
:
4706 case SO_NOWAKEFROMSLEEP
:
4707 case SO_NOAPNFALLBK
:
4708 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4713 so
->so_options
|= sopt
->sopt_name
;
4715 so
->so_options
&= ~sopt
->sopt_name
;
4722 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4728 * Values < 1 make no sense for any of these
4729 * options, so disallow them.
4736 switch (sopt
->sopt_name
) {
4739 struct sockbuf
*sb
=
4740 (sopt
->sopt_name
== SO_SNDBUF
) ?
4741 &so
->so_snd
: &so
->so_rcv
;
4742 if (sbreserve(sb
, (u_int32_t
)optval
) == 0) {
4746 sb
->sb_flags
|= SB_USRSIZE
;
4747 sb
->sb_flags
&= ~SB_AUTOSIZE
;
4748 sb
->sb_idealsize
= (u_int32_t
)optval
;
4752 * Make sure the low-water is never greater than
4756 int space
= sbspace(&so
->so_snd
);
4757 u_int32_t hiwat
= so
->so_snd
.sb_hiwat
;
4759 if (so
->so_snd
.sb_flags
& SB_UNIX
) {
4761 (struct unpcb
*)(so
->so_pcb
);
4763 unp
->unp_conn
!= NULL
) {
4764 hiwat
+= unp
->unp_conn
->unp_cc
;
4768 so
->so_snd
.sb_lowat
=
4772 if (space
>= so
->so_snd
.sb_lowat
) {
4779 so
->so_rcv
.sb_lowat
=
4780 (optval
> so
->so_rcv
.sb_hiwat
) ?
4781 so
->so_rcv
.sb_hiwat
: optval
;
4782 data_len
= so
->so_rcv
.sb_cc
4783 - so
->so_rcv
.sb_ctl
;
4784 if (data_len
>= so
->so_rcv
.sb_lowat
)
4793 error
= sooptcopyin_timeval(sopt
, &tv
);
4797 switch (sopt
->sopt_name
) {
4799 so
->so_snd
.sb_timeo
= tv
;
4802 so
->so_rcv
.sb_timeo
= tv
;
4810 error
= sooptcopyin(sopt
, &nke
, sizeof (nke
),
4815 error
= sflt_attach_internal(so
, nke
.nke_handle
);
4820 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4825 so
->so_flags
|= SOF_NOSIGPIPE
;
4827 so
->so_flags
&= ~SOF_NOSIGPIPE
;
4831 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4836 so
->so_flags
|= SOF_NOADDRAVAIL
;
4838 so
->so_flags
&= ~SOF_NOADDRAVAIL
;
4841 case SO_REUSESHAREUID
:
4842 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4847 so
->so_flags
|= SOF_REUSESHAREUID
;
4849 so
->so_flags
&= ~SOF_REUSESHAREUID
;
4852 case SO_NOTIFYCONFLICT
:
4853 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4857 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4862 so
->so_flags
|= SOF_NOTIFYCONFLICT
;
4864 so
->so_flags
&= ~SOF_NOTIFYCONFLICT
;
4867 case SO_RESTRICTIONS
:
4868 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4873 error
= so_set_restrictions(so
, optval
);
4876 case SO_AWDL_UNRESTRICTED
:
4877 if (SOCK_DOM(so
) != PF_INET
&&
4878 SOCK_DOM(so
) != PF_INET6
) {
4882 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
4887 error
= soopt_cred_check(so
,
4888 PRIV_NET_RESTRICTED_AWDL
);
4890 inp_set_awdl_unrestricted(
4893 inp_clear_awdl_unrestricted(sotoinpcb(so
));
4895 case SO_INTCOPROC_ALLOW
:
4896 if (SOCK_DOM(so
) != PF_INET6
) {
4900 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
4905 inp_get_intcoproc_allowed(sotoinpcb(so
)) == FALSE
) {
4906 error
= soopt_cred_check(so
,
4907 PRIV_NET_RESTRICTED_INTCOPROC
);
4909 inp_set_intcoproc_allowed(
4911 } else if (optval
== 0)
4912 inp_clear_intcoproc_allowed(sotoinpcb(so
));
4916 #if CONFIG_MACF_SOCKET
4917 if ((error
= sooptcopyin(sopt
, &extmac
, sizeof (extmac
),
4918 sizeof (extmac
))) != 0)
4921 error
= mac_setsockopt_label(proc_ucred(sopt
->sopt_p
),
4925 #endif /* MAC_SOCKET */
4928 case SO_UPCALLCLOSEWAIT
:
4929 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4934 so
->so_flags
|= SOF_UPCALLCLOSEWAIT
;
4936 so
->so_flags
&= ~SOF_UPCALLCLOSEWAIT
;
4940 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4945 so
->so_flags
|= SOF_BINDRANDOMPORT
;
4947 so
->so_flags
&= ~SOF_BINDRANDOMPORT
;
4950 case SO_NP_EXTENSIONS
: {
4951 struct so_np_extensions sonpx
;
4953 error
= sooptcopyin(sopt
, &sonpx
, sizeof (sonpx
),
4957 if (sonpx
.npx_mask
& ~SONPX_MASK_VALID
) {
4962 * Only one bit defined for now
4964 if ((sonpx
.npx_mask
& SONPX_SETOPTSHUT
)) {
4965 if ((sonpx
.npx_flags
& SONPX_SETOPTSHUT
))
4966 so
->so_flags
|= SOF_NPX_SETOPTSHUT
;
4968 so
->so_flags
&= ~SOF_NPX_SETOPTSHUT
;
4973 case SO_TRAFFIC_CLASS
: {
4974 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4978 if (optval
>= SO_TC_NET_SERVICE_OFFSET
) {
4979 int netsvc
= optval
- SO_TC_NET_SERVICE_OFFSET
;
4980 error
= so_set_net_service_type(so
, netsvc
);
4983 error
= so_set_traffic_class(so
, optval
);
4986 so
->so_flags1
&= ~SOF1_TC_NET_SERV_TYPE
;
4987 so
->so_netsvctype
= _NET_SERVICE_TYPE_UNSPEC
;
4991 case SO_RECV_TRAFFIC_CLASS
: {
4992 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
4997 so
->so_flags
&= ~SOF_RECV_TRAFFIC_CLASS
;
4999 so
->so_flags
|= SOF_RECV_TRAFFIC_CLASS
;
5003 #if (DEVELOPMENT || DEBUG)
5004 case SO_TRAFFIC_CLASS_DBG
: {
5005 struct so_tcdbg so_tcdbg
;
5007 error
= sooptcopyin(sopt
, &so_tcdbg
,
5008 sizeof (struct so_tcdbg
), sizeof (struct so_tcdbg
));
5011 error
= so_set_tcdbg(so
, &so_tcdbg
);
5016 #endif /* (DEVELOPMENT || DEBUG) */
5018 case SO_PRIVILEGED_TRAFFIC_CLASS
:
5019 error
= priv_check_cred(kauth_cred_get(),
5020 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS
, 0);
5023 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5028 so
->so_flags
&= ~SOF_PRIVILEGED_TRAFFIC_CLASS
;
5030 so
->so_flags
|= SOF_PRIVILEGED_TRAFFIC_CLASS
;
5034 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5036 if (error
!= 0 || (so
->so_flags
& SOF_DEFUNCT
)) {
5042 * Any process can set SO_DEFUNCTOK (clear
5043 * SOF_NODEFUNCT), but only root can clear
5044 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5047 kauth_cred_issuser(kauth_cred_get()) == 0) {
5052 so
->so_flags
&= ~SOF_NODEFUNCT
;
5054 so
->so_flags
|= SOF_NODEFUNCT
;
5056 if (SOCK_DOM(so
) == PF_INET
||
5057 SOCK_DOM(so
) == PF_INET6
) {
5058 char s
[MAX_IPv6_STR_LEN
];
5059 char d
[MAX_IPv6_STR_LEN
];
5060 struct inpcb
*inp
= sotoinpcb(so
);
5062 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5063 "[%s %s:%d -> %s:%d] is now marked "
5064 "as %seligible for "
5065 "defunct\n", __func__
, proc_selfpid(),
5066 proc_best_name(current_proc()),
5067 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
5068 (SOCK_TYPE(so
) == SOCK_STREAM
) ?
5069 "TCP" : "UDP", inet_ntop(SOCK_DOM(so
),
5070 ((SOCK_DOM(so
) == PF_INET
) ?
5071 (void *)&inp
->inp_laddr
.s_addr
:
5072 (void *)&inp
->in6p_laddr
), s
, sizeof (s
)),
5073 ntohs(inp
->in6p_lport
),
5074 inet_ntop(SOCK_DOM(so
),
5075 (SOCK_DOM(so
) == PF_INET
) ?
5076 (void *)&inp
->inp_faddr
.s_addr
:
5077 (void *)&inp
->in6p_faddr
, d
, sizeof (d
)),
5078 ntohs(inp
->in6p_fport
),
5079 (so
->so_flags
& SOF_NODEFUNCT
) ?
5082 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5083 "is now marked as %seligible for "
5085 __func__
, proc_selfpid(),
5086 proc_best_name(current_proc()),
5087 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
5088 SOCK_DOM(so
), SOCK_TYPE(so
),
5089 (so
->so_flags
& SOF_NODEFUNCT
) ?
5095 /* This option is not settable */
5099 case SO_OPPORTUNISTIC
:
5100 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5103 error
= so_set_opportunistic(so
, optval
);
5107 /* This option is handled by lower layer(s) */
5112 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5115 error
= so_set_recv_anyif(so
, optval
);
5118 case SO_TRAFFIC_MGT_BACKGROUND
: {
5119 /* This option is handled by lower layer(s) */
5125 case SO_FLOW_DIVERT_TOKEN
:
5126 error
= flow_divert_token_set(so
, sopt
);
5128 #endif /* FLOW_DIVERT */
5132 if ((error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5133 sizeof (optval
))) != 0)
5136 error
= so_set_effective_pid(so
, optval
, sopt
->sopt_p
);
5139 case SO_DELEGATED_UUID
: {
5142 if ((error
= sooptcopyin(sopt
, &euuid
, sizeof (euuid
),
5143 sizeof (euuid
))) != 0)
5146 error
= so_set_effective_uuid(so
, euuid
, sopt
->sopt_p
);
5151 case SO_NECP_ATTRIBUTES
:
5152 error
= necp_set_socket_attributes(so
, sopt
);
5157 case SO_MPTCP_FASTJOIN
:
5158 if (!((so
->so_flags
& SOF_MP_SUBFLOW
) ||
5159 ((SOCK_CHECK_DOM(so
, PF_MULTIPATH
)) &&
5160 (SOCK_CHECK_PROTO(so
, IPPROTO_TCP
))))) {
5161 error
= ENOPROTOOPT
;
5165 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5170 so
->so_flags
&= ~SOF_MPTCP_FASTJOIN
;
5172 so
->so_flags
|= SOF_MPTCP_FASTJOIN
;
5176 case SO_EXTENDED_BK_IDLE
:
5177 error
= sooptcopyin(sopt
, &optval
, sizeof (optval
),
5180 error
= so_set_extended_bk_idle(so
, optval
);
5183 case SO_MARK_CELLFALLBACK
:
5184 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5193 so
->so_flags1
&= ~SOF1_CELLFALLBACK
;
5195 so
->so_flags1
|= SOF1_CELLFALLBACK
;
5198 case SO_NET_SERVICE_TYPE
: {
5199 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5203 error
= so_set_net_service_type(so
, optval
);
5207 case SO_QOSMARKING_POLICY_OVERRIDE
:
5208 error
= priv_check_cred(kauth_cred_get(),
5209 PRIV_NET_QOSMARKING_POLICY_OVERRIDE
, 0);
5212 error
= sooptcopyin(sopt
, &optval
, sizeof(optval
),
5217 so
->so_flags1
&= ~SOF1_QOSMARKING_POLICY_OVERRIDE
;
5219 so
->so_flags1
|= SOF1_QOSMARKING_POLICY_OVERRIDE
;
5223 error
= ENOPROTOOPT
;
5226 if (error
== 0 && so
->so_proto
!= NULL
&&
5227 so
->so_proto
->pr_ctloutput
!= NULL
) {
5228 (void) so
->so_proto
->pr_ctloutput(so
, sopt
);
5233 socket_unlock(so
, 1);
/* Helper routines for getsockopt */
int
sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
{
	int	error;
	size_t	valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
	}
	return (error);
}
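/*
 * Illustrative user-space sketch (not from this file): the truncation
 * behavior documented above is visible through getsockopt(), which copies
 * out at most *optlen bytes and rewrites *optlen with the amount copied.
 */
#if 0
#include <sys/socket.h>
#include <stdio.h>

static void
show_truncation(int fd)
{
	struct linger l;
	socklen_t len = 1;	/* deliberately smaller than sizeof (l) */

	if (getsockopt(fd, SOL_SOCKET, SO_LINGER, &l, &len) == 0)
		printf("copied %u byte(s)\n", (unsigned int)len);
}
#endif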
static int
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
{
	int			error;
	size_t			len;
	struct user64_timeval	tv64;
	struct user32_timeval	tv32;
	const void		*val;
	size_t			valsize;

	error = 0;
	if (proc_is64bit(sopt->sopt_p)) {
		len = sizeof (tv64);
		tv64.tv_sec = tv_p->tv_sec;
		tv64.tv_usec = tv_p->tv_usec;
		val = &tv64;
	} else {
		len = sizeof (tv32);
		tv32.tv_sec = tv_p->tv_sec;
		tv32.tv_usec = tv_p->tv_usec;
		val = &tv32;
	}
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(val, sopt->sopt_val, valsize);
		else
			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
	}
	return (error);
}
/*
 * Returns:	0			Success
 *		<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *		<pr_ctloutput>:???
 *		<sf_getoption>:???
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#if CONFIG_MACF_SOCKET
	struct mac extmac;
#endif /* MAC_SOCKET */

	if (sopt->sopt_dir != SOPT_GET)
		sopt->sopt_dir = SOPT_GET;

	if (dolock)
		socket_lock(so, 1);

	error = sflt_getsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0)
			goto out;

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof (l));
			break;
		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof (optval));
			break;
5379 optval
= so
->so_type
;
5383 if (so
->so_proto
->pr_flags
& PR_ATOMIC
) {
5388 m1
= so
->so_rcv
.sb_mb
;
5389 while (m1
!= NULL
) {
5390 if (m1
->m_type
== MT_DATA
||
5391 m1
->m_type
== MT_HEADER
||
5392 m1
->m_type
== MT_OOBDATA
)
5393 pkt_total
+= m1
->m_len
;
5398 optval
= so
->so_rcv
.sb_cc
- so
->so_rcv
.sb_ctl
;
5403 if (so
->so_proto
->pr_flags
& PR_ATOMIC
) {
5407 m1
= so
->so_rcv
.sb_mb
;
5408 while (m1
!= NULL
) {
5409 if (m1
->m_type
== MT_DATA
||
5410 m1
->m_type
== MT_HEADER
||
5411 m1
->m_type
== MT_OOBDATA
)
5423 optval
= so
->so_snd
.sb_cc
;
5427 optval
= so
->so_error
;
5432 u_int32_t hiwat
= so
->so_snd
.sb_hiwat
;
5434 if (so
->so_snd
.sb_flags
& SB_UNIX
) {
5436 (struct unpcb
*)(so
->so_pcb
);
5437 if (unp
!= NULL
&& unp
->unp_conn
!= NULL
) {
5438 hiwat
+= unp
->unp_conn
->unp_cc
;
5446 optval
= so
->so_rcv
.sb_hiwat
;
5450 optval
= so
->so_snd
.sb_lowat
;
5454 optval
= so
->so_rcv
.sb_lowat
;
5459 tv
= (sopt
->sopt_name
== SO_SNDTIMEO
?
5460 so
->so_snd
.sb_timeo
: so
->so_rcv
.sb_timeo
);
5462 error
= sooptcopyout_timeval(sopt
, &tv
);
5466 optval
= (so
->so_flags
& SOF_NOSIGPIPE
);
5470 optval
= (so
->so_flags
& SOF_NOADDRAVAIL
);
5473 case SO_REUSESHAREUID
:
5474 optval
= (so
->so_flags
& SOF_REUSESHAREUID
);
5478 case SO_NOTIFYCONFLICT
:
5479 optval
= (so
->so_flags
& SOF_NOTIFYCONFLICT
);
5482 case SO_RESTRICTIONS
:
5483 optval
= so_get_restrictions(so
);
5486 case SO_AWDL_UNRESTRICTED
:
5487 if (SOCK_DOM(so
) == PF_INET
||
5488 SOCK_DOM(so
) == PF_INET6
) {
5489 optval
= inp_get_awdl_unrestricted(
5496 case SO_INTCOPROC_ALLOW
:
5497 if (SOCK_DOM(so
) == PF_INET6
) {
5498 optval
= inp_get_intcoproc_allowed(
5506 #if CONFIG_MACF_SOCKET
5507 if ((error
= sooptcopyin(sopt
, &extmac
, sizeof (extmac
),
5508 sizeof (extmac
))) != 0 ||
5509 (error
= mac_socket_label_get(proc_ucred(
5510 sopt
->sopt_p
), so
, &extmac
)) != 0)
5513 error
= sooptcopyout(sopt
, &extmac
, sizeof (extmac
));
5516 #endif /* MAC_SOCKET */
5520 #if CONFIG_MACF_SOCKET
5521 if ((error
= sooptcopyin(sopt
, &extmac
, sizeof (extmac
),
5522 sizeof (extmac
))) != 0 ||
5523 (error
= mac_socketpeer_label_get(proc_ucred(
5524 sopt
->sopt_p
), so
, &extmac
)) != 0)
5527 error
= sooptcopyout(sopt
, &extmac
, sizeof (extmac
));
5530 #endif /* MAC_SOCKET */
5533 #ifdef __APPLE_API_PRIVATE
5534 case SO_UPCALLCLOSEWAIT
:
5535 optval
= (so
->so_flags
& SOF_UPCALLCLOSEWAIT
);
5539 optval
= (so
->so_flags
& SOF_BINDRANDOMPORT
);
5542 case SO_NP_EXTENSIONS
: {
5543 struct so_np_extensions sonpx
;
5545 sonpx
.npx_flags
= (so
->so_flags
& SOF_NPX_SETOPTSHUT
) ?
5546 SONPX_SETOPTSHUT
: 0;
5547 sonpx
.npx_mask
= SONPX_MASK_VALID
;
5549 error
= sooptcopyout(sopt
, &sonpx
,
5550 sizeof (struct so_np_extensions
));
5554 case SO_TRAFFIC_CLASS
:
5555 optval
= so
->so_traffic_class
;
5558 case SO_RECV_TRAFFIC_CLASS
:
5559 optval
= (so
->so_flags
& SOF_RECV_TRAFFIC_CLASS
);
5562 case SO_TRAFFIC_CLASS_STATS
:
5563 error
= sooptcopyout(sopt
, &so
->so_tc_stats
,
5564 sizeof (so
->so_tc_stats
));
5567 #if (DEVELOPMENT || DEBUG)
5568 case SO_TRAFFIC_CLASS_DBG
:
5569 error
= sogetopt_tcdbg(so
, sopt
);
5571 #endif /* (DEVELOPMENT || DEBUG) */
5573 case SO_PRIVILEGED_TRAFFIC_CLASS
:
5574 optval
= (so
->so_flags
& SOF_PRIVILEGED_TRAFFIC_CLASS
);
5578 optval
= !(so
->so_flags
& SOF_NODEFUNCT
);
5582 optval
= (so
->so_flags
& SOF_DEFUNCT
);
5585 case SO_OPPORTUNISTIC
:
5586 optval
= so_get_opportunistic(so
);
5590 /* This option is not gettable */
5595 optval
= so_get_recv_anyif(so
);
5598 case SO_TRAFFIC_MGT_BACKGROUND
:
5599 /* This option is handled by lower layer(s) */
5600 if (so
->so_proto
!= NULL
&&
5601 so
->so_proto
->pr_ctloutput
!= NULL
) {
5602 (void) so
->so_proto
->pr_ctloutput(so
, sopt
);
5607 case SO_FLOW_DIVERT_TOKEN
:
5608 error
= flow_divert_token_get(so
, sopt
);
5610 #endif /* FLOW_DIVERT */
5613 case SO_NECP_ATTRIBUTES
:
5614 error
= necp_get_socket_attributes(so
, sopt
);
5619 case SO_CFIL_SOCK_ID
: {
5620 cfil_sock_id_t sock_id
;
5622 sock_id
= cfil_sock_id_from_socket(so
);
5624 error
= sooptcopyout(sopt
, &sock_id
,
5625 sizeof(cfil_sock_id_t
));
5628 #endif /* CONTENT_FILTER */
5631 case SO_MPTCP_FASTJOIN
:
5632 if (!((so
->so_flags
& SOF_MP_SUBFLOW
) ||
5633 ((SOCK_CHECK_DOM(so
, PF_MULTIPATH
)) &&
5634 (SOCK_CHECK_PROTO(so
, IPPROTO_TCP
))))) {
5635 error
= ENOPROTOOPT
;
5638 optval
= (so
->so_flags
& SOF_MPTCP_FASTJOIN
);
5639 /* Fixed along with rdar://19391339 */
5643 case SO_EXTENDED_BK_IDLE
:
5644 optval
= (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_WANTED
);
5646 case SO_MARK_CELLFALLBACK
:
5647 optval
= ((so
->so_flags1
& SOF1_CELLFALLBACK
) > 0)
5650 case SO_NET_SERVICE_TYPE
: {
5651 if ((so
->so_flags1
& SOF1_TC_NET_SERV_TYPE
))
5652 optval
= so
->so_netsvctype
;
5654 optval
= NET_SERVICE_TYPE_BE
;
5657 case SO_NETSVC_MARKING_LEVEL
:
5658 optval
= so_get_netsvc_marking_level(so
);
5662 error
= ENOPROTOOPT
;
5668 socket_unlock(so
, 1);
 * The size limits on our soopt_getm are different from those on FreeBSD.
 * We limit the size of options to MCLBYTES.  This will have to change
 * if we need to define options that need more space than MCLBYTES.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;
	int how;

	if (sopt_size <= 0 || sopt_size > MCLBYTES)
		return (EMSGSIZE);

	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	if (sopt_size > MLEN) {
		MCLGET(m, how);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (ENOBUFS);
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		if (sopt_size > MLEN) {
			MCLGET(m, how);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(*mp);
				m_freem(m);
				return (ENOBUFS);
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}
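/*
 * Illustrative in-kernel sketch (not from this file): a protocol ctloutput
 * path could pair soopt_getm() with soopt_mcopyin() to pull a large option
 * value into an mbuf chain; anything larger than MCLBYTES is refused before
 * any allocation.  The helper name is hypothetical.
 */
#if 0
static int
example_copyin_option(struct sockopt *sopt, struct mbuf **chain)
{
	int error;

	error = soopt_getm(sopt, chain);	/* EMSGSIZE if > MCLBYTES */
	if (error != 0)
		return (error);
	return (soopt_mcopyin(sopt, *chain));	/* frees the chain on error */
}
#endif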
/* copyin sopt data into mbuf chain */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
			    mtod(m, char *), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		m = m->m_next;
	}
	/* enough mbufs should have been allocated by ip6_sooptmcopyin() */
	if (m != NULL) {
		panic("soopt_mcopyin");
		/* NOTREACHED */
	}
	return (0);
}
/* copyout mbuf chain data into soopt */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == USER_ADDR_NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else {
			bcopy(mtod(m, char *),
			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* the caller should have supplied a large enough soopt buffer */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}
void
sohasoutofband(struct socket *so)
{
	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0)
		proc_signal(so->so_pgid, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);
	if (so->so_rcv.sb_flags & SB_KNOTE) {
		KNOTE(&so->so_rcv.sb_sel.si_note,
		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
	}
}
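/*
 * Illustrative user-space sketch (not from this file): sohasoutofband()
 * signals the owner recorded in so_pgid, which user space establishes with
 * fcntl(F_SETOWN); the handler can then read the urgent byte with MSG_OOB.
 */
#if 0
#include <sys/socket.h>
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void
on_sigurg(int sig)
{
	(void)sig;	/* out-of-band data is pending on the watched socket */
}

static void
watch_oob(int fd)
{
	signal(SIGURG, on_sigurg);
	fcntl(fd, F_SETOWN, getpid());	/* records the owner in so_pgid */
}
#endif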
int
sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
{
#pragma unused(cred)
	struct proc *p = current_proc();
	int revents = 0;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_rcv.sb_flags |= SB_SEL;
			selrecord(p, &so->so_rcv.sb_sel, wql);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_snd.sb_flags |= SB_SEL;
			selrecord(p, &so->so_snd.sb_sel, wql);
		}
	}

	socket_unlock(so, 1);
	return (revents);
}
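/*
 * Illustrative user-space sketch (not from this file): the POLLPRI/POLLRDBAND
 * branch above reports the out-of-band mark, while POLLIN reports readability.
 */
#if 0
#include <poll.h>
#include <stdio.h>

static void
wait_for_data_or_oob(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };

	if (poll(&pfd, 1, -1) > 0) {
		if (pfd.revents & POLLPRI)
			printf("out-of-band mark reached\n");
		if (pfd.revents & POLLIN)
			printf("normal data readable\n");
	}
}
#endif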
5862 soo_kqfilter(struct fileproc
*fp
, struct knote
*kn
, vfs_context_t ctx
)
5865 #if !CONFIG_MACF_SOCKET
5867 #endif /* MAC_SOCKET */
5868 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
5872 so_update_last_owner_locked(so
, PROC_NULL
);
5873 so_update_policy(so
);
5875 #if CONFIG_MACF_SOCKET
5876 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx
)),
5878 socket_unlock(so
, 1);
5879 kn
->kn_flags
= EV_ERROR
;
5880 kn
->kn_data
= EPERM
;
5883 #endif /* MAC_SOCKET */
5885 switch (kn
->kn_filter
) {
5887 kn
->kn_filtid
= EVFILTID_SOREAD
;
5890 kn
->kn_filtid
= EVFILTID_SOWRITE
;
5893 kn
->kn_filtid
= EVFILTID_SCK
;
5896 kn
->kn_filtid
= EVFILTID_SOEXCEPT
;
5899 socket_unlock(so
, 1);
5900 kn
->kn_flags
= EV_ERROR
;
5901 kn
->kn_data
= EINVAL
;
5906 * call the appropriate sub-filter attach
5907 * with the socket still locked
5909 result
= knote_fops(kn
)->f_attach(kn
);
5911 socket_unlock(so
, 1);
5917 filt_soread_common(struct knote
*kn
, struct socket
*so
)
5919 if (so
->so_options
& SO_ACCEPTCONN
) {
		 * Radar 6615193: handle the listen case dynamically
		 * for the kqueue read filter.  This allows listen() to
		 * be called after the kqueue EVFILT_READ filter has
		 * been registered.
5928 kn
->kn_data
= so
->so_qlen
;
5929 is_not_empty
= ! TAILQ_EMPTY(&so
->so_comp
);
5931 return (is_not_empty
);
5934 /* socket isn't a listener */
	 * NOTE_LOWAT specifies the new low-water mark in data, i.e.
	 * the bytes of protocol data.  We therefore exclude any
	 * control bytes.
	 */
5940 kn
->kn_data
= so
->so_rcv
.sb_cc
- so
->so_rcv
.sb_ctl
;
5942 if (kn
->kn_sfflags
& NOTE_OOB
) {
5943 if (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
)) {
5944 kn
->kn_fflags
|= NOTE_OOB
;
5945 kn
->kn_data
-= so
->so_oobmark
;
5950 if ((so
->so_state
& SS_CANTRCVMORE
)
5952 && cfil_sock_data_pending(&so
->so_rcv
) == 0
5953 #endif /* CONTENT_FILTER */
5955 kn
->kn_flags
|= EV_EOF
;
5956 kn
->kn_fflags
= so
->so_error
;
5960 if (so
->so_error
) { /* temporary udp error */
5964 int64_t lowwat
= so
->so_rcv
.sb_lowat
;
5966 * Ensure that when NOTE_LOWAT is used, the derived
5967 * low water mark is bounded by socket's rcv buf's
5968 * high and low water mark values.
5970 if (kn
->kn_sfflags
& NOTE_LOWAT
) {
5971 if (kn
->kn_sdata
> so
->so_rcv
.sb_hiwat
)
5972 lowwat
= so
->so_rcv
.sb_hiwat
;
5973 else if (kn
->kn_sdata
> lowwat
)
5974 lowwat
= kn
->kn_sdata
;
	 * The order below is important.  Since NOTE_LOWAT
	 * overrides sb_lowat, check for the NOTE_LOWAT case
	 * first.
5982 if (kn
->kn_sfflags
& NOTE_LOWAT
)
5983 return (kn
->kn_data
>= lowwat
);
5985 return (so
->so_rcv
.sb_cc
>= lowwat
);
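/*
 * Illustrative user-space sketch (not from this file): NOTE_LOWAT as handled
 * above overrides sb_lowat but is clamped to the receive high-water mark; the
 * threshold is passed in the kevent data field.
 */
#if 0
#include <sys/event.h>

static int
wait_for_bytes(int kq, int fd, int64_t lowat)
{
	struct kevent kev;

	/* fire only once at least lowat bytes of data are buffered */
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (-1);
	return (kevent(kq, NULL, 0, &kev, 1, NULL));
}
#endif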
5989 filt_sorattach(struct knote
*kn
)
5991 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
5996 * If the caller explicitly asked for OOB results (e.g. poll())
5997 * from EVFILT_READ, then save that off in the hookid field
5998 * and reserve the kn_flags EV_OOBAND bit for output only.
6000 if (kn
->kn_filter
== EVFILT_READ
&&
6001 kn
->kn_flags
& EV_OOBAND
) {
6002 kn
->kn_flags
&= ~EV_OOBAND
;
6003 kn
->kn_hookid
= EV_OOBAND
;
6007 if (KNOTE_ATTACH(&so
->so_rcv
.sb_sel
.si_note
, kn
))
6008 so
->so_rcv
.sb_flags
|= SB_KNOTE
;
6010 /* indicate if event is already fired */
6011 return filt_soread_common(kn
, so
);
6015 filt_sordetach(struct knote
*kn
)
6017 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6020 if (so
->so_rcv
.sb_flags
& SB_KNOTE
)
6021 if (KNOTE_DETACH(&so
->so_rcv
.sb_sel
.si_note
, kn
))
6022 so
->so_rcv
.sb_flags
&= ~SB_KNOTE
;
6023 socket_unlock(so
, 1);
6028 filt_soread(struct knote
*kn
, long hint
)
6030 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6033 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
6036 retval
= filt_soread_common(kn
, so
);
6038 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
6039 socket_unlock(so
, 1);
6045 filt_sortouch(struct knote
*kn
, struct kevent_internal_s
*kev
)
6047 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6052 /* save off the new input fflags and data */
6053 kn
->kn_sfflags
= kev
->fflags
;
6054 kn
->kn_sdata
= kev
->data
;
6055 if ((kn
->kn_status
& KN_UDATA_SPECIFIC
) == 0)
6056 kn
->kn_udata
= kev
->udata
;
6058 /* determine if changes result in fired events */
6059 retval
= filt_soread_common(kn
, so
);
6061 socket_unlock(so
, 1);
6067 filt_sorprocess(struct knote
*kn
, struct filt_process_s
*data
, struct kevent_internal_s
*kev
)
6069 #pragma unused(data)
6070 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6074 retval
= filt_soread_common(kn
, so
);
6076 *kev
= kn
->kn_kevent
;
6077 if (kn
->kn_flags
& EV_CLEAR
) {
6082 socket_unlock(so
, 1);
6088 so_wait_for_if_feedback(struct socket
*so
)
6090 if ((SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
) &&
6091 (so
->so_state
& SS_ISCONNECTED
)) {
6092 struct inpcb
*inp
= sotoinpcb(so
);
6093 if (INP_WAIT_FOR_IF_FEEDBACK(inp
))
6100 filt_sowrite_common(struct knote
*kn
, struct socket
*so
)
6104 kn
->kn_data
= sbspace(&so
->so_snd
);
6105 if (so
->so_state
& SS_CANTSENDMORE
) {
6106 kn
->kn_flags
|= EV_EOF
;
6107 kn
->kn_fflags
= so
->so_error
;
6110 if (so
->so_error
) { /* temporary udp error */
6113 if (!socanwrite(so
)) {
6116 if (so
->so_flags1
& SOF1_PRECONNECT_DATA
) {
6119 int64_t lowwat
= so
->so_snd
.sb_lowat
;
6120 if (kn
->kn_sfflags
& NOTE_LOWAT
) {
6121 if (kn
->kn_sdata
> so
->so_snd
.sb_hiwat
)
6122 lowwat
= so
->so_snd
.sb_hiwat
;
6123 else if (kn
->kn_sdata
> lowwat
)
6124 lowwat
= kn
->kn_sdata
;
6126 if (kn
->kn_data
>= lowwat
) {
6127 if ((so
->so_flags
& SOF_NOTSENT_LOWAT
)
6128 #if (DEBUG || DEVELOPMENT)
6129 && so_notsent_lowat_check
== 1
6130 #endif /* DEBUG || DEVELOPMENT */
6132 if ((SOCK_DOM(so
) == PF_INET
||
6133 SOCK_DOM(so
) == PF_INET6
) &&
6134 so
->so_type
== SOCK_STREAM
) {
6135 ret
= tcp_notsent_lowat_check(so
);
6138 else if ((SOCK_DOM(so
) == PF_MULTIPATH
) &&
6139 (SOCK_PROTO(so
) == IPPROTO_TCP
)) {
6140 ret
= mptcp_notsent_lowat_check(so
);
6150 if (so_wait_for_if_feedback(so
))
6156 filt_sowattach(struct knote
*kn
)
6158 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6161 if (KNOTE_ATTACH(&so
->so_snd
.sb_sel
.si_note
, kn
))
6162 so
->so_snd
.sb_flags
|= SB_KNOTE
;
	/* determine if it has already fired */
6165 return filt_sowrite_common(kn
, so
);
6169 filt_sowdetach(struct knote
*kn
)
6171 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6174 if (so
->so_snd
.sb_flags
& SB_KNOTE
)
6175 if (KNOTE_DETACH(&so
->so_snd
.sb_sel
.si_note
, kn
))
6176 so
->so_snd
.sb_flags
&= ~SB_KNOTE
;
6177 socket_unlock(so
, 1);
6182 filt_sowrite(struct knote
*kn
, long hint
)
6184 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6187 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
6190 ret
= filt_sowrite_common(kn
, so
);
6192 if ((hint
& SO_FILT_HINT_LOCKED
) == 0)
6193 socket_unlock(so
, 1);
6199 filt_sowtouch(struct knote
*kn
, struct kevent_internal_s
*kev
)
6201 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
	/* save off the new input fflags and data */
6207 kn
->kn_sfflags
= kev
->fflags
;
6208 kn
->kn_sdata
= kev
->data
;
6209 if ((kn
->kn_status
& KN_UDATA_SPECIFIC
) == 0)
6210 kn
->kn_udata
= kev
->udata
;
6212 /* determine if these changes result in a triggered event */
6213 ret
= filt_sowrite_common(kn
, so
);
6215 socket_unlock(so
, 1);
6221 filt_sowprocess(struct knote
*kn
, struct filt_process_s
*data
, struct kevent_internal_s
*kev
)
6223 #pragma unused(data)
6224 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6228 ret
= filt_sowrite_common(kn
, so
);
6230 *kev
= kn
->kn_kevent
;
6231 if (kn
->kn_flags
& EV_CLEAR
) {
6236 socket_unlock(so
, 1);
6241 filt_sockev_common(struct knote
*kn
, struct socket
*so
, long ev_hint
)
6244 uint32_t level_trigger
= 0;
6246 if (ev_hint
& SO_FILT_HINT_CONNRESET
) {
6247 kn
->kn_fflags
|= NOTE_CONNRESET
;
6249 if (ev_hint
& SO_FILT_HINT_TIMEOUT
) {
6250 kn
->kn_fflags
|= NOTE_TIMEOUT
;
6252 if (ev_hint
& SO_FILT_HINT_NOSRCADDR
) {
6253 kn
->kn_fflags
|= NOTE_NOSRCADDR
;
6255 if (ev_hint
& SO_FILT_HINT_IFDENIED
) {
6256 kn
->kn_fflags
|= NOTE_IFDENIED
;
6258 if (ev_hint
& SO_FILT_HINT_KEEPALIVE
) {
6259 kn
->kn_fflags
|= NOTE_KEEPALIVE
;
6261 if (ev_hint
& SO_FILT_HINT_ADAPTIVE_WTIMO
) {
6262 kn
->kn_fflags
|= NOTE_ADAPTIVE_WTIMO
;
6264 if (ev_hint
& SO_FILT_HINT_ADAPTIVE_RTIMO
) {
6265 kn
->kn_fflags
|= NOTE_ADAPTIVE_RTIMO
;
6267 if ((ev_hint
& SO_FILT_HINT_CONNECTED
) ||
6268 (so
->so_state
& SS_ISCONNECTED
)) {
6269 kn
->kn_fflags
|= NOTE_CONNECTED
;
6270 level_trigger
|= NOTE_CONNECTED
;
6272 if ((ev_hint
& SO_FILT_HINT_DISCONNECTED
) ||
6273 (so
->so_state
& SS_ISDISCONNECTED
)) {
6274 kn
->kn_fflags
|= NOTE_DISCONNECTED
;
6275 level_trigger
|= NOTE_DISCONNECTED
;
6277 if (ev_hint
& SO_FILT_HINT_CONNINFO_UPDATED
) {
6278 if (so
->so_proto
!= NULL
&&
6279 (so
->so_proto
->pr_flags
& PR_EVCONNINFO
))
6280 kn
->kn_fflags
|= NOTE_CONNINFO_UPDATED
;
6283 if ((ev_hint
& SO_FILT_HINT_NOTIFY_ACK
) ||
6284 tcp_notify_ack_active(so
)) {
6285 kn
->kn_fflags
|= NOTE_NOTIFY_ACK
;
6288 if ((so
->so_state
& SS_CANTRCVMORE
)
6290 && cfil_sock_data_pending(&so
->so_rcv
) == 0
6291 #endif /* CONTENT_FILTER */
6293 kn
->kn_fflags
|= NOTE_READCLOSED
;
6294 level_trigger
|= NOTE_READCLOSED
;
6297 if (so
->so_state
& SS_CANTSENDMORE
) {
6298 kn
->kn_fflags
|= NOTE_WRITECLOSED
;
6299 level_trigger
|= NOTE_WRITECLOSED
;
6302 if ((ev_hint
& SO_FILT_HINT_SUSPEND
) ||
6303 (so
->so_flags
& SOF_SUSPENDED
)) {
6304 kn
->kn_fflags
&= ~(NOTE_SUSPEND
| NOTE_RESUME
);
6306 /* If resume event was delivered before, reset it */
6307 kn
->kn_hookid
&= ~NOTE_RESUME
;
6309 kn
->kn_fflags
|= NOTE_SUSPEND
;
6310 level_trigger
|= NOTE_SUSPEND
;
6313 if ((ev_hint
& SO_FILT_HINT_RESUME
) ||
6314 (so
->so_flags
& SOF_SUSPENDED
) == 0) {
6315 kn
->kn_fflags
&= ~(NOTE_SUSPEND
| NOTE_RESUME
);
6317 /* If suspend event was delivered before, reset it */
6318 kn
->kn_hookid
&= ~NOTE_SUSPEND
;
6320 kn
->kn_fflags
|= NOTE_RESUME
;
6321 level_trigger
|= NOTE_RESUME
;
6324 if (so
->so_error
!= 0) {
6326 kn
->kn_data
= so
->so_error
;
6327 kn
->kn_flags
|= EV_EOF
;
6329 get_sockev_state(so
, (u_int32_t
*)&(kn
->kn_data
));
6332 /* Reset any events that are not requested on this knote */
6333 kn
->kn_fflags
&= (kn
->kn_sfflags
& EVFILT_SOCK_ALL_MASK
);
6334 level_trigger
&= (kn
->kn_sfflags
& EVFILT_SOCK_ALL_MASK
);
	/* Find the level-triggered events that are already delivered */
6337 level_trigger
&= kn
->kn_hookid
;
6338 level_trigger
&= EVFILT_SOCK_LEVEL_TRIGGER_MASK
;
	/* Do not deliver level-triggered events more than once */
6341 if ((kn
->kn_fflags
& ~level_trigger
) != 0)
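/*
 * Illustrative sketch (not from this file): registering for the socket event
 * filter whose level-triggered handling is implemented above.  EVFILT_SOCK
 * and its NOTE_* flags are private SPI and may not be available to ordinary
 * user-space builds.
 */
#if 0
#include <sys/event.h>

static int
watch_socket_state(int kq, int fd)
{
	struct kevent kev;

	/* CONNECTED/DISCONNECTED are delivered as level-triggered events */
	EV_SET(&kev, fd, EVFILT_SOCK, EV_ADD | EV_CLEAR,
	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNRESET, 0, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}
#endif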
6348 filt_sockattach(struct knote
*kn
)
6350 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6354 if (KNOTE_ATTACH(&so
->so_klist
, kn
))
6355 so
->so_flags
|= SOF_KNOTE
;
6357 /* determine if event already fired */
6358 return filt_sockev_common(kn
, so
, 0);
6362 filt_sockdetach(struct knote
*kn
)
6364 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6367 if ((so
->so_flags
& SOF_KNOTE
) != 0)
6368 if (KNOTE_DETACH(&so
->so_klist
, kn
))
6369 so
->so_flags
&= ~SOF_KNOTE
;
6370 socket_unlock(so
, 1);
6374 filt_sockev(struct knote
*kn
, long hint
)
6376 int ret
= 0, locked
= 0;
6377 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6378 long ev_hint
= (hint
& SO_FILT_HINT_EV
);
6380 if ((hint
& SO_FILT_HINT_LOCKED
) == 0) {
6385 ret
= filt_sockev_common(kn
, so
, ev_hint
);
6388 socket_unlock(so
, 1);
6396 * filt_socktouch - update event state
6401 struct kevent_internal_s
*kev
)
6403 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6404 uint32_t changed_flags
;
6409 /* save off the [result] data and fflags */
6410 changed_flags
= (kn
->kn_sfflags
^ kn
->kn_hookid
);
6412 /* save off the new input fflags and data */
6413 kn
->kn_sfflags
= kev
->fflags
;
6414 kn
->kn_sdata
= kev
->data
;
6415 if ((kn
->kn_status
& KN_UDATA_SPECIFIC
) == 0)
6416 kn
->kn_udata
= kev
->udata
;
6418 /* restrict the current results to the (smaller?) set of new interest */
6420 * For compatibility with previous implementations, we leave kn_fflags
6421 * as they were before.
6423 //kn->kn_fflags &= kev->fflags;
6426 * Since we keep track of events that are already
6427 * delivered, if any of those events are not requested
6428 * anymore the state related to them can be reset
6431 ~(changed_flags
& EVFILT_SOCK_LEVEL_TRIGGER_MASK
);
6433 /* determine if we have events to deliver */
6434 ret
= filt_sockev_common(kn
, so
, 0);
6436 socket_unlock(so
, 1);
6442 * filt_sockprocess - query event fired state and return data
6447 struct filt_process_s
*data
,
6448 struct kevent_internal_s
*kev
)
6450 #pragma unused(data)
6452 struct socket
*so
= (struct socket
*)kn
->kn_fp
->f_fglob
->fg_data
;
6457 ret
= filt_sockev_common(kn
, so
, 0);
6459 *kev
= kn
->kn_kevent
;
	 * Store the state of the events being delivered.  This
	 * state can be used to deliver level-triggered events
	 * at least once and still avoid waking up the application
	 * multiple times as long as the event is active.
6467 if (kn
->kn_fflags
!= 0)
6468 kn
->kn_hookid
|= (kn
->kn_fflags
&
6469 EVFILT_SOCK_LEVEL_TRIGGER_MASK
);
6472 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6473 * only one of them and remember the last one that was
6476 if (kn
->kn_fflags
& NOTE_SUSPEND
)
6477 kn
->kn_hookid
&= ~NOTE_RESUME
;
6478 if (kn
->kn_fflags
& NOTE_RESUME
)
6479 kn
->kn_hookid
&= ~NOTE_SUSPEND
;
6481 if (kn
->kn_flags
& EV_CLEAR
) {
6487 socket_unlock(so
, 1);
6493 get_sockev_state(struct socket
*so
, u_int32_t
*statep
)
6495 u_int32_t state
= *(statep
);
6498 * If the state variable is already used by a previous event,
6504 if (so
->so_state
& SS_ISCONNECTED
)
6505 state
|= SOCKEV_CONNECTED
;
6507 state
&= ~(SOCKEV_CONNECTED
);
6508 state
|= ((so
->so_state
& SS_ISDISCONNECTED
) ? SOCKEV_DISCONNECTED
: 0);
6512 #define SO_LOCK_HISTORY_STR_LEN \
6513 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6515 __private_extern__
const char *
6516 solockhistory_nr(struct socket
*so
)
6520 static char lock_history_str
[SO_LOCK_HISTORY_STR_LEN
];
6522 bzero(lock_history_str
, sizeof (lock_history_str
));
6523 for (i
= SO_LCKDBG_MAX
- 1; i
>= 0; i
--) {
6524 n
+= snprintf(lock_history_str
+ n
,
6525 SO_LOCK_HISTORY_STR_LEN
- n
, "%p:%p ",
6526 so
->lock_lr
[(so
->next_lock_lr
+ i
) % SO_LCKDBG_MAX
],
6527 so
->unlock_lr
[(so
->next_unlock_lr
+ i
) % SO_LCKDBG_MAX
]);
6529 return (lock_history_str
);
6533 socket_lock(struct socket
*so
, int refcount
)
6538 lr_saved
= __builtin_return_address(0);
6540 if (so
->so_proto
->pr_lock
) {
6541 error
= (*so
->so_proto
->pr_lock
)(so
, refcount
, lr_saved
);
6543 #ifdef MORE_LOCKING_DEBUG
6544 lck_mtx_assert(so
->so_proto
->pr_domain
->dom_mtx
,
6545 LCK_MTX_ASSERT_NOTOWNED
);
6547 lck_mtx_lock(so
->so_proto
->pr_domain
->dom_mtx
);
6550 so
->lock_lr
[so
->next_lock_lr
] = lr_saved
;
6551 so
->next_lock_lr
= (so
->next_lock_lr
+1) % SO_LCKDBG_MAX
;
6558 socket_unlock(struct socket
*so
, int refcount
)
6562 lck_mtx_t
*mutex_held
;
6564 lr_saved
= __builtin_return_address(0);
6566 if (so
->so_proto
== NULL
) {
6567 panic("%s: null so_proto so=%p\n", __func__
, so
);
6571 if (so
&& so
->so_proto
->pr_unlock
) {
6572 error
= (*so
->so_proto
->pr_unlock
)(so
, refcount
, lr_saved
);
6574 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
6575 #ifdef MORE_LOCKING_DEBUG
6576 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
6578 so
->unlock_lr
[so
->next_unlock_lr
] = lr_saved
;
6579 so
->next_unlock_lr
= (so
->next_unlock_lr
+1) % SO_LCKDBG_MAX
;
6582 if (so
->so_usecount
<= 0) {
6583 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6584 "lrh=%s", __func__
, so
->so_usecount
, so
,
6585 SOCK_DOM(so
), so
->so_type
,
6586 SOCK_PROTO(so
), solockhistory_nr(so
));
6591 if (so
->so_usecount
== 0)
6592 sofreelastref(so
, 1);
6594 lck_mtx_unlock(mutex_held
);
6600 /* Called with socket locked, will unlock socket */
6602 sofree(struct socket
*so
)
6604 lck_mtx_t
*mutex_held
;
6606 if (so
->so_proto
->pr_getlock
!= NULL
)
6607 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
6609 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
6610 lck_mtx_assert(mutex_held
, LCK_MTX_ASSERT_OWNED
);
6612 sofreelastref(so
, 0);
6616 soreference(struct socket
*so
)
6618 socket_lock(so
, 1); /* locks & take one reference on socket */
6619 socket_unlock(so
, 0); /* unlock only */
6623 sodereference(struct socket
*so
)
6626 socket_unlock(so
, 1);
6630 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6631 * possibility of using jumbo clusters. Caller must ensure to hold
6635 somultipages(struct socket
*so
, boolean_t set
)
6638 so
->so_flags
|= SOF_MULTIPAGES
;
6640 so
->so_flags
&= ~SOF_MULTIPAGES
;
6644 soif2kcl(struct socket
*so
, boolean_t set
)
6647 so
->so_flags1
|= SOF1_IF_2KCL
;
6649 so
->so_flags1
&= ~SOF1_IF_2KCL
;
6653 so_isdstlocal(struct socket
*so
) {
6655 struct inpcb
*inp
= (struct inpcb
*)so
->so_pcb
;
6657 if (SOCK_DOM(so
) == PF_INET
)
6658 return (inaddr_local(inp
->inp_faddr
));
6659 else if (SOCK_DOM(so
) == PF_INET6
)
6660 return (in6addr_local(&inp
->in6p_faddr
));
6666 sosetdefunct(struct proc
*p
, struct socket
*so
, int level
, boolean_t noforce
)
6668 struct sockbuf
*rcv
, *snd
;
6669 int err
= 0, defunct
;
6674 defunct
= (so
->so_flags
& SOF_DEFUNCT
);
6676 if (!(snd
->sb_flags
& rcv
->sb_flags
& SB_DROP
)) {
6677 panic("%s: SB_DROP not set", __func__
);
6683 if (so
->so_flags
& SOF_NODEFUNCT
) {
6686 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6687 "name %s level %d) so 0x%llx [%d,%d] "
6688 "is not eligible for defunct "
6689 "(%d)\n", __func__
, proc_selfpid(),
6690 proc_best_name(current_proc()), proc_pid(p
),
6691 proc_best_name(p
), level
,
6692 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
6693 SOCK_DOM(so
), SOCK_TYPE(so
), err
);
6696 so
->so_flags
&= ~SOF_NODEFUNCT
;
6697 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6698 "so 0x%llx [%d,%d] defunct by force\n", __func__
,
6699 proc_selfpid(), proc_best_name(current_proc()),
6700 proc_pid(p
), proc_best_name(p
), level
,
6701 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
6702 SOCK_DOM(so
), SOCK_TYPE(so
));
6703 } else if (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_WANTED
) {
6704 struct inpcb
*inp
= (struct inpcb
*)so
->so_pcb
;
6705 struct ifnet
*ifp
= inp
->inp_last_outifp
;
6707 if (ifp
&& IFNET_IS_CELLULAR(ifp
)) {
6708 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_nocell
);
6709 } else if (so
->so_flags
& SOF_DELEGATED
) {
6710 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_nodlgtd
);
6711 } else if (soextbkidlestat
.so_xbkidle_time
== 0) {
6712 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_notime
);
6713 } else if (noforce
) {
6714 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_active
);
6716 so
->so_flags1
|= SOF1_EXTEND_BK_IDLE_INPROG
;
6717 so
->so_extended_bk_start
= net_uptime();
6718 OSBitOrAtomic(P_LXBKIDLEINPROG
, &p
->p_ladvflag
);
6720 inpcb_timer_sched(inp
->inp_pcbinfo
, INPCB_TIMER_LAZY
);
6723 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
6724 "level %d) extend bk idle so 0x%llx rcv hw %d "
6726 __func__
, proc_selfpid(),
6727 proc_best_name(current_proc()), proc_pid(p
),
6728 proc_best_name(p
), level
,
6729 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
6730 so
->so_rcv
.sb_hiwat
, so
->so_rcv
.sb_cc
);
6733 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_forced
);
6737 so
->so_flags
|= SOF_DEFUNCT
;
6739 /* Prevent further data from being appended to the socket buffers */
6740 snd
->sb_flags
|= SB_DROP
;
6741 rcv
->sb_flags
|= SB_DROP
;
6743 /* Flush any existing data in the socket buffers */
6744 if (rcv
->sb_cc
!= 0) {
6745 rcv
->sb_flags
&= ~SB_SEL
;
6746 selthreadclear(&rcv
->sb_sel
);
6749 if (snd
->sb_cc
!= 0) {
6750 snd
->sb_flags
&= ~SB_SEL
;
6751 selthreadclear(&snd
->sb_sel
);
6756 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6757 "so 0x%llx [%d,%d] %s defunct%s\n", __func__
, proc_selfpid(),
6758 proc_best_name(current_proc()), proc_pid(p
), proc_best_name(p
),
6759 level
, (uint64_t)DEBUG_KERNEL_ADDRPERM(so
), SOCK_DOM(so
),
6760 SOCK_TYPE(so
), defunct
? "is already" : "marked as",
6761 (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_WANTED
) ? " extbkidle" : "");
6767 sodefunct(struct proc
*p
, struct socket
*so
, int level
)
6769 struct sockbuf
*rcv
, *snd
;
6771 if (!(so
->so_flags
& SOF_DEFUNCT
)) {
6772 panic("%s improperly called", __func__
);
6775 if (so
->so_state
& SS_DEFUNCT
)
6781 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
) {
6782 char s
[MAX_IPv6_STR_LEN
];
6783 char d
[MAX_IPv6_STR_LEN
];
6784 struct inpcb
*inp
= sotoinpcb(so
);
6786 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6787 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
6788 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
6789 __func__
, proc_selfpid(), proc_best_name(current_proc()),
6790 proc_pid(p
), proc_best_name(p
), level
,
6791 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
6792 (SOCK_TYPE(so
) == SOCK_STREAM
) ? "TCP" : "UDP",
6793 inet_ntop(SOCK_DOM(so
), ((SOCK_DOM(so
) == PF_INET
) ?
6794 (void *)&inp
->inp_laddr
.s_addr
: (void *)&inp
->in6p_laddr
),
6795 s
, sizeof (s
)), ntohs(inp
->in6p_lport
),
6796 inet_ntop(SOCK_DOM(so
), (SOCK_DOM(so
) == PF_INET
) ?
6797 (void *)&inp
->inp_faddr
.s_addr
: (void *)&inp
->in6p_faddr
,
6798 d
, sizeof (d
)), ntohs(inp
->in6p_fport
),
6799 (uint32_t)rcv
->sb_sel
.si_flags
,
6800 (uint32_t)snd
->sb_sel
.si_flags
,
6801 rcv
->sb_flags
, snd
->sb_flags
);
6803 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6804 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
6805 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__
,
6806 proc_selfpid(), proc_best_name(current_proc()),
6807 proc_pid(p
), proc_best_name(p
), level
,
6808 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
6809 SOCK_DOM(so
), SOCK_TYPE(so
),
6810 (uint32_t)rcv
->sb_sel
.si_flags
,
6811 (uint32_t)snd
->sb_sel
.si_flags
, rcv
->sb_flags
,
6816 * Unwedge threads blocked on sbwait() and sb_lock().
6821 so
->so_flags1
|= SOF1_DEFUNCTINPROG
;
6822 if (rcv
->sb_flags
& SB_LOCK
)
6823 sbunlock(rcv
, TRUE
); /* keep socket locked */
6824 if (snd
->sb_flags
& SB_LOCK
)
6825 sbunlock(snd
, TRUE
); /* keep socket locked */
6828 * Flush the buffers and disconnect. We explicitly call shutdown
6829 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6830 * states are set for the socket. This would also flush out data
6831 * hanging off the receive list of this socket.
6833 (void) soshutdownlock_final(so
, SHUT_RD
);
6834 (void) soshutdownlock_final(so
, SHUT_WR
);
6835 (void) sodisconnectlocked(so
);
6838 * Explicitly handle connectionless-protocol disconnection
6839 * and release any remaining data in the socket buffers.
6841 if (!(so
->so_flags
& SS_ISDISCONNECTED
))
6842 (void) soisdisconnected(so
);
6844 if (so
->so_error
== 0)
6845 so
->so_error
= EBADF
;
6847 if (rcv
->sb_cc
!= 0) {
6848 rcv
->sb_flags
&= ~SB_SEL
;
6849 selthreadclear(&rcv
->sb_sel
);
6852 if (snd
->sb_cc
!= 0) {
6853 snd
->sb_flags
&= ~SB_SEL
;
6854 selthreadclear(&snd
->sb_sel
);
6857 so
->so_state
|= SS_DEFUNCT
;
6858 OSIncrementAtomicLong((volatile long *)&sodefunct_calls
);
6865 soresume(struct proc
*p
, struct socket
*so
, int locked
)
6870 if (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_INPROG
) {
6871 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
6872 "[%d,%d] resumed from bk idle\n",
6873 __func__
, proc_selfpid(), proc_best_name(current_proc()),
6874 proc_pid(p
), proc_best_name(p
),
6875 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
6876 SOCK_DOM(so
), SOCK_TYPE(so
));
6878 so
->so_flags1
&= ~SOF1_EXTEND_BK_IDLE_INPROG
;
6879 so
->so_extended_bk_start
= 0;
6880 OSBitAndAtomic(~P_LXBKIDLEINPROG
, &p
->p_ladvflag
);
6882 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_resumed
);
6883 OSDecrementAtomic(&soextbkidlestat
.so_xbkidle_active
);
6884 VERIFY(soextbkidlestat
.so_xbkidle_active
>= 0);
6887 socket_unlock(so
, 1);
6893 * Does not attempt to account for sockets that are delegated from
6894 * the current process
6897 so_set_extended_bk_idle(struct socket
*so
, int optval
)
6901 if ((SOCK_DOM(so
) != PF_INET
&& SOCK_DOM(so
) != PF_INET6
) ||
6902 SOCK_PROTO(so
) != IPPROTO_TCP
) {
6903 OSDecrementAtomic(&soextbkidlestat
.so_xbkidle_notsupp
);
6905 } else if (optval
== 0) {
6906 so
->so_flags1
&= ~SOF1_EXTEND_BK_IDLE_WANTED
;
6908 soresume(current_proc(), so
, 1);
6910 struct proc
*p
= current_proc();
6912 struct filedesc
*fdp
;
6918 for (i
= 0; i
< fdp
->fd_nfiles
; i
++) {
6919 struct fileproc
*fp
= fdp
->fd_ofiles
[i
];
6923 (fdp
->fd_ofileflags
[i
] & UF_RESERVED
) != 0 ||
6924 FILEGLOB_DTYPE(fp
->f_fglob
) != DTYPE_SOCKET
)
6927 so2
= (struct socket
*)fp
->f_fglob
->fg_data
;
6929 so2
->so_flags1
& SOF1_EXTEND_BK_IDLE_WANTED
)
6931 if (count
>= soextbkidlestat
.so_xbkidle_maxperproc
)
6934 if (count
>= soextbkidlestat
.so_xbkidle_maxperproc
) {
6935 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_toomany
);
6937 } else if (so
->so_flags
& SOF_DELEGATED
) {
6938 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_nodlgtd
);
6941 so
->so_flags1
|= SOF1_EXTEND_BK_IDLE_WANTED
;
6942 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_wantok
);
6944 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
6945 "%s marked for extended bk idle\n",
6946 __func__
, proc_selfpid(), proc_best_name(current_proc()),
6947 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
6948 SOCK_DOM(so
), SOCK_TYPE(so
),
6949 (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_WANTED
) ?
6959 so_stop_extended_bk_idle(struct socket
*so
)
6961 so
->so_flags1
&= ~SOF1_EXTEND_BK_IDLE_INPROG
;
6962 so
->so_extended_bk_start
= 0;
6964 OSDecrementAtomic(&soextbkidlestat
.so_xbkidle_active
);
6965 VERIFY(soextbkidlestat
.so_xbkidle_active
>= 0);
6969 sosetdefunct(current_proc(), so
,
6970 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL
, FALSE
);
6971 if (so
->so_flags
& SOF_DEFUNCT
) {
6972 sodefunct(current_proc(), so
,
6973 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL
);
6978 so_drain_extended_bk_idle(struct socket
*so
)
6980 if (so
&& (so
->so_flags1
& SOF1_EXTEND_BK_IDLE_INPROG
)) {
6982 * Only penalize sockets that have outstanding data
6984 if (so
->so_rcv
.sb_cc
|| so
->so_snd
.sb_cc
) {
6985 so_stop_extended_bk_idle(so
);
6987 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_drained
);
 * Return value tells if the socket is still in extended background idle
6996 so_check_extended_bk_idle_time(struct socket
*so
)
7000 if ((so
->so_flags1
& SOF1_EXTEND_BK_IDLE_INPROG
)) {
7001 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7002 __func__
, proc_selfpid(), proc_best_name(current_proc()),
7003 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
7004 SOCK_DOM(so
), SOCK_TYPE(so
));
7005 if (net_uptime() - so
->so_extended_bk_start
>
7006 soextbkidlestat
.so_xbkidle_time
) {
7007 so_stop_extended_bk_idle(so
);
7009 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_expired
);
7013 struct inpcb
*inp
= (struct inpcb
*)so
->so_pcb
;
7015 inpcb_timer_sched(inp
->inp_pcbinfo
, INPCB_TIMER_LAZY
);
7016 OSIncrementAtomic(&soextbkidlestat
.so_xbkidle_resched
);
7024 resume_proc_sockets(proc_t p
)
7026 if (p
->p_ladvflag
& P_LXBKIDLEINPROG
) {
7027 struct filedesc
*fdp
;
7032 for (i
= 0; i
< fdp
->fd_nfiles
; i
++) {
7033 struct fileproc
*fp
;
7036 fp
= fdp
->fd_ofiles
[i
];
7038 (fdp
->fd_ofileflags
[i
] & UF_RESERVED
) != 0 ||
7039 FILEGLOB_DTYPE(fp
->f_fglob
) != DTYPE_SOCKET
)
7042 so
= (struct socket
*)fp
->f_fglob
->fg_data
;
7043 (void) soresume(p
, so
, 0);
7047 OSBitAndAtomic(~P_LXBKIDLEINPROG
, &p
->p_ladvflag
);
7051 __private_extern__
int
7052 so_set_recv_anyif(struct socket
*so
, int optval
)
7057 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
) {
7059 if (SOCK_DOM(so
) == PF_INET
) {
7062 sotoinpcb(so
)->inp_flags
|= INP_RECV_ANYIF
;
7064 sotoinpcb(so
)->inp_flags
&= ~INP_RECV_ANYIF
;
7070 __private_extern__
int
7071 so_get_recv_anyif(struct socket
*so
)
7076 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
) {
7078 if (SOCK_DOM(so
) == PF_INET
) {
7080 ret
= (sotoinpcb(so
)->inp_flags
& INP_RECV_ANYIF
) ? 1 : 0;
7087 so_set_restrictions(struct socket
*so
, uint32_t vals
)
7089 int nocell_old
, nocell_new
;
7090 int noexpensive_old
, noexpensive_new
;
7093 * Deny-type restrictions are trapdoors; once set they cannot be
7094 * unset for the lifetime of the socket. This allows them to be
7095 * issued by a framework on behalf of the application without
7096 * having to worry that they can be undone.
 * Note here that socket-level restrictions override any protocol-
 * level restrictions.  For instance, the SO_RESTRICT_DENY_CELLULAR
 * socket restriction issued on the socket has a higher precedence
 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
7102 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7103 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7105 nocell_old
= (so
->so_restrictions
& SO_RESTRICT_DENY_CELLULAR
);
7106 noexpensive_old
= (so
->so_restrictions
& SO_RESTRICT_DENY_EXPENSIVE
);
7107 so
->so_restrictions
|= (vals
& (SO_RESTRICT_DENY_IN
|
7108 SO_RESTRICT_DENY_OUT
| SO_RESTRICT_DENY_CELLULAR
|
7109 SO_RESTRICT_DENY_EXPENSIVE
));
7110 nocell_new
= (so
->so_restrictions
& SO_RESTRICT_DENY_CELLULAR
);
7111 noexpensive_new
= (so
->so_restrictions
& SO_RESTRICT_DENY_EXPENSIVE
);
7113 /* we can only set, not clear restrictions */
7114 if ((nocell_new
- nocell_old
) == 0 &&
7115 (noexpensive_new
- noexpensive_old
) == 0)
7118 if (SOCK_DOM(so
) == PF_INET
|| SOCK_DOM(so
) == PF_INET6
) {
7120 if (SOCK_DOM(so
) == PF_INET
) {
7122 if (nocell_new
- nocell_old
!= 0) {
7124 * if deny cellular is now set, do what's needed
7127 inp_set_nocellular(sotoinpcb(so
));
7129 if (noexpensive_new
- noexpensive_old
!= 0) {
7130 inp_set_noexpensive(sotoinpcb(so
));
7138 so_get_restrictions(struct socket
*so
)
7140 return (so
->so_restrictions
& (SO_RESTRICT_DENY_IN
|
7141 SO_RESTRICT_DENY_OUT
|
7142 SO_RESTRICT_DENY_CELLULAR
| SO_RESTRICT_DENY_EXPENSIVE
));
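/*
 * Illustrative sketch (not from this file): SO_RESTRICTIONS is private SPI
 * that reaches so_set_restrictions(); as noted above, the deny flags are
 * one-way and cannot be cleared for the lifetime of the socket.
 */
#if 0
#include <sys/socket.h>
#include <stdint.h>

static int
deny_cellular(int fd)
{
	uint32_t restrictions = SO_RESTRICT_DENY_CELLULAR;

	/* a trapdoor: once set, the restriction stays set */
	return (setsockopt(fd, SOL_SOCKET, SO_RESTRICTIONS,
	    &restrictions, sizeof (restrictions)));
}
#endif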
7145 struct sockaddr_entry
*
7146 sockaddrentry_alloc(int how
)
7148 struct sockaddr_entry
*se
;
7150 se
= (how
== M_WAITOK
) ? zalloc(se_zone
) : zalloc_noblock(se_zone
);
7152 bzero(se
, se_zone_size
);
7158 sockaddrentry_free(struct sockaddr_entry
*se
)
7160 if (se
->se_addr
!= NULL
) {
7161 FREE(se
->se_addr
, M_SONAME
);
7167 struct sockaddr_entry
*
7168 sockaddrentry_dup(const struct sockaddr_entry
*src_se
, int how
)
7170 struct sockaddr_entry
*dst_se
;
7172 dst_se
= sockaddrentry_alloc(how
);
7173 if (dst_se
!= NULL
) {
7174 int len
= src_se
->se_addr
->sa_len
;
7176 MALLOC(dst_se
->se_addr
, struct sockaddr
*,
7177 len
, M_SONAME
, how
| M_ZERO
);
7178 if (dst_se
->se_addr
!= NULL
) {
7179 bcopy(src_se
->se_addr
, dst_se
->se_addr
, len
);
7181 sockaddrentry_free(dst_se
);
7189 struct sockaddr_list
*
7190 sockaddrlist_alloc(int how
)
7192 struct sockaddr_list
*sl
;
7194 sl
= (how
== M_WAITOK
) ? zalloc(sl_zone
) : zalloc_noblock(sl_zone
);
7196 bzero(sl
, sl_zone_size
);
7197 TAILQ_INIT(&sl
->sl_head
);
7203 sockaddrlist_free(struct sockaddr_list
*sl
)
7205 struct sockaddr_entry
*se
, *tse
;
7207 TAILQ_FOREACH_SAFE(se
, &sl
->sl_head
, se_link
, tse
) {
7208 sockaddrlist_remove(sl
, se
);
7209 sockaddrentry_free(se
);
7211 VERIFY(sl
->sl_cnt
== 0 && TAILQ_EMPTY(&sl
->sl_head
));
7216 sockaddrlist_insert(struct sockaddr_list
*sl
, struct sockaddr_entry
*se
)
7218 VERIFY(!(se
->se_flags
& SEF_ATTACHED
));
7219 se
->se_flags
|= SEF_ATTACHED
;
7220 TAILQ_INSERT_TAIL(&sl
->sl_head
, se
, se_link
);
7222 VERIFY(sl
->sl_cnt
!= 0);
7226 sockaddrlist_remove(struct sockaddr_list
*sl
, struct sockaddr_entry
*se
)
7228 VERIFY(se
->se_flags
& SEF_ATTACHED
);
7229 se
->se_flags
&= ~SEF_ATTACHED
;
7230 VERIFY(sl
->sl_cnt
!= 0);
7232 TAILQ_REMOVE(&sl
->sl_head
, se
, se_link
);
7235 struct sockaddr_list
*
7236 sockaddrlist_dup(const struct sockaddr_list
*src_sl
, int how
)
7238 struct sockaddr_entry
*src_se
, *tse
;
7239 struct sockaddr_list
*dst_sl
;
7241 dst_sl
= sockaddrlist_alloc(how
);
7245 TAILQ_FOREACH_SAFE(src_se
, &src_sl
->sl_head
, se_link
, tse
) {
7246 struct sockaddr_entry
*dst_se
;
7248 if (src_se
->se_addr
== NULL
)
7251 dst_se
= sockaddrentry_dup(src_se
, how
);
7252 if (dst_se
== NULL
) {
7253 sockaddrlist_free(dst_sl
);
7257 sockaddrlist_insert(dst_sl
, dst_se
);
7259 VERIFY(src_sl
->sl_cnt
== dst_sl
->sl_cnt
);
7265 so_set_effective_pid(struct socket
*so
, int epid
, struct proc
*p
)
7267 struct proc
*ep
= PROC_NULL
;
7270 /* pid 0 is reserved for kernel */
7277 * If this is an in-kernel socket, prevent its delegate
7278 * association from changing unless the socket option is
7279 * coming from within the kernel itself.
7281 if (so
->last_pid
== 0 && p
!= kernproc
) {
7287 * If this is issued by a process that's recorded as the
7288 * real owner of the socket, or if the pid is the same as
7289 * the process's own pid, then proceed. Otherwise ensure
7290 * that the issuing process has the necessary privileges.
7292 if (epid
!= so
->last_pid
|| epid
!= proc_pid(p
)) {
7293 if ((error
= priv_check_cred(kauth_cred_get(),
7294 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
, 0))) {
7300 /* Find the process that corresponds to the effective pid */
7301 if ((ep
= proc_find(epid
)) == PROC_NULL
) {
7307 * If a process tries to delegate the socket to itself, then
7308 * there's really nothing to do; treat it as a way for the
7309 * delegate association to be cleared. Note that we check
7310 * the passed-in proc rather than calling proc_selfpid(),
7311 * as we need to check the process issuing the socket option
7312 * which could be kernproc. Given that we don't allow 0 for
7313 * effective pid, it means that a delegated in-kernel socket
7314 * stays delegated during its lifetime (which is probably OK.)
7316 if (epid
== proc_pid(p
)) {
7317 so
->so_flags
&= ~SOF_DELEGATED
;
7320 uuid_clear(so
->e_uuid
);
7322 so
->so_flags
|= SOF_DELEGATED
;
7323 so
->e_upid
= proc_uniqueid(ep
);
7324 so
->e_pid
= proc_pid(ep
);
7325 proc_getexecutableuuid(ep
, so
->e_uuid
, sizeof (so
->e_uuid
));
7328 if (error
== 0 && net_io_policy_log
) {
7331 uuid_unparse(so
->e_uuid
, buf
);
7332 log(LOG_DEBUG
, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7333 "euuid %s%s\n", __func__
, proc_name_address(p
),
7334 proc_pid(p
), (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
7335 SOCK_DOM(so
), SOCK_TYPE(so
),
7336 so
->e_pid
, proc_name_address(ep
), buf
,
7337 ((so
->so_flags
& SOF_DELEGATED
) ? " [delegated]" : ""));
7338 } else if (error
!= 0 && net_io_policy_log
) {
7339 log(LOG_ERR
, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7340 "ERROR (%d)\n", __func__
, proc_name_address(p
),
7341 proc_pid(p
), (uint64_t)DEBUG_KERNEL_ADDRPERM(so
),
7342 SOCK_DOM(so
), SOCK_TYPE(so
),
7343 epid
, (ep
== PROC_NULL
) ? "PROC_NULL" :
7344 proc_name_address(ep
), error
);
7347 /* Update this socket's policy upon success */
7349 so
->so_policy_gencnt
*= -1;
7350 so_update_policy(so
);
7352 so_update_necp_policy(so
, NULL
, NULL
);
7356 if (ep
!= PROC_NULL
)
7363 so_set_effective_uuid(struct socket
*so
, uuid_t euuid
, struct proc
*p
)
7369 /* UUID must not be all-zeroes (reserved for kernel) */
7370 if (uuid_is_null(euuid
)) {
7376 * If this is an in-kernel socket, prevent its delegate
7377 * association from changing unless the socket option is
7378 * coming from within the kernel itself.
7380 if (so
->last_pid
== 0 && p
!= kernproc
) {
7385 /* Get the UUID of the issuing process */
7386 proc_getexecutableuuid(p
, uuid
, sizeof (uuid
));
7389 * If this is issued by a process that's recorded as the
7390 * real owner of the socket, or if the uuid is the same as
7391 * the process's own uuid, then proceed. Otherwise ensure
7392 * that the issuing process has the necessary privileges.
7394 if (uuid_compare(euuid
, so
->last_uuid
) != 0 ||
7395 uuid_compare(euuid
, uuid
) != 0) {
7396 if ((error
= priv_check_cred(kauth_cred_get(),
7397 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE
, 0))) {
7404 * If a process tries to delegate the socket to itself, then
7405 * there's really nothing to do; treat it as a way for the
7406 * delegate association to be cleared. Note that we check
7407 * the uuid of the passed-in proc rather than that of the
7408 * current process, as we need to check the process issuing
7409 * the socket option which could be kernproc itself. Given
7410 * that we don't allow 0 for effective uuid, it means that
7411 * a delegated in-kernel socket stays delegated during its
7412 * lifetime (which is okay.)
7414 if (uuid_compare(euuid
, uuid
) == 0) {
7415 so
->so_flags
&= ~SOF_DELEGATED
;
7418 uuid_clear(so
->e_uuid
);
7420 so
->so_flags
|= SOF_DELEGATED
;
7422 * Unlike so_set_effective_pid(), we only have the UUID
7423 * here and the process ID is not known. Inherit the
7424 * real {pid,upid} of the socket.
7426 so
->e_upid
= so
->last_upid
;
7427 so
->e_pid
= so
->last_pid
;
7428 uuid_copy(so
->e_uuid
, euuid
);
7432 if (error
== 0 && net_io_policy_log
) {
7433 uuid_unparse(so
->e_uuid
, buf
);
7434 log(LOG_DEBUG
, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7435 "euuid %s%s\n", __func__
, proc_name_address(p
), proc_pid(p
),
7436 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
), SOCK_DOM(so
),
7437 SOCK_TYPE(so
), so
->e_pid
, buf
,
7438 ((so
->so_flags
& SOF_DELEGATED
) ? " [delegated]" : ""));
7439 } else if (error
!= 0 && net_io_policy_log
) {
7440 uuid_unparse(euuid
, buf
);
7441 log(LOG_DEBUG
, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7442 "ERROR (%d)\n", __func__
, proc_name_address(p
), proc_pid(p
),
7443 (uint64_t)DEBUG_KERNEL_ADDRPERM(so
), SOCK_DOM(so
),
7444 SOCK_TYPE(so
), buf
, error
);
7447 /* Update this socket's policy upon success */
7449 so
->so_policy_gencnt
*= -1;
7450 so_update_policy(so
);
7452 so_update_necp_policy(so
, NULL
, NULL
);
7460 netpolicy_post_msg(uint32_t ev_code
, struct netpolicy_event_data
*ev_data
,
7461 uint32_t ev_datalen
)
7463 struct kev_msg ev_msg
;
7466 * A netpolicy event always starts with a netpolicy_event_data
7467 * structure, but the caller can provide for a longer event
7468 * structure to post, depending on the event code.
7470 VERIFY(ev_data
!= NULL
&& ev_datalen
>= sizeof (*ev_data
));
7472 bzero(&ev_msg
, sizeof (ev_msg
));
7473 ev_msg
.vendor_code
= KEV_VENDOR_APPLE
;
7474 ev_msg
.kev_class
= KEV_NETWORK_CLASS
;
7475 ev_msg
.kev_subclass
= KEV_NETPOLICY_SUBCLASS
;
7476 ev_msg
.event_code
= ev_code
;
7478 ev_msg
.dv
[0].data_ptr
= ev_data
;
7479 ev_msg
.dv
[0].data_length
= ev_datalen
;
7481 kev_post_msg(&ev_msg
);
7485 socket_post_kev_msg(uint32_t ev_code
,
7486 struct kev_socket_event_data
*ev_data
,
7487 uint32_t ev_datalen
)
7489 struct kev_msg ev_msg
;
7491 bzero(&ev_msg
, sizeof(ev_msg
));
7492 ev_msg
.vendor_code
= KEV_VENDOR_APPLE
;
7493 ev_msg
.kev_class
= KEV_NETWORK_CLASS
;
7494 ev_msg
.kev_subclass
= KEV_SOCKET_SUBCLASS
;
7495 ev_msg
.event_code
= ev_code
;
7497 ev_msg
.dv
[0].data_ptr
= ev_data
;
7498 ev_msg
.dv
[0]. data_length
= ev_datalen
;
7500 kev_post_msg(&ev_msg
);
7504 socket_post_kev_msg_closed(struct socket
*so
)
7506 struct kev_socket_closed ev
;
7507 struct sockaddr
*socksa
= NULL
, *peersa
= NULL
;
7509 bzero(&ev
, sizeof(ev
));
7510 err
= (*so
->so_proto
->pr_usrreqs
->pru_sockaddr
)(so
, &socksa
);
7512 err
= (*so
->so_proto
->pr_usrreqs
->pru_peeraddr
)(so
,
7515 memcpy(&ev
.ev_data
.kev_sockname
, socksa
,
7517 sizeof (ev
.ev_data
.kev_sockname
)));
7518 memcpy(&ev
.ev_data
.kev_peername
, peersa
,
7520 sizeof (ev
.ev_data
.kev_peername
)));
7521 socket_post_kev_msg(KEV_SOCKET_CLOSED
,
7522 &ev
.ev_data
, sizeof (ev
));
7526 FREE(socksa
, M_SONAME
);
7528 FREE(peersa
, M_SONAME
);