/*
 * Copyright (c) 1998-2008 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/kdebug.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <security/mac.h>
#include <security/mac_framework.h>
int			so_cache_timeouts = 0;
int			so_cache_max_freed = 0;
int			cached_sock_count = 0;
__private_extern__ int	max_cached_sock_count = MAX_CACHED_SOCKETS;
struct socket		*socket_cache_head = 0;
struct socket		*socket_cache_tail = 0;
u_int32_t		so_cache_time = 0;
int			so_cache_init_done = 0;
struct zone		*so_cache_zone;

static lck_grp_t	*so_cache_mtx_grp;
static lck_attr_t	*so_cache_mtx_attr;
static lck_grp_attr_t	*so_cache_mtx_grp_attr;
lck_mtx_t		*so_cache_mtx;
#include <machine/limits.h>

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);

static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p);

static int
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p);

static struct filterops soread_filtops = {
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};

static struct filterops sowrite_filtops = {
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};
#define	EVEN_MORE_LOCKING_DEBUG	0
int socket_debug = 0;
int socket_zone = M_SOCKET;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)
SYSCTL_DECL(_kern_ipc);

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain,
    0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy,
    0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW,
    &sosendjcl_ignore_capab, 0, "");
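
/*
 * Illustrative sketch (an assumption added by the editor, not part of the
 * original source): because the knobs above are plain CTLFLAG_RW integers
 * registered under kern.ipc, a privileged user-space tool could inspect or
 * toggle them roughly as below.  The sysctl name string is inferred from the
 * OID declaration above.
 *
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int val = 0;
 *		size_t len = sizeof (val);
 *
 *		if (sysctlbyname("kern.ipc.sosendjcl_ignore_capab",
 *		    &val, &len, NULL, 0) == 0)
 *			printf("sosendjcl_ignore_capab = %d\n", val);
 *		return (0);
 *	}
 */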
/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

extern void postevent(struct socket *, struct sockbuf *, int);
extern void evsofree(struct socket *);

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);
extern struct domain *pffinddomain(int);
extern struct protosw *pffindprotonotype(int, int);
extern int soclose_locked(struct socket *);
extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *);

extern int uthread_get_background_state(uthread_t);

vm_size_t	so_cache_zone_element_size;

static int	sodelayed_copy(struct socket *, struct uio *, struct mbuf **, int *);
static void	cached_sock_alloc(struct socket **, int);
static void	cached_sock_free(struct socket *);
static void	so_cache_timer(void *);

void soclose_wait_locked(struct socket *so);
int so_isdstlocal(struct socket *so);
	if (so_cache_init_done) {
		printf("socketinit: already called...\n");
		return;
	}

	PE_parse_boot_argn("socket_debug", &socket_debug, sizeof (socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();

	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	so_cache_init_done = 1;

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);

	if (so_cache_mtx == NULL)
		return;		/* we're hosed... */

	str_size = (vm_size_t)(sizeof (struct socket) + 4 +
	    get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone");

	printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);

	timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));

	so_cache_zone_element_size = str_size;
static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t	temp;
	register uintptr_t offset;

	lck_mtx_lock(so_cache_mtx);

	if (cached_sock_count) {
		*so = socket_cache_head;
		if (*so == NULL)
			panic("cached_sock_alloc: cached sock is null");

		socket_cache_head = socket_cache_head->cache_next;
		if (socket_cache_head)
			socket_cache_head->cache_prev = 0;
		else
			socket_cache_tail = 0;

		lck_mtx_unlock(so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof (struct socket));

		kprintf("cached_sock_alloc - retrieving cached sock %p - "
		    "count == %d\n", *so, cached_sock_count);

		(*so)->so_saved_pcb = temp;
		(*so)->cached_in_sock_layer = 1;
	} else {
		kprintf("Allocating cached sock %p from memory\n", *so);

		lck_mtx_unlock(so_cache_mtx);

		if (waitok)
			*so = (struct socket *)zalloc(so_cache_zone);
		else
			*so = (struct socket *)zalloc_noblock(so_cache_zone);

		bzero((caddr_t)*so, sizeof (struct socket));

		/*
		 * Define offsets for extra structures into our single block of
		 * memory.  Align extra structures on longword boundaries.
		 */

		offset = (uintptr_t) *so;
		offset += sizeof (struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;

		kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n",
		    *so, (*so)->so_saved_pcb,
		    ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);
	}

	(*so)->cached_in_sock_layer = 1;
}
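
/*
 * Illustrative sketch (an assumption added by the editor, not part of the
 * original source): the cached-socket zone element carved up above is one
 * contiguous block, roughly:
 *
 *	+--------------------+  <- *so
 *	| struct socket      |
 *	+--------------------+  <- ALIGN(*so + sizeof (struct socket))
 *	| inpcb save area    |     so_saved_pcb, get_inpcb_str_size() bytes
 *	+--------------------+  <- ALIGN(end of inpcb save area)
 *	| tcpcb save area    |     inp_saved_ppcb, get_tcp_str_size() bytes
 *	+--------------------+
 *
 * which matches the str_size computed in socketinit(), with 4 bytes of
 * slack allowed for each ALIGN() adjustment.
 */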
static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(so_cache_mtx);

	if (++cached_sock_count > max_cached_sock_count) {
		lck_mtx_unlock(so_cache_mtx);

		kprintf("Freeing overflowed cached socket %p\n", so);

		zfree(so_cache_zone, so);
	} else {
		kprintf("Freeing socket %p into cache\n", so);

		if (so_cache_hw < cached_sock_count)
			so_cache_hw = cached_sock_count;

		so->cache_next = socket_cache_head;
		if (socket_cache_head)
			socket_cache_head->cache_prev = so;
		else
			socket_cache_tail = so;

		so->cache_timestamp = so_cache_time;
		socket_cache_head = so;
		lck_mtx_unlock(so_cache_mtx);

		kprintf("Freed cached sock %p into cache - count is %d\n",
		    so, cached_sock_count);
	}
}
static void
so_cache_timer(__unused void *dummy)
{
	register struct socket	*p;
	register int		n_freed = 0;

	lck_mtx_lock(so_cache_mtx);

	while ((p = socket_cache_tail)) {
		if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
			break;

		if ((socket_cache_tail = p->cache_prev))
			p->cache_prev->cache_next = 0;
		if (--cached_sock_count == 0)
			socket_cache_head = 0;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}
	lck_mtx_unlock(so_cache_mtx);

	timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
}
#endif /* __APPLE__ */
/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
static struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
		    M_WAITOK);
		if (so != NULL)
			bzero(so, sizeof (*so));
	}
	if (so != NULL) {
		/* XXX race condition for reentrant kernel */
		//###LD Atomic add for so_gencnt
		so->so_gencnt = ++so_gencnt;
		so->so_zone = socket_zone;
#if CONFIG_MACF_SOCKET
		/* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return (NULL);
		}
#endif /* MAC_SOCKET */
	}

	return (so);
}
/*
 * Returns:	0			Success
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:EISCONN[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	struct proc *p = current_proc();
	register struct protosw *prp;
	register struct socket *so;
	register int error = 0;

	extern int tcpconsdebug;

	if (proto)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) {
		if (pffinddomain(dom) == NULL) {
			return (EAFNOSUPPORT);
		}
		if (pffindprotonotype(dom, proto) != NULL) {
			return (EPROTOTYPE);
		}
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(1, dom, type);

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);

	so->so_uid = kauth_cred_getuid(kauth_cred_get());
	if (!suser(kauth_cred_get(), NULL))
		so->so_state = SS_PRIV;

	so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
	so->so_rcv.sb_so = so->so_snd.sb_so = so;

	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

	//### Attachment will create the per pcb lock if necessary and increase refcount
	/*
	 * for creation, make sure it's done before
	 * socket is inserted in lists
	 */
	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error) {
		/*
		 * If so_pcb is not zero, the socket will be leaked,
		 * so the protocol attachment handler must be coded carefully
		 */
		so->so_state |= SS_NOFDREF;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return (error);
	}

	prp->pr_domain->dom_refs++;
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	if (tcpconsdebug == 2)
		so->so_options |= SO_DEBUG;

	/*
	 * If this is a background thread/task, mark the socket as such.
	 */
	thread = current_thread();
	ut = get_bsdthread_info(thread);
	if (uthread_get_background_state(ut)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = thread;
		/*
		 * In case setpriority(PRIO_DARWIN_THREAD) was called
		 * on this thread, regulate network (TCP) traffic.
		 */
		if (ut->uu_flag & UT_BACKGROUND_TRAFFIC_MGT) {
			socket_set_traffic_mgt_flags(so,
			    TRAFFIC_MGT_SO_BG_REGULATE);
		}
	}

	*aso = so;
	return (0);
}
/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobind(struct socket *so, struct sockaddr *nam)
{
	struct proc *p = current_proc();
	int error = 0;
	struct socket_filter_entry *filter;

	/*
	 * If this is a bind request on a previously-accepted socket
	 * that has been marked as inactive, reject it now before
	 * we do anything.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		goto out;
	}

	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_bind) {
			socket_unlock(so, 0);
			error = filter->sfe_filter->sf_filter.
			    sf_bind(filter->sfe_cookie, so, nam);
		}
	}
	/* End socket filter */

	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
out:
	socket_unlock(so, 1);

	if (error == EJUSTRETURN)
		error = 0;

	return (error);
}
static void
sodealloc(struct socket *so)
{
	so->so_gencnt = ++so_gencnt;

#if CONFIG_MACF_SOCKET
	mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */
	if (so->cached_in_sock_layer == 1) {
		cached_sock_free(so);
	} else {
		if (so->cached_in_sock_layer == -1)
			panic("sodealloc: double dealloc: so=%p\n", so);
		so->cached_in_sock_layer = -1;
		FREE_ZONE(so, sizeof (*so), so->so_zone);
	}
}
/*
 * Returns:	0			Success
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	struct socket_filter_entry *filter;
	int error = 0;

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a previously-accepted socket that has
	 * been marked as inactive, reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENYIN) != 0) {
		error = EPERM;
		goto out;
	}

	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_listen) {
			socket_unlock(so, 0);
			error = filter->sfe_filter->sf_filter.
			    sf_listen(filter->sfe_cookie, so);
		}
	}

	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);

	if (error) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue - either global or per accepting socket.  If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn)
		backlog = somaxconn;

	so->so_qlimit = backlog;
out:
	socket_unlock(so, 1);
	return (error);
}
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

	/* Remove any filters - may be called more than once */

	if ((!(so->so_flags & SOF_PCBCLEARING)) ||
	    ((so->so_state & SS_NOFDREF) == 0)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~SB_UPCALL;
		so->so_snd.sb_flags &= ~SB_UPCALL;
		return;
	}
	if (head != NULL) {
		socket_lock(head, 1);
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~SB_UPCALL;
			so->so_snd.sb_flags &= ~SB_UPCALL;
			socket_unlock(head, 1);
			return;
		} else {
			panic("sofree: not queued");
		}
		so->so_state &= ~SS_INCOMP;
		socket_unlock(head, 1);
	}

	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
}
void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!(so->so_flags & SOF_UPCALLINUSE) ||
	    !(so->so_flags & SOF_UPCALLCLOSEWAIT))
		return;

	so->so_flags |= SOF_CLOSEWAIT;
	(void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	lck_mtx_t *mutex_held;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0\n", so);
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		for (sp = TAILQ_FIRST(&so->so_incomp); sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);

			/*
			 * skip sockets thrown away by tcpdropdropblreq;
			 * they will get cleaned up by the garbage collection.
			 * otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW)
				continue;

			if (so->so_proto->pr_getlock != NULL) {
				/*
				 * lock ordering for consistency with the
				 * rest of the stack: we lock the socket
				 * first and then grab the head.
				 */
				socket_unlock(so, 0);
			}

			TAILQ_REMOVE(&so->so_incomp, sp, so_list);

			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
			}

			socket_unlock(sp, 1);
		}

		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			/* Dequeue from so_comp since sofree() won't do it */
			TAILQ_REMOVE(&so->so_comp, sp, so_list);

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(so, 0);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
			}

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(sp, 1);
			}
		}
	}
	if (so->so_pcb == 0) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			if (so->so_proto->pr_getlock != NULL)
				mutex_held = (*so->so_proto->pr_getlock)(so, 0);
			else
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			while (so->so_state & SS_ISCONNECTED) {
				ts.tv_sec = (so->so_linger/100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				/*
				 * It's OK when the time fires,
				 * don't report an error
				 */
				if (error == EWOULDBLOCK)
					error = 0;
			}
		}
	}
drop:
	if (so->so_usecount == 0)
		panic("soclose: usecount is zero so=%p\n", so);
	if (so->so_pcb && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
	}
	if (so->so_usecount <= 0)
		panic("soclose: usecount is zero so=%p\n", so);

	if (so->so_pcb && so->so_state & SS_NOFDREF)
		panic("soclose: NOFDREF");
	so->so_state |= SS_NOFDREF;

	so->so_proto->pr_domain->dom_refs--;
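
/*
 * Illustrative sketch (an assumption added by the editor, not part of the
 * original source): the SO_LINGER wait above is what a user-space close()
 * ends up blocking in when lingering is enabled on a connected socket:
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };	// seconds
 *	setsockopt(s, SOL_SOCKET, SO_LINGER, &l, sizeof (l));
 *	close(s);	// may block waiting for the disconnect to complete
 */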
int
soclose(struct socket *so)
{
	int error = 0;

	socket_lock(so, 1);

	if (so->so_flags & SOF_UPCALLINUSE)
		soclose_wait_locked(so);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * if the FD is going away, but socket is
		 * retained in kernel remove its reference
		 */
		if (so->so_usecount < 2)
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
	}
	socket_unlock(so, 1);
	return (error);
}
/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error = 0;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
	}
	return (error);
}
int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock)
		socket_lock(so, 1);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock)
		socket_unlock(so, 1);
	return (error);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return (soacceptlock(so, nam, 1));
}
int
soacceptfilter(struct socket *so)
{
	struct sockaddr *local = NULL, *remote = NULL;
	struct socket_filter_entry *filter;
	int error = 0, filtered = 0;
	struct socket *head = so->so_head;

	/*
	 * Hold the lock even if this socket
	 * has not been made visible to the filter(s).
	 * For sockets with global locks, this protects against the
	 * head or peer going away
	 */
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~(SS_NOFDREF | SS_COMP);
		socket_unlock(so, 1);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	/*
	 * At this point, we have a reference on the listening socket
	 * so we know it won't be going away.  Do the same for the newly
	 * accepted socket while we invoke the accept callback routine.
	 */
	for (filter = so->so_filt; filter != NULL && error == 0;
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_accept != NULL) {
			socket_unlock(so, 0);
			error = filter->sfe_filter->sf_filter.
			    sf_accept(filter->sfe_cookie,
			    head, so, local, remote);
		}
	}

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		so->so_flags |= SOF_DEFUNCT;
		/* Prevent data from being appended to the socket buffers */
		so->so_snd.sb_flags |= SB_DROP;
		so->so_rcv.sb_flags |= SB_DROP;
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~(SS_NOFDREF | SS_COMP);
		socket_unlock(so, 1);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return (error);
}
/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();

	if (dolock)
		socket_lock(so, 1);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		if (dolock)
			socket_unlock(so, 1);
		return (EOPNOTSUPP);
	}

	if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) {
		if (dolock)
			socket_unlock(so, 1);
		return (EPERM);
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		struct socket_filter_entry *filter;
		int filtered = 0;

		error = 0;
		for (filter = so->so_filt; filter && (error == 0);
		    filter = filter->sfe_next_onsocket) {
			if (filter->sfe_filter->sf_filter.sf_connect_out) {
				if (filtered == 0) {
					filtered = 1;
					socket_unlock(so, 0);
				}
				error = filter->sfe_filter->sf_filter.
				    sf_connect_out(filter->sfe_cookie, so, nam);
			}
		}
		if (filtered != 0) {
			socket_lock(so, 0);
		}

		if (error) {
			if (error == EJUSTRETURN)
				error = 0;
			if (dolock)
				socket_unlock(so, 1);
			return (error);
		}

		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
	}
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return (soconnectlock(so, nam, 1));
}
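
/*
 * Illustrative sketch (an assumption added by the editor, not part of the
 * original source): the "disconnect by connecting to a null address"
 * behavior noted above is the classic datagram-socket idiom of calling
 * connect(2) with an AF_UNSPEC address:
 *
 *	#include <sys/socket.h>
 *	#include <string.h>
 *
 *	struct sockaddr sa;
 *	memset(&sa, 0, sizeof (sa));
 *	sa.sa_len = sizeof (sa);
 *	sa.sa_family = AF_UNSPEC;
 *	(void) connect(udp_fd, &sa, sizeof (sa));  // dissolve the association
 */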
/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_lock(so2, 1);

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_unlock(so2, 1);
	return (error);
}
int
sodisconnectlocked(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}

	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);

	if (error == 0)
		sflt_notify(so, sock_evt_disconnected, NULL);
bad:
	return (error);
}

/* Locking version */
int
sodisconnect(struct socket *so)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectlocked(so);
	socket_unlock(so, 1);
	return (error);
}
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)

/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * Returns:	0		Success
 *	sblock:EWOULDBLOCK
 */
static int
sosendcheck(struct socket *so, struct sockaddr *addr, int32_t resid, int32_t clen,
    int32_t atomic, int flags, int *sblocked)
{
	int error = 0;
	int32_t space;
	int assumelock = 0;

restart:
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue.  Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error)
				return (error);
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT)
		return (ENOTCONN);

	if (so->so_state & SS_CANTSENDMORE)
		return (EPIPE);

	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return (error);
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0))
				return (ENOTCONN);
		} else if (addr == 0 && !(flags&MSG_HOLD)) {
			return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			    ENOTCONN : EDESTADDRREQ);
		}
	}
	space = sbspace(&so->so_snd);
	if (flags & MSG_OOB)
		space += 1024;
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat)
		return (EMSGSIZE);
	if (space < resid + clen &&
	    (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) {
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock)
			return (EWOULDBLOCK);
		sbunlock(&so->so_snd, 1);
		error = sbwait(&so->so_snd);
		if (error)
			return (error);
		goto restart;
	}

	return (0);
}
/*
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 *
 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
 * point at the mbuf chain being constructed and go from there.
 *
 * Returns:	0			Success
 *	sosendcheck:EWOULDBLOCK
 *	sosendcheck:???			[value from so_error]
 *	<pru_send>:ECONNRESET[TCP]
 *	<pru_send>:EINVAL[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:EADDRINUSE[TCP]
 *	<pru_send>:EADDRNOTAVAIL[TCP]
 *	<pru_send>:EAFNOSUPPORT[TCP]
 *	<pru_send>:EACCES[TCP]
 *	<pru_send>:EAGAIN[TCP]
 *	<pru_send>:EPERM[TCP]
 *	<pru_send>:EMSGSIZE[TCP]
 *	<pru_send>:EHOSTUNREACH[TCP]
 *	<pru_send>:ENETUNREACH[TCP]
 *	<pru_send>:ENETDOWN[TCP]
 *	<pru_send>:ENOMEM[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_send>:EINVAL[AF_UNIX]
 *	<pru_send>:EOPNOTSUPP[AF_UNIX]
 *	<pru_send>:EPIPE[AF_UNIX]
 *	<pru_send>:ENOTCONN[AF_UNIX]
 *	<pru_send>:EISCONN[AF_UNIX]
 *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
 *	<sf_data_out>:???		[whatever a filter author chooses]
 *
 * Notes:	Other <pru_send> returns depend on the protocol family; all
 *		<sf_data_out> returns depend on what the filter author causes
 *		their filter to return.
 */
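
/*
 * Illustrative sketch (an assumption added by the editor, not part of the
 * original source): from user space, the "check for short counts" caveat
 * above translates into a retry loop around send(2) such as:
 *
 *	#include <sys/socket.h>
 *	#include <errno.h>
 *
 *	ssize_t
 *	send_all(int s, const char *buf, size_t len)
 *	{
 *		size_t off = 0;
 *
 *		while (off < len) {
 *			ssize_t n = send(s, buf + off, len - off, 0);
 *			if (n > 0)
 *				off += n;
 *			else if (n < 0 && errno == EINTR)
 *				continue;	// interrupted, retry
 *			else
 *				return (-1);	// real error, or EAGAIN on a
 *						// non-blocking socket
 *		}
 *		return ((ssize_t)off);
 *	}
 */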
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	register struct mbuf *m, *freelist = NULL;
	register int32_t space, len, resid;
	int clen = 0, error, dontroute, mlen, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();

	// LP64todo - fix this!
	if (uio)
		resid = uio_resid(uio);
	else
		resid = top->m_pkthdr.len;

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		socket_unlock(so, 1);
		goto out;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid.  On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
	if (resid < 0 || (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
		error = EINVAL;
		socket_unlock(so, 1);
		goto out;
	}

	dontroute =
	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
	if (control)
		clen = control->m_len;

	do {
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked);
		if (error)
			goto release;

		space = sbspace(&so->so_snd) - clen + ((flags & MSG_OOB) ?
		    1024 : 0);

		do {
			struct socket_filter_entry *filter;
			int filtered;
			boolean_t recursive;

			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;

				bytes_to_copy = imin(resid, space);

				if (sosendminchain > 0)
					chainlength = 0;
				else
					chainlength = sosendmaxchain;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = sosendjcl && njcl > 0 &&
				    ((so->so_flags & SOF_MULTIPAGES) ||
				    sosendjcl_ignore_capab);

				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == 0) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write; the list is further limited to
					 * the number that are currently needed
					 * to fill the socket.  this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, then fall back to trying
					 * for mbufs.  if we fail early (or
					 * miscalculate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_copy > NBPG && jumbocl) {
						num_needed =
						    bytes_to_copy / M16KCLBYTES;

						if ((bytes_to_copy -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}
					if (freelist == NULL &&
					    bytes_to_copy > MCLBYTES) {
						num_needed =
						    bytes_to_copy / NBPG;

						if ((bytes_to_copy -
						    (num_needed * NBPG)) >=
						    MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    NBPG);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}
					if (freelist == NULL &&
					    bytes_to_copy > MINCLSIZE) {
						num_needed =
						    bytes_to_copy / MCLBYTES;

						if ((bytes_to_copy -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					if (freelist == NULL) {
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == 0 &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					m = freelist;
					freelist = m->m_next;

					if ((m->m_flags & M_EXT))
						mlen = m->m_ext.ext_size;
					else if ((m->m_flags & M_PKTHDR))
						mlen =
						    MHLEN - m_leadingspace(m);
					else
						mlen = MLEN;
					len = imin(mlen, bytes_to_copy);

					error = uiomove(mtod(m, caddr_t),
					    (int)len, uio);

					resid = uio_resid(uio);

					top->m_pkthdr.len += len;

					if (flags & MSG_EOR)
						top->m_flags |= M_EOR;

					bytes_to_copy = min(resid, space);

				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));
			}

			if (flags & (MSG_HOLD|MSG_SEND)) {
				/* Enqueue for later, go away if HOLD */
				register struct mbuf *mb1;

				if (so->so_temp && (flags & MSG_FLUSH)) {
					m_freem(so->so_temp);
					so->so_temp = NULL;
				}
				if (so->so_temp)
					so->so_tail->m_next = top;
				if (flags & MSG_HOLD) {
					top = NULL;
					goto release;
				}
			}
			if (dontroute)
				so->so_options |= SO_DONTROUTE;

			/* Compute flags here, for pru_send and NKEs */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    /*
			     * If the user set MSG_EOF, the protocol
			     * understands this flag and nothing left to
			     * send then use PRU_SEND_EOF instead of PRU_SEND.
			     */
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			/*
			 * Socket filter processing
			 */
			recursive = (so->so_send_filt_thread != NULL);
			filtered = 0;
			error = 0;
			for (filter = so->so_filt; filter && (error == 0);
			    filter = filter->sfe_next_onsocket) {
				if (filter->sfe_filter->sf_filter.sf_data_out) {
					int so_flags = 0;
					if (filtered == 0) {
						filtered = 1;
						so->so_send_filt_thread =
						    current_thread();
						socket_unlock(so, 0);
						so_flags =
						    (sendflags & MSG_OOB) ?
						    sock_data_filt_flag_oob : 0;
					}
					error = filter->sfe_filter->sf_filter.
					    sf_data_out(filter->sfe_cookie, so,
					    addr, &top, &control, so_flags);
				}
			}

			if (filtered) {
				/*
				 * At this point, we've run at least one
				 * filter.  The socket is unlocked as is
				 * the socket buffer.  Clear the recorded
				 * filter thread only when we are outside
				 * of a filter's context.  This allows for
				 * a filter to issue multiple inject calls
				 * from its sf_data_out callback routine.
				 */
				socket_lock(so, 0);
				if (!recursive)
					so->so_send_filt_thread = 0;
				if (error == EJUSTRETURN) {
					error = 0;
					top = NULL;
					control = NULL;
				}
			}
			/*
			 * End Socket filter processing
			 */

			if (error == EJUSTRETURN) {
				/* A socket filter handled this data */
				error = 0;
			} else {
				error = (*so->so_proto->pr_usrreqs->pru_send)
				    (so, sendflags, top, addr, control, p);
			}

			if (flags & MSG_SEND)
				so->so_temp = NULL;
			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;

			clen = 0;
			control = NULL;
			top = NULL;
			if (error)
				goto release;
		} while (resid && space > 0);
	} while (resid);

release:
	if (sblocked)
		sbunlock(&so->so_snd, 0);	/* will unlock socket */
	else
		socket_unlock(so, 1);
out:
	if (top)
		m_freem(top);
	if (control)
		m_freem(control);
	if (freelist)
		m_freem_list(freelist);

	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
	    space, error);

	return (error);
}
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 *
 * Returns:	0			Success
 *	sblock:EWOULDBLOCK
 *	sodelayed_copy:EFAULT
 *	<pru_rcvoob>:EINVAL[TCP]
 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
 *	<pr_domain->dom_externalize>:???
 *
 * Notes:	Additional return values from calls through <pru_rcvoob> and
 *		<pr_domain->dom_externalize> depend on protocols other than
 *		TCP or AF_UNIX, which are documented above.
 */
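
/*
 * Illustrative sketch (an assumption added by the editor, not part of the
 * original source): the record layout described above - source address,
 * optional ancillary data, then payload - is what a user-space recvmsg(2)
 * call walks for a datagram socket, e.g.:
 *
 *	#include <sys/socket.h>
 *
 *	struct sockaddr_storage from;
 *	char cbuf[256], data[2048];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof (data) };
 *	struct msghdr msg = {
 *		.msg_name = &from, .msg_namelen = sizeof (from),
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = cbuf, .msg_controllen = sizeof (cbuf),
 *	};
 *	ssize_t n = recvmsg(s, &msg, 0);	// one record per call
 */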
1917 soreceive(struct socket
*so
, struct sockaddr
**psa
, struct uio
*uio
,
1918 struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
1920 register struct mbuf
*m
, **mp
, *ml
= NULL
;
1921 register int flags
, len
, error
, offset
;
1922 struct protosw
*pr
= so
->so_proto
;
1923 struct mbuf
*nextrecord
;
1925 int orig_resid
= uio_resid(uio
);
1926 struct mbuf
*free_list
;
1927 int delayed_copy_len
;
1930 struct proc
*p
= current_proc();
1932 // LP64todo - fix this!
1933 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_START
, so
, uio_resid(uio
),
1934 so
->so_rcv
.sb_cc
, so
->so_rcv
.sb_lowat
, so
->so_rcv
.sb_hiwat
);
1938 #ifdef MORE_LOCKING_DEBUG
1939 if (so
->so_usecount
== 1)
1940 panic("soreceive: so=%x no other reference on socket\n", so
);
1948 flags
= *flagsp
&~ MSG_EOR
;
1953 * If a recv attempt is made on a previously-accepted socket
1954 * that has been marked as inactive (disconnected), reject
1957 if (so
->so_flags
& SOF_DEFUNCT
) {
1958 struct sockbuf
*sb
= &so
->so_rcv
;
1961 * This socket should have been disconnected and flushed
1962 * prior to being returned from accept; there should be
1963 * no data on its receive list, so panic otherwise.
1965 sb_empty_assert(sb
, __func__
);
1966 socket_unlock(so
, 1);
1971 * When SO_WANTOOBFLAG is set we try to get out-of-band data
1972 * regardless of the flags argument. Here is the case were
1973 * out-of-band data is not inline.
1975 if ((flags
& MSG_OOB
) ||
1976 ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
1977 (so
->so_options
& SO_OOBINLINE
) == 0 &&
1978 (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
)))) {
1979 m
= m_get(M_WAIT
, MT_DATA
);
1981 socket_unlock(so
, 1);
1982 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
,
1983 ENOBUFS
, 0, 0, 0, 0);
1986 error
= (*pr
->pr_usrreqs
->pru_rcvoob
)(so
, m
, flags
& MSG_PEEK
);
1989 socket_unlock(so
, 0);
1991 error
= uiomove(mtod(m
, caddr_t
),
1992 imin(uio_resid(uio
), m
->m_len
), uio
);
1994 } while (uio_resid(uio
) && error
== 0 && m
);
2000 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0) {
2001 if (error
== EWOULDBLOCK
|| error
== EINVAL
) {
2003 * Let's try to get normal data:
2004 * EWOULDBLOCK: out-of-band data not
2005 * receive yet. EINVAL: out-of-band data
2010 } else if (error
== 0 && flagsp
) {
2014 socket_unlock(so
, 1);
2015 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2022 *mp
= (struct mbuf
*)0;
2023 if (so
->so_state
& SS_ISCONFIRMING
&& uio_resid(uio
))
2024 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, 0);
2027 free_list
= (struct mbuf
*)0;
2028 delayed_copy_len
= 0;
2030 #ifdef MORE_LOCKING_DEBUG
2031 if (so
->so_usecount
<= 1)
2032 printf("soreceive: sblock so=%p ref=%d on socket\n",
2033 so
, so
->so_usecount
);
2036 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2037 * and if so just return to the caller. This could happen when
2038 * soreceive() is called by a socket upcall function during the
2039 * time the socket is freed. The socket buffer would have been
2040 * locked across the upcall, therefore we cannot put this thread
2041 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2042 * we may livelock), because the lock on the socket buffer will
2043 * only be released when the upcall routine returns to its caller.
2044 * Because the socket has been officially closed, there can be
2045 * no further read on it.
2047 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
2048 (SS_NOFDREF
| SS_CANTRCVMORE
)) {
2049 socket_unlock(so
, 1);
2053 error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
));
2055 socket_unlock(so
, 1);
2056 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2061 m
= so
->so_rcv
.sb_mb
;
2063 * If we have less data than requested, block awaiting more
2064 * (subject to any timeout) if:
2065 * 1. the current count is less than the low water mark, or
2066 * 2. MSG_WAITALL is set, and it is possible to do the entire
2067 * receive operation at once if we block (resid <= hiwat).
2068 * 3. MSG_DONTWAIT is not set
2069 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2070 * we have to do the receive in sections, and thus risk returning
2071 * a short count if a timeout or signal occurs after we start.
2073 if (m
== 0 || (((flags
& MSG_DONTWAIT
) == 0 &&
2074 so
->so_rcv
.sb_cc
< uio_resid(uio
)) &&
2075 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
2076 ((flags
& MSG_WAITALL
) && uio_resid(uio
) <= so
->so_rcv
.sb_hiwat
)) &&
2077 m
->m_nextpkt
== 0 && (pr
->pr_flags
& PR_ATOMIC
) == 0)) {
2079 * Panic if we notice inconsistencies in the socket's
2080 * receive list; both sb_mb and sb_cc should correctly
2081 * reflect the contents of the list, otherwise we may
2082 * end up with false positives during select() or poll()
2083 * which could put the application in a bad state.
2085 if (m
== NULL
&& so
->so_rcv
.sb_cc
!= 0)
2086 panic("soreceive corrupted so_rcv: m %p cc %u",
2087 m
, so
->so_rcv
.sb_cc
);
2092 error
= so
->so_error
;
2093 if ((flags
& MSG_PEEK
) == 0)
2097 if (so
->so_state
& SS_CANTRCVMORE
) {
2103 for (; m
; m
= m
->m_next
)
2104 if (m
->m_type
== MT_OOBDATA
|| (m
->m_flags
& M_EOR
)) {
2105 m
= so
->so_rcv
.sb_mb
;
2108 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
2109 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
2113 if (uio_resid(uio
) == 0)
2115 if ((so
->so_state
& SS_NBIO
) ||
2116 (flags
& (MSG_DONTWAIT
|MSG_NBIO
))) {
2117 error
= EWOULDBLOCK
;
2120 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
2121 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
2122 sbunlock(&so
->so_rcv
, 1);
2123 #if EVEN_MORE_LOCKING_DEBUG
2125 printf("Waiting for socket data\n");
2128 error
= sbwait(&so
->so_rcv
);
2129 #if EVEN_MORE_LOCKING_DEBUG
2131 printf("SORECEIVE - sbwait returned %d\n", error
);
2133 if (so
->so_usecount
< 1)
2134 panic("soreceive: after 2nd sblock so=%p ref=%d on "
2135 "socket\n", so
, so
->so_usecount
);
2137 socket_unlock(so
, 1);
2138 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2145 OSIncrementAtomicLong(&p
->p_stats
->p_ru
.ru_msgrcv
);
2146 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
2147 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
2148 nextrecord
= m
->m_nextpkt
;
2149 if ((pr
->pr_flags
& PR_ADDR
) && m
->m_type
== MT_SONAME
) {
2150 KASSERT(m
->m_type
== MT_SONAME
, ("receive 1a"));
2151 #if CONFIG_MACF_SOCKET_SUBSET
2153 * Call the MAC framework for policy checking if we're in
2154 * the user process context and the socket isn't connected.
2156 if (p
!= kernproc
&& !(so
->so_state
& SS_ISCONNECTED
)) {
2157 struct mbuf
*m0
= m
;
2159 * Dequeue this record (temporarily) from the receive
2160 * list since we're about to drop the socket's lock
2161 * where a new record may arrive and be appended to
2162 * the list. Upon MAC policy failure, the record
2163 * will be freed. Otherwise, we'll add it back to
2164 * the head of the list. We cannot rely on SB_LOCK
2165 * because append operation uses the socket's lock.
2168 m
->m_nextpkt
= NULL
;
2169 sbfree(&so
->so_rcv
, m
);
2171 } while (m
!= NULL
);
2173 so
->so_rcv
.sb_mb
= nextrecord
;
2174 SB_EMPTY_FIXUP(&so
->so_rcv
);
2175 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1a");
2176 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1a");
2177 socket_unlock(so
, 0);
2178 if (mac_socket_check_received(proc_ucred(p
), so
,
2179 mtod(m
, struct sockaddr
*)) != 0) {
2181 * MAC policy failure; free this record and
2182 * process the next record (or block until
2183 * one is available). We have adjusted sb_cc
2184 * and sb_mbcnt above so there is no need to
2185 * call sbfree() again.
2189 } while (m
!= NULL
);
2191 * Clear SB_LOCK but don't unlock the socket.
2192 * Process the next record or wait for one.
2195 sbunlock(&so
->so_rcv
, 1);
2200 * Re-adjust the socket receive list and re-enqueue
2201 * the record in front of any packets which may have
2202 * been appended while we dropped the lock.
2204 for (m
= m0
; m
->m_next
!= NULL
; m
= m
->m_next
)
2205 sballoc(&so
->so_rcv
, m
);
2206 sballoc(&so
->so_rcv
, m
);
2207 if (so
->so_rcv
.sb_mb
== NULL
) {
2208 so
->so_rcv
.sb_lastrecord
= m0
;
2209 so
->so_rcv
.sb_mbtail
= m
;
2212 nextrecord
= m
->m_nextpkt
= so
->so_rcv
.sb_mb
;
2213 so
->so_rcv
.sb_mb
= m
;
2214 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1b");
2215 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1b");
2217 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2220 *psa
= dup_sockaddr(mtod(m
, struct sockaddr
*),
2222 if ((*psa
== 0) && (flags
& MSG_NEEDSA
)) {
2223 error
= EWOULDBLOCK
;
2227 if (flags
& MSG_PEEK
) {
2230 sbfree(&so
->so_rcv
, m
);
2231 if (m
->m_next
== 0 && so
->so_rcv
.sb_cc
!= 0)
2232 panic("soreceive: about to create invalid "
2234 MFREE(m
, so
->so_rcv
.sb_mb
);
2235 m
= so
->so_rcv
.sb_mb
;
2237 m
->m_nextpkt
= nextrecord
;
2239 so
->so_rcv
.sb_mb
= nextrecord
;
2240 SB_EMPTY_FIXUP(&so
->so_rcv
);
2246 * Process one or more MT_CONTROL mbufs present before any data mbufs
2247 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2248 * just copy the data; if !MSG_PEEK, we call into the protocol to
2249 * perform externalization.
2251 if (m
!= NULL
&& m
->m_type
== MT_CONTROL
) {
2252 struct mbuf
*cm
= NULL
, *cmn
;
2253 struct mbuf
**cme
= &cm
;
2254 struct sockbuf
*sb_rcv
= &so
->so_rcv
;
2257 * Externalizing the control messages would require us to
2258 * drop the socket's lock below. Once we re-acquire the
2259 * lock, the mbuf chain might change. In order to preserve
2260 * consistency, we unlink all control messages from the
2261 * first mbuf chain in one shot and link them separately
2262 * onto a different chain.
2265 if (flags
& MSG_PEEK
) {
2266 if (controlp
!= NULL
) {
2267 *controlp
= m_copy(m
, 0, m
->m_len
);
2268 controlp
= &(*controlp
)->m_next
;
2272 m
->m_nextpkt
= NULL
;
2274 sb_rcv
->sb_mb
= m
->m_next
;
2277 cme
= &(*cme
)->m_next
;
2280 } while (m
!= NULL
&& m
->m_type
== MT_CONTROL
);
2282 if (!(flags
& MSG_PEEK
)) {
2283 if (sb_rcv
->sb_mb
!= NULL
) {
2284 sb_rcv
->sb_mb
->m_nextpkt
= nextrecord
;
2286 sb_rcv
->sb_mb
= nextrecord
;
2287 SB_EMPTY_FIXUP(sb_rcv
);
2289 if (nextrecord
== NULL
)
2290 sb_rcv
->sb_lastrecord
= m
;
2293 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive ctl");
2294 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive ctl");
2296 while (cm
!= NULL
) {
2301 cmsg_type
= mtod(cm
, struct cmsghdr
*)->cmsg_type
;
2304 * Call the protocol to externalize SCM_RIGHTS message
2305 * and return the modified message to the caller upon
2306 * success. Otherwise, all other control messages are
2307 * returned unmodified to the caller. Note that we
2308 * only get into this loop if MSG_PEEK is not set.
2310 if (pr
->pr_domain
->dom_externalize
!= NULL
&&
2311 cmsg_type
== SCM_RIGHTS
) {
2313 * Release socket lock: see 3903171. This
2314 * would also allow more records to be appended
2315 * to the socket buffer. We still have SB_LOCK
2316 * set on it, so we can be sure that the head
2317 * of the mbuf chain won't change.
2319 socket_unlock(so
, 0);
2320 error
= (*pr
->pr_domain
->dom_externalize
)(cm
);
2326 if (controlp
!= NULL
&& error
== 0) {
2328 controlp
= &(*controlp
)->m_next
;
2336 if (sb_rcv
->sb_mb
!= NULL
)
2337 nextrecord
= sb_rcv
->sb_mb
->m_nextpkt
;
2343 if (!(flags
& MSG_PEEK
)) {
2345 * We get here because m points to an mbuf following
2346 * any MT_SONAME or MT_CONTROL mbufs which have been
2347 * processed above. In any case, m should be pointing
2348 * to the head of the mbuf chain, and the nextrecord
2349 * should be either NULL or equal to m->m_nextpkt.
2350 * See comments above about SB_LOCK.
2352 if (m
!= so
->so_rcv
.sb_mb
|| m
->m_nextpkt
!= nextrecord
)
2353 panic("soreceive: post-control !sync so=%p "
2354 "m=%p nextrecord=%p\n", so
, m
, nextrecord
);
2356 if (nextrecord
== NULL
)
2357 so
->so_rcv
.sb_lastrecord
= m
;
2360 if (type
== MT_OOBDATA
)
2363 if (!(flags
& MSG_PEEK
)) {
2364 so
->so_rcv
.sb_mb
= nextrecord
;
2365 SB_EMPTY_FIXUP(&so
->so_rcv
);
2368 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 2");
2369 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 2");
2374 if (!(flags
& MSG_PEEK
) && uio_resid(uio
) > sorecvmincopy
)
2381 while (m
&& (uio_resid(uio
) - delayed_copy_len
) > 0 && error
== 0) {
2382 if (m
->m_type
== MT_OOBDATA
) {
2383 if (type
!= MT_OOBDATA
)
2385 } else if (type
== MT_OOBDATA
) {
2389 * Make sure to allways set MSG_OOB event when getting
2390 * out of band data inline.
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
		if (can_delay && len == m->m_len) {
			/*
			 * only delay the copy if we're consuming the
			 * mbuf and we're NOT in MSG_PEEK mode
			 * and we have enough data to make it worthwhile
			 * to drop and retake the lock... can_delay
			 * reflects the state of the 2 latter
			 * constraints; moff should always be zero
			 */
			delayed_copy_len += len;
			if (delayed_copy_len) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);
				/*
				 * can only get here if MSG_PEEK is not
				 * set therefore, m should point at the
				 * head of the rcv queue; if it doesn't,
				 * it means something drastically
				 * changed while we were out from behind
				 * the lock in sodelayed_copy. perhaps
				 * a RST on the stream. in any event,
				 * the stream has been interrupted. it's
				 * probably best just to return whatever
				 * data we've moved and let the caller
				 */
				if (m != so->so_rcv.sb_mb) {
			socket_unlock(so, 0);
			error = uiomove(mtod(m, caddr_t) + moff,
		uio_setresid(uio, (uio_resid(uio) - len));
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR)
			if (flags & MSG_PEEK) {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = (struct mbuf *)0;
					if (free_list == NULL)
					so->so_rcv.sb_mb = m = m->m_next;
				m->m_nextpkt = nextrecord;
				if (nextrecord == NULL)
					so->so_rcv.sb_lastrecord = m;
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			if (flags & MSG_PEEK) {
				*mp = m_copym(m, 0, len, M_WAIT);
				so->so_rcv.sb_cc -= len;
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					/*
					 * delay posting the actual event until
					 * after any delayed copy processing
					 */
				if (offset == so->so_oobmark)
		if (flags & MSG_EOR)
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
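		/*
		 * Illustrative sketch (added commentary, not part of the
		 * original source): from user space this is the loop that
		 * makes a MSG_WAITALL read block until the full request is
		 * satisfied, e.g. (assuming sock is a connected stream
		 * socket):
		 *
		 *	#include <sys/socket.h>
		 *
		 *	ssize_t n = recv(sock, buf, sizeof (buf), MSG_WAITALL);
		 *
		 * The call may still return a short count if a signal is
		 * delivered, the receive timeout expires, or the connection
		 * is closed, exactly as the comment above describes.
		 */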
		while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == 0 &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || so->so_state & SS_CANTRCVMORE)
			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns.  Therefore, we only sleep in
			 * sbwait() below if the socket buffer is empty, in
			 * order to avoid a false sleep.
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
			    (((struct inpcb *)so->so_pcb)->inp_state !=
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
			/*
			 * have to wait until after we get back from the sbwait
			 * to do the copy because we will drop the lock if we
			 * have enough data that has been delayed... by dropping
			 * the lock we open up a window allowing the netisr
			 * thread to process the incoming packets and to change
			 * the state of this socket... we're issuing the sbwait
			 * because the socket is empty and we're expecting the
			 * netisr thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then sbwait
			 * we could stall forever with packets sitting in the
			 * socket if no further packets arrive from the remote
			 *
			 * we want to copy before we've collected all the data
			 * to satisfy this request to allow the copy to overlap
			 * the incoming packet processing on an MP system
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);
			m = so->so_rcv.sb_mb;
				nextrecord = m->m_nextpkt;

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1)
		panic("soreceive: after big while so=%p ref=%d on socket\n",
		    so, so->so_usecount);

	if (m && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		if ((flags & MSG_PEEK) == 0)
			(void) sbdroprecord(&so->so_rcv);
	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
		flags |= MSG_HAVEMORE;

	if ((flags & MSG_PEEK) == 0) {
		so->so_rcv.sb_mb = nextrecord;
		/*
		 * First part is an inline SB_EMPTY_FIXUP().  Second
		 * part makes sure sb_lastrecord is up-to-date if
		 * there is still data in the socket buffer.
		 */
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_mbtail = NULL;
			so->so_rcv.sb_lastrecord = NULL;
		} else if (nextrecord->m_nextpkt == NULL) {
			so->so_rcv.sb_lastrecord = nextrecord;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);

	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		m_freem_list((struct mbuf *)free_list);
		free_list = (struct mbuf *)0;
		postevent(so, 0, EV_OOB);

	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, 1);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1)
		panic("soreceive: release so=%p ref=%d on socket\n",
		    so, so->so_usecount);

	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		m_freem_list((struct mbuf *)free_list);

	sbunlock(&so->so_rcv, 0);	/* will unlock socket */

	// LP64todo - fix this!
	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);
/*
 * Returns:	0			Success
 */
sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
	socket_unlock(so, 0);

	while (m && error == 0) {
		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
	m_freem_list(*free_list);
	*free_list = (struct mbuf *)NULL;

/*
 * Returns:	0			Success
 *		<pru_shutdown>:EINVAL
 *		<pru_shutdown>:EADDRNOTAVAIL[TCP]
 *		<pru_shutdown>:ENOBUFS[TCP]
 *		<pru_shutdown>:EMSGSIZE[TCP]
 *		<pru_shutdown>:EHOSTUNREACH[TCP]
 *		<pru_shutdown>:ENETUNREACH[TCP]
 *		<pru_shutdown>:ENETDOWN[TCP]
 *		<pru_shutdown>:ENOMEM[TCP]
 *		<pru_shutdown>:EACCES[TCP]
 *		<pru_shutdown>:EMSGSIZE[TCP]
 *		<pru_shutdown>:ENOBUFS[TCP]
 *		<pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
 *		<pru_shutdown>:??? [other protocol families]
 */
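/*
 * Illustrative sketch (added commentary, not part of the original source):
 * soshutdown() backs the shutdown(2) system call.  A typical use is to
 * half-close the sending side while continuing to read the peer's reply
 * (assuming sock is a connected stream socket):
 *
 *	#include <sys/socket.h>
 *	#include <unistd.h>
 *
 *	if (shutdown(sock, SHUT_WR) == -1)
 *		perror("shutdown");
 *	while ((n = read(sock, buf, sizeof (buf))) > 0)
 *		;
 *
 * The empty loop body simply drains whatever the peer still sends.  The
 * SHUT_RD, SHUT_WR and SHUT_RDWR constants map onto the "how" argument
 * checked in soshutdownlock() below.
 */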
soshutdown(struct socket *so, int how)
	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
	error = soshutdownlock(so, how);
	socket_unlock(so, 1);

soshutdownlock(struct socket *so, int how)
	struct protosw *pr = so->so_proto;

	sflt_notify(so, sock_evt_shutdown, &how);

	if (how != SHUT_WR) {
		if ((so->so_state & SS_CANTRCVMORE) != 0) {
			/* read already shut down */
		postevent(so, 0, EV_RCLOSED);
	if (how != SHUT_RD) {
		if ((so->so_state & SS_CANTSENDMORE) != 0) {
			/* write already shut down */
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
		postevent(so, 0, EV_WCLOSED);
	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);

sorflush(struct socket *so)
	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;
#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	sflt_notify(so, sock_evt_flush_read, NULL);

	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAIT);
	selthreadclear(&sb->sb_sel);
	bzero((caddr_t)sb, sizeof (*sb));
	sb->sb_so = so;	/* reestablish link to socket */
	if (asb.sb_flags & SB_KNOTE) {
		sb->sb_sel.si_note = asb.sb_sel.si_note;
		sb->sb_flags = SB_KNOTE;
	if (asb.sb_flags & SB_DROP)
		sb->sb_flags |= SB_DROP;
	if (asb.sb_flags & SB_UNIX)
		sb->sb_flags |= SB_UNIX;
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Returns:	0			Success
 */
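/*
 * Illustrative sketch (added commentary, not part of the original source):
 * a protocol-level pr_ctloutput() handler typically consumes these helpers
 * for fixed-size option values.  The option handling and the pcb field
 * named below are hypothetical; only the calling pattern, which matches the
 * uses throughout this file, is being shown:
 *
 *	int optval, error;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval));
 *	if (error == 0)
 *		pcb->hypothetical_field = optval;
 *
 * and on the SOPT_GET side:
 *
 *	error = sooptcopyout(sopt, &optval, sizeof (optval));
 */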
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
	sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_p != kernproc)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);

/*
 * sooptcopyin_timeval
 *	Copy in a timeval value into tv_p, and take into account whether
 *	the calling process is 64-bit or 32-bit.  Moved the sanity checking
 *	code here so that we can verify the 64-bit tv_sec value before we lose
 *	the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
 */
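/*
 * Illustrative sketch (added commentary, not part of the original source):
 * the value being copied in here is the plain struct timeval that callers
 * pass for SO_RCVTIMEO/SO_SNDTIMEO, e.g.:
 *
 *	#include <sys/socket.h>
 *	#include <sys/time.h>
 *
 *	struct timeval tv = { 5, 0 };
 *	(void) setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv));
 *
 * The range checks below reject negative or out-of-range tv_sec/tv_usec
 * values before they are narrowed into the kernel timeval.
 */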
sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p)
	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval tv64;

		if (sopt->sopt_valsize < sizeof(tv64)) {
		sopt->sopt_valsize = sizeof(tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX
		    || tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
		tv_p->tv_sec = tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
		struct user32_timeval tv32;

		if (sopt->sopt_valsize < sizeof(tv32)) {
		sopt->sopt_valsize = sizeof(tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
#ifndef __LP64__ // K64todo "comparison is always false due to limited range of data type"
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX
		    || tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;

/*
 * Returns:	0			Success
 *		sooptcopyin:EINVAL
 *		sooptcopyin:EFAULT
 *		sooptcopyin_timeval:EINVAL
 *		sooptcopyin_timeval:EFAULT
 *		sooptcopyin_timeval:EDOM
 *		<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *		<pr_ctloutput>:???
 *		sflt_attach_private:??? [whatever a filter author chooses]
 *		<sf_setoption>:??? [whatever a filter author chooses]
 *
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family;
 *		all <sf_setoption> returns depend on what the filter author
 *		causes their filter to return.
 */
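/*
 * Illustrative sketch (added commentary, not part of the original source):
 * sosetopt() is the kernel half of setsockopt(2); SOL_SOCKET options are
 * handled in the switch below and anything else is passed through
 * pr_ctloutput() to the protocol.  A typical caller:
 *
 *	#include <sys/socket.h>
 *
 *	int on = 1;
 *	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, &on, sizeof (on)) == -1)
 *		perror("setsockopt");
 */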
sosetopt(struct socket *so, struct sockopt *sopt)
	struct socket_filter_entry *filter;
#if CONFIG_MACF_SOCKET
#endif /* MAC_SOCKET */

	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE))
	    == (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
		/* the socket has been shutdown, no more sockopt's */
	if (sopt->sopt_dir != SOPT_SET) {
		sopt->sopt_dir = SOPT_SET;

	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_setoption) {
			if (filtered == 0) {
				socket_unlock(so, 0);
			error = filter->sfe_filter->sf_filter.
			    sf_setoption(filter->sfe_cookie, so, sopt);
	if (filtered != 0) {
		if (error == EJUSTRETURN)

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			socket_unlock(so, 1);
		error = ENOPROTOOPT;
	switch (sopt->sopt_name) {
		error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
		so->so_linger = (sopt->sopt_name == SO_LINGER) ?
		    l.l_linger : l.l_linger * hz;
			so->so_options |= SO_LINGER;
			so->so_options &= ~SO_LINGER;

	case SO_USELOOPBACK:
	case SO_WANTOOBFLAG:
		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_options |= sopt->sopt_name;
			so->so_options &= ~sopt->sopt_name;

		error = sooptcopyin(sopt, &optval, sizeof (optval),
		/*
		 * Values < 1 make no sense for any of these
		 * options, so disallow them.
		 */
		switch (sopt->sopt_name) {
			if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
			    &so->so_snd : &so->so_rcv,
			    (u_int32_t) optval) == 0) {
			if (sopt->sopt_name == SO_SNDBUF)
				so->so_snd.sb_flags |= SB_USRSIZE;
				so->so_rcv.sb_flags |= SB_USRSIZE;

			/*
			 * Make sure the low-water is never greater than
			 * the high-water mark.
			 */
			so->so_snd.sb_lowat =
			    (optval > so->so_snd.sb_hiwat) ?
			    so->so_snd.sb_hiwat : optval;
			so->so_rcv.sb_lowat =
			    (optval > so->so_rcv.sb_hiwat) ?
			    so->so_rcv.sb_hiwat : optval;

		error = sooptcopyin_timeval(sopt, &tv);
		switch (sopt->sopt_name) {
			so->so_snd.sb_timeo = tv;
			so->so_rcv.sb_timeo = tv;

		error = sooptcopyin(sopt, &nke, sizeof (nke),
		error = sflt_attach_private(so, NULL,

		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_NOSIGPIPE;
			so->so_flags &= ~SOF_NOSIGPIPE;

		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_NOADDRAVAIL;
			so->so_flags &= ~SOF_NOADDRAVAIL;

	case SO_REUSESHAREUID:
		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_REUSESHAREUID;
			so->so_flags &= ~SOF_REUSESHAREUID;
#ifdef __APPLE_API_PRIVATE
	case SO_NOTIFYCONFLICT:
		if (kauth_cred_issuser(kauth_cred_get()) == 0) {
		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_NOTIFYCONFLICT;
			so->so_flags &= ~SOF_NOTIFYCONFLICT;

	case SO_RESTRICTIONS:
		if (kauth_cred_issuser(kauth_cred_get()) == 0) {
		error = sooptcopyin(sopt, &optval, sizeof (optval),
		so->so_restrictions = (optval & (SO_RESTRICT_DENYIN |
		    SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET));

#if CONFIG_MACF_SOCKET
		if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
		    sizeof (extmac))) != 0)
		error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
#endif /* MAC_SOCKET */

#ifdef __APPLE_API_PRIVATE
	case SO_UPCALLCLOSEWAIT:
		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_UPCALLCLOSEWAIT;
			so->so_flags &= ~SOF_UPCALLCLOSEWAIT;

		error = sooptcopyin(sopt, &optval, sizeof (optval),
			so->so_flags |= SOF_BINDRANDOMPORT;
			so->so_flags &= ~SOF_BINDRANDOMPORT;

	case SO_NP_EXTENSIONS: {
		struct so_np_extensions sonpx;

		error = sooptcopyin(sopt, &sonpx, sizeof(sonpx), sizeof(sonpx));
		if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
		/*
		 * Only one bit defined for now
		 */
		if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
			if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
				so->so_flags |= SOF_NPX_SETOPTSHUT;
				so->so_flags &= ~SOF_NPX_SETOPTSHUT;

		error = ENOPROTOOPT;
	if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
		(void) ((*so->so_proto->pr_ctloutput)(so, sopt));
	socket_unlock(so, 1);

/* Helper routines for getsockopt */
sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
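	/*
	 * Illustrative sketch (added commentary, not part of the original
	 * source): the truncation rule above is visible to callers of
	 * getsockopt(2); the kernel copies out at most the length the caller
	 * supplied and reports how much was actually copied, e.g.:
	 *
	 *	#include <sys/socket.h>
	 *
	 *	int optval = 0;
	 *	socklen_t optlen = sizeof (optval);
	 *	(void) getsockopt(sock, SOL_SOCKET, SO_SNDBUF, &optval, &optlen);
	 *
	 * On return, optlen holds the number of bytes actually written into
	 * optval.
	 */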
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(buf, sopt->sopt_val, valsize);
			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);

sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p)
	struct user64_timeval tv64;
	struct user32_timeval tv32;

	if (proc_is64bit(sopt->sopt_p)) {
		tv64.tv_sec = tv_p->tv_sec;
		tv64.tv_usec = tv_p->tv_usec;
		tv32.tv_sec = tv_p->tv_sec;
		tv32.tv_usec = tv_p->tv_usec;
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(val, sopt->sopt_val, valsize);
			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);

/*
 *		<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *		<pr_ctloutput>:???
 *		<sf_getoption>:???
 */
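/*
 * Illustrative sketch (added commentary, not part of the original source):
 * sogetopt() is the kernel half of getsockopt(2).  A common use of the
 * SO_ERROR case below is collecting the result of a non-blocking connect
 * once the socket selects writable:
 *
 *	#include <sys/socket.h>
 *
 *	int err = 0;
 *	socklen_t len = sizeof (err);
 *	if (getsockopt(sock, SOL_SOCKET, SO_ERROR, &err, &len) == 0 &&
 *	    err != 0)
 *		errno = err;
 *
 * A nonzero err indicates the asynchronous connect failed.
 */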
sogetopt(struct socket *so, struct sockopt *sopt)
	struct socket_filter_entry *filter;
#if CONFIG_MACF_SOCKET
#endif /* MAC_SOCKET */

	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;

	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_getoption) {
			if (filtered == 0) {
				socket_unlock(so, 0);
			error = filter->sfe_filter->sf_filter.
			    sf_getoption(filter->sfe_cookie, so, sopt);
	if (filtered != 0) {
		if (error == EJUSTRETURN)
			socket_unlock(so, 1);

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			socket_unlock(so, 1);
		socket_unlock(so, 1);
		return (ENOPROTOOPT);
	switch (sopt->sopt_name) {
		l.l_onoff = so->so_options & SO_LINGER;
		l.l_linger = (sopt->sopt_name == SO_LINGER) ?
		    so->so_linger : so->so_linger / hz;
		error = sooptcopyout(sopt, &l, sizeof (l));

	case SO_USELOOPBACK:
	case SO_WANTOOBFLAG:
		optval = so->so_options & sopt->sopt_name;
		error = sooptcopyout(sopt, &optval, sizeof (optval));

		optval = so->so_type;

		if (so->so_proto->pr_flags & PR_ATOMIC) {
			m1 = so->so_rcv.sb_mb;
			if (m1->m_type == MT_DATA || m1->m_type == MT_HEADER ||
			    m1->m_type == MT_OOBDATA)
				pkt_total += m1->m_len;
			optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

		optval = so->so_snd.sb_cc;

		optval = so->so_error;

		optval = so->so_snd.sb_hiwat;

		optval = so->so_rcv.sb_hiwat;

		optval = so->so_snd.sb_lowat;

		optval = so->so_rcv.sb_lowat;

		tv = (sopt->sopt_name == SO_SNDTIMEO ?
		    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
		error = sooptcopyout_timeval(sopt, &tv);

		optval = (so->so_flags & SOF_NOSIGPIPE);

		optval = (so->so_flags & SOF_NOADDRAVAIL);

	case SO_REUSESHAREUID:
		optval = (so->so_flags & SOF_REUSESHAREUID);

#ifdef __APPLE_API_PRIVATE
	case SO_NOTIFYCONFLICT:
		optval = (so->so_flags & SOF_NOTIFYCONFLICT);

	case SO_RESTRICTIONS:
		optval = so->so_restrictions & (SO_RESTRICT_DENYIN |
		    SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET);

#if CONFIG_MACF_SOCKET
		if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
		    sizeof (extmac))) != 0 ||
		    (error = mac_socket_label_get(proc_ucred(
		    sopt->sopt_p), so, &extmac)) != 0)
		error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#endif /* MAC_SOCKET */

#if CONFIG_MACF_SOCKET
		if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
		    sizeof (extmac))) != 0 ||
		    (error = mac_socketpeer_label_get(proc_ucred(
		    sopt->sopt_p), so, &extmac)) != 0)
		error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#endif /* MAC_SOCKET */

#ifdef __APPLE_API_PRIVATE
	case SO_UPCALLCLOSEWAIT:
		optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);

		optval = (so->so_flags & SOF_BINDRANDOMPORT);

	case SO_NP_EXTENSIONS: {
		struct so_np_extensions sonpx;

		sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
		    SONPX_SETOPTSHUT : 0;
		sonpx.npx_mask = SONPX_MASK_VALID;

		error = sooptcopyout(sopt, &sonpx,
		    sizeof(struct so_np_extensions));

		error = ENOPROTOOPT;
	socket_unlock(so, 1);
/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	if (sopt_size > MAX_SOOPTGETM_SIZE)

	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (sopt_size > MLEN) {
		if ((m->m_flags & M_EXT) == 0) {
		m->m_len = min(MCLBYTES, sopt_size);
		m->m_len = min(MLEN, sopt_size);
	sopt_size -= m->m_len;

		MGET(m, how, MT_DATA);
		if (sopt_size > MLEN) {
			if ((m->m_flags & M_EXT) == 0) {
			m->m_len = min(MCLBYTES, sopt_size);
			m->m_len = min(MLEN, sopt_size);
		sopt_size -= m->m_len;

/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL)
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, mtod(m, char *),
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
			    mtod(m, char *), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
	if (m != NULL)	/* enough should have been allocated in ip6_sooptmcopyin() */
		panic("soopt_mcopyin");

/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL)
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			error = copyout(mtod(m, char *), sopt->sopt_val,
			bcopy(mtod(m, char *),
			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;
	/* enough soopt buffer should be given from user-land */
	sopt->sopt_valsize = valsize;
sohasoutofband(struct socket *so)
	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0)
		proc_signal(so->so_pgid, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);

sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql)
	struct proc *p = current_proc();

	if (events & (POLLIN | POLLRDNORM))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_rcv.sb_flags |= SB_SEL;
			selrecord(p, &so->so_rcv.sb_sel, wql);

		if (events & (POLLOUT | POLLWRNORM)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_snd.sb_flags |= SB_SEL;
			selrecord(p, &so->so_snd.sb_sel, wql);

	socket_unlock(so, 1);
soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
    __unused struct proc *p)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

#if CONFIG_MACF_SOCKET
	if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
		socket_unlock(so, 1);
#endif /* MAC_SOCKET */

	switch (kn->kn_filter) {
		kn->kn_fop = &soread_filtops;
		kn->kn_fop = &sowrite_filtops;
		socket_unlock(so, 1);

	if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn))
		sb->sb_flags |= SB_KNOTE;
	socket_unlock(so, 1);

filt_sordetach(struct knote *kn)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if (so->so_rcv.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
			so->so_rcv.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);

filt_soread(struct knote *kn, long hint)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)

	if (so->so_options & SO_ACCEPTCONN) {
		/*
		 * Radar 6615193: handle the listen case dynamically for the
		 * kqueue read filter.  This allows listen() to be called
		 * after the kqueue EVFILT_READ registration.
		 */
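		/*
		 * Illustrative sketch (added commentary, not part of the
		 * original source): because of the dynamic check above,
		 * EVFILT_READ can be registered before or after listen(2);
		 * kn_data (taken from so_qlen here) then reflects the
		 * pending-connection queue.  Assuming lsock is a bound
		 * stream socket:
		 *
		 *	#include <sys/event.h>
		 *	#include <sys/socket.h>
		 *
		 *	int kq = kqueue();
		 *	struct kevent ev;
		 *	EV_SET(&ev, lsock, EVFILT_READ, EV_ADD, 0, 0, NULL);
		 *	(void) kevent(kq, &ev, 1, NULL, 0, NULL);
		 *	(void) listen(lsock, SOMAXCONN);
		 *
		 * When the event fires, ev.data is the number of connections
		 * waiting to be accept(2)ed.
		 */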
		kn->kn_data = so->so_qlen;
		isempty = ! TAILQ_EMPTY(&so->so_comp);

		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	/* socket isn't a listener */

	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (so->so_oobmark) {
		if (kn->kn_flags & EV_OOBAND) {
			kn->kn_data -= so->so_oobmark;
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);
		kn->kn_data = so->so_oobmark;
		kn->kn_flags |= EV_OOBAND;
	if (so->so_state & SS_CANTRCVMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);
	if (so->so_state & SS_RCVATMARK) {
		if (kn->kn_flags & EV_OOBAND) {
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);
		kn->kn_flags |= EV_OOBAND;
	} else if (kn->kn_flags & EV_OOBAND) {
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	if (so->so_error) {	/* temporary udp error */
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);

	return ((kn->kn_flags & EV_OOBAND) ||
	    kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
	    kn->kn_sdata : so->so_rcv.sb_lowat));
filt_sowdetach(struct knote *kn)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if (so->so_snd.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
			so->so_snd.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);

filt_sowrite(struct knote *kn, long hint)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)

	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);
	if (so->so_error) {	/* temporary udp error */
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);
	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);
	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.sb_lowat);
#define	SO_LOCK_HISTORY_STR_LEN (2 * SO_LCKDBG_MAX * (2 + sizeof(void *) + 1) + 1)

__private_extern__ const char * solockhistory_nr(struct socket *so)
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += snprintf(lock_history_str + n, SO_LOCK_HISTORY_STR_LEN - n, "%lx:%lx ",
		    (uintptr_t) so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    (uintptr_t) so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	return lock_history_str;

socket_lock(struct socket *so, int refcount)
	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;

socket_unlock(struct socket *so, int refcount)
	lck_mtx_t *mutex_held;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto == NULL)
		panic("socket_unlock null so_proto so=%p\n", so);

	if (so && so->so_proto->pr_unlock) {
		error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;

		if (so->so_usecount <= 0)
			panic("socket_unlock: bad refcount=%d so=%p (%d, %d, %d) lrh=%s",
			    so->so_usecount, so, so->so_proto->pr_domain->dom_family,
			    so->so_type, so->so_proto->pr_protocol,
			    solockhistory_nr(so));

		if (so->so_usecount == 0) {
			sofreelastref(so, 1);
		lck_mtx_unlock(mutex_held);
/* Called with socket locked, will unlock socket */
sofree(struct socket *so)
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);

soreference(struct socket *so)
	socket_lock(so, 1);	/* locks & take one reference on socket */
	socket_unlock(so, 0);	/* unlock only */

sodereference(struct socket *so)
	socket_unlock(so, 1);

/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters.  Caller must ensure to hold
 * the socket lock.
 */
somultipages(struct socket *so, boolean_t set)
		so->so_flags |= SOF_MULTIPAGES;
		so->so_flags &= ~SOF_MULTIPAGES;

so_isdstlocal(struct socket *so) {

	struct inpcb *inp = (struct inpcb *)so->so_pcb;

	if (so->so_proto->pr_domain->dom_family == AF_INET) {
		return inaddr_local(inp->inp_faddr);
	} else if (so->so_proto->pr_domain->dom_family == AF_INET6) {
		return in6addr_local(&inp->in6p_faddr);