/*
 * Copyright (c) 1998-2007 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 * $FreeBSD: src/sys/kern/uipc_socket.c,v 1.68.2.16 2001/06/14 20:46:06 ume Exp $
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/kdebug.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <security/mac.h>
#include <security/mac_framework.h>
/* how a timeval looks to a 64-bit process */

int			so_cache_timeouts = 0;
int			so_cache_max_freed = 0;
int			cached_sock_count = 0;
struct socket		*socket_cache_head = 0;
struct socket		*socket_cache_tail = 0;
u_long			so_cache_time = 0;
int			so_cache_init_done = 0;
struct zone		*so_cache_zone;

static lck_grp_t	*so_cache_mtx_grp;
static lck_attr_t	*so_cache_mtx_attr;
static lck_grp_attr_t	*so_cache_mtx_grp_attr;
lck_mtx_t		*so_cache_mtx;
#include <machine/limits.h>

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_solisten(struct knote *kn, long hint);

static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p);

static int
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p);

static struct filterops solisten_filtops =
	{ 1, NULL, filt_sordetach, filt_solisten };
static struct filterops soread_filtops =
	{ 1, NULL, filt_sordetach, filt_soread };
static struct filterops sowrite_filtops =
	{ 1, NULL, filt_sowdetach, filt_sowrite };
#define	EVEN_MORE_LOCKING_DEBUG	0
int socket_debug = 0;
int socket_zone = M_SOCKET;
so_gen_t so_gencnt;		/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)
SYSCTL_DECL(_kern_ipc);

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, CTLFLAG_RW, &sosendminchain,
    0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy, CTLFLAG_RW, &sorecvmincopy,
    0, "");
/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, CTLFLAG_RW, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, CTLFLAG_RW,
    &sosendjcl_ignore_capab, 0, "");
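/*
 * Illustrative note (added commentary, not part of the original source):
 * the SYSCTL_INT entries above publish these tunables under the kern.ipc
 * namespace, so, assuming the standard sysctl(8) utility, a test setup
 * could read the listen-queue ceiling with "sysctl kern.ipc.somaxconn"
 * or enable the jumbo-cluster override with
 * "sysctl -w kern.ipc.sosendjcl_ignore_capab=1".
 */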
/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

extern void postevent(struct socket *, struct sockbuf *, int);
extern void evsofree(struct socket *);

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);
extern struct domain *pffinddomain(int);
extern struct protosw *pffindprotonotype(int, int);
extern int soclose_locked(struct socket *);
extern int soo_kqfilter(struct fileproc *, struct knote *, struct proc *);
vm_size_t	so_cache_zone_element_size;

static int	sodelayed_copy(struct socket *, struct uio *, struct mbuf **, int *);
static void	cached_sock_alloc(struct socket **, int);
static void	cached_sock_free(struct socket *);
static void	so_cache_timer(void *);

void soclose_wait_locked(struct socket *so);
void
socketinit(void)
{
    vm_size_t str_size;

    if (so_cache_init_done) {
        printf("socketinit: already called...\n");
        return;
    }

    PE_parse_boot_arg("socket_debug", &socket_debug);

    /*
     * allocate lock group attribute and group for socket cache mutex
     */
    so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();

    so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
        so_cache_mtx_grp_attr);

    /*
     * allocate the lock attribute for socket cache mutex
     */
    so_cache_mtx_attr = lck_attr_alloc_init();

    so_cache_init_done = 1;

    /* cached sockets mutex */
    so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);

    if (so_cache_mtx == NULL)
        return;        /* we're hosed... */

    str_size = (vm_size_t)(sizeof (struct socket) + 4 +
        get_inpcb_str_size() + 4 + get_tcp_str_size());

    so_cache_zone = zinit(str_size, 120000*str_size, 8192, "socache zone");
    printf("cached_sock_alloc -- so_cache_zone size is %x\n", str_size);

    timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));

    so_cache_zone_element_size = str_size;
}
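/*
 * Illustrative note (added commentary, not part of the original source):
 * each so_cache_zone element sized by str_size above is one contiguous
 * block holding a struct socket, up to 4 bytes of alignment padding, the
 * inpcb, more padding, and the TCP pcb.  cached_sock_alloc() below carves
 * the two PCBs out of that single zalloc'd block instead of allocating
 * them separately.
 */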
static void
cached_sock_alloc(struct socket **so, int waitok)
{
    caddr_t temp;
    register u_long offset;

    lck_mtx_lock(so_cache_mtx);

    if (cached_sock_count) {
        cached_sock_count--;
        *so = socket_cache_head;
        if (*so == 0)
            panic("cached_sock_alloc: cached sock is null");

        socket_cache_head = socket_cache_head->cache_next;
        if (socket_cache_head)
            socket_cache_head->cache_prev = 0;
        else
            socket_cache_tail = 0;

        lck_mtx_unlock(so_cache_mtx);

        temp = (*so)->so_saved_pcb;
        bzero((caddr_t)*so, sizeof (struct socket));

        kprintf("cached_sock_alloc - retrieving cached sock %p - "
            "count == %d\n", *so, cached_sock_count);

        (*so)->so_saved_pcb = temp;
        (*so)->cached_in_sock_layer = 1;
    } else {
        kprintf("Allocating cached sock %p from memory\n", *so);

        lck_mtx_unlock(so_cache_mtx);

        if (waitok)
            *so = (struct socket *)zalloc(so_cache_zone);
        else
            *so = (struct socket *)zalloc_noblock(so_cache_zone);

        bzero((caddr_t)*so, sizeof (struct socket));

        /*
         * Define offsets for extra structures into our single block of
         * memory. Align extra structures on longword boundaries.
         */
        offset = (u_long) *so;
        offset += sizeof (struct socket);

        offset &= 0xfffffffc;

        (*so)->so_saved_pcb = (caddr_t)offset;
        offset += get_inpcb_str_size();

        offset &= 0xfffffffc;

        ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb =
            (caddr_t)offset;

        kprintf("Allocating cached socket - %p, pcb=%p tcpcb=%p\n",
            *so, (*so)->so_saved_pcb,
            ((struct inpcb *)(*so)->so_saved_pcb)->inp_saved_ppcb);
    }

    (*so)->cached_in_sock_layer = 1;
}
static void
cached_sock_free(struct socket *so)
{

    lck_mtx_lock(so_cache_mtx);

    if (++cached_sock_count > MAX_CACHED_SOCKETS) {
        lck_mtx_unlock(so_cache_mtx);

        kprintf("Freeing overflowed cached socket %p\n", so);

        zfree(so_cache_zone, so);
    } else {
        kprintf("Freeing socket %p into cache\n", so);

        if (so_cache_hw < cached_sock_count)
            so_cache_hw = cached_sock_count;

        so->cache_next = socket_cache_head;

        if (socket_cache_head)
            socket_cache_head->cache_prev = so;
        else
            socket_cache_tail = so;

        so->cache_timestamp = so_cache_time;
        socket_cache_head = so;
        lck_mtx_unlock(so_cache_mtx);

        kprintf("Freed cached sock %p into cache - count is %d\n",
            so, cached_sock_count);
    }
}
static void
so_cache_timer(__unused void *dummy)
{
    register struct socket  *p;
    register int            n_freed = 0;

    lck_mtx_lock(so_cache_mtx);

    while ((p = socket_cache_tail)) {
        if ((so_cache_time - p->cache_timestamp) < SO_CACHE_TIME_LIMIT)
            break;

        if ((socket_cache_tail = p->cache_prev))
            p->cache_prev->cache_next = 0;
        if (--cached_sock_count == 0)
            socket_cache_head = 0;

        zfree(so_cache_zone, p);

        if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
            so_cache_max_freed++;
            break;
        }
    }
    lck_mtx_unlock(so_cache_mtx);

    timeout(so_cache_timer, NULL, (SO_CACHE_FLUSH_INTERVAL * hz));
}
#endif /* __APPLE__ */
/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
static struct socket *
soalloc(int waitok, int dom, int type)
{
    struct socket *so;

    if ((dom == PF_INET) && (type == SOCK_STREAM)) {
        cached_sock_alloc(&so, waitok);
    } else {
        MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
            M_WAITOK);
        if (so != NULL)
            bzero(so, sizeof (*so));
    }
    if (so != NULL) {
        /* XXX race condition for reentrant kernel */
        //###LD Atomic add for so_gencnt
        so->so_gencnt = ++so_gencnt;
        so->so_zone = socket_zone;
#if CONFIG_MACF_SOCKET
        /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
        if (mac_socket_label_init(so, !waitok) != 0) {
            sodealloc(so);
            return (NULL);
        }
#endif /* MAC_SOCKET */
    }

    return (so);
}
/*
 * Returns:	0			Success
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:EISCONN[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
    struct proc *p = current_proc();
    register struct protosw *prp;
    register struct socket *so;
    register int error = 0;

    extern int tcpconsdebug;

    if (proto)
        prp = pffindproto(dom, proto, type);
    else
        prp = pffindtype(dom, type);

    if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) {
        if (pffinddomain(dom) == NULL) {
            return (EAFNOSUPPORT);
        }
        if (proto != 0) {
            if (pffindprotonotype(dom, proto) != NULL) {
                return (EPROTOTYPE);
            }
        }
        return (EPROTONOSUPPORT);
    }
    if (prp->pr_type != type)
        return (EPROTOTYPE);
    so = soalloc(p != 0, dom, type);
    if (so == 0)
        return (ENOBUFS);

    TAILQ_INIT(&so->so_incomp);
    TAILQ_INIT(&so->so_comp);
    so->so_type = type;

    so->so_uid = kauth_cred_getuid(kauth_cred_get());
    if (!suser(kauth_cred_get(), NULL))
        so->so_state = SS_PRIV;

    so->so_proto = prp;
    so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
    so->so_rcv.sb_so = so->so_snd.sb_so = so;

    so->next_lock_lr = 0;
    so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
    mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

//### Attachment will create the per pcb lock if necessary and increase refcount
    /*
     * for creation, make sure it's done before
     * socket is inserted in lists
     */
    so->so_usecount++;

    error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
    if (error) {
        /*
         * If so_pcb is not zero, the socket will be leaked,
         * so protocol attachment handler must be coded carefully
         */
        so->so_state |= SS_NOFDREF;
        so->so_usecount--;
        sofreelastref(so, 1);	/* will deallocate the socket */
        return (error);
    }

    prp->pr_domain->dom_refs++;
    TAILQ_INIT(&so->so_evlist);

    /* Attach socket filters for this protocol */
    sflt_initsock(so);

    if (tcpconsdebug == 2)
        so->so_options |= SO_DEBUG;

    *aso = so;
    return (0);
}
/*
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobind(struct socket *so, struct sockaddr *nam)
{
    struct proc *p = current_proc();
    int error = 0;
    struct socket_filter_entry *filter;
    int filtered = 0;

    socket_lock(so, 1);

    /*
     * If this is a bind request on a previously-accepted socket
     * that has been marked as inactive, reject it now before
     * we go any further.
     */
    if (so->so_flags & SOF_DEFUNCT) {
        error = EINVAL;
        goto out;
    }

    /* Socket filter */
    error = 0;
    for (filter = so->so_filt; filter && (error == 0);
        filter = filter->sfe_next_onsocket) {
        if (filter->sfe_filter->sf_filter.sf_bind) {
            if (filtered == 0) {
                filtered = 1;
                sflt_use(so);
                socket_unlock(so, 0);
            }
            error = filter->sfe_filter->sf_filter.
                sf_bind(filter->sfe_cookie, so, nam);
        }
    }
    if (filtered != 0) {
        socket_lock(so, 0);
        sflt_unuse(so);
    }
    /* End socket filter */

    if (error == 0)
        error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
out:
    socket_unlock(so, 1);

    if (error == EJUSTRETURN)
        error = 0;

    return (error);
}
static void
sodealloc(struct socket *so)
{
    so->so_gencnt = ++so_gencnt;

#if CONFIG_MACF_SOCKET
    mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */
    if (so->cached_in_sock_layer == 1) {
        cached_sock_free(so);
    } else {
        if (so->cached_in_sock_layer == -1)
            panic("sodealloc: double dealloc: so=%p\n", so);
        so->cached_in_sock_layer = -1;
        FREE_ZONE(so, sizeof (*so), so->so_zone);
    }
}
/*
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
    struct proc *p = current_proc();
    int error = 0;
    struct socket_filter_entry *filter;
    int filtered = 0;

    socket_lock(so, 1);

    if (so->so_proto == NULL) {
        error = EINVAL;
        goto out;
    }
    if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
        error = EOPNOTSUPP;
        goto out;
    }

    /*
     * If the listen request is made on a socket that is not fully
     * disconnected, or on a previously-accepted socket that has
     * been marked as inactive, reject the request now.
     */
    if ((so->so_state &
        (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
        (so->so_flags & SOF_DEFUNCT)) {
        error = EINVAL;
        goto out;
    }

    if ((so->so_restrictions & SO_RESTRICT_DENYIN) != 0) {
        error = EPERM;
        goto out;
    }

    for (filter = so->so_filt; filter && (error == 0);
        filter = filter->sfe_next_onsocket) {
        if (filter->sfe_filter->sf_filter.sf_listen) {
            if (filtered == 0) {
                filtered = 1;
                sflt_use(so);
                socket_unlock(so, 0);
            }
            error = filter->sfe_filter->sf_filter.
                sf_listen(filter->sfe_cookie, so);
        }
    }
    if (filtered != 0) {
        socket_lock(so, 0);
        sflt_unuse(so);
    }

    if (error == 0)
        error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);

    if (error) {
        if (error == EJUSTRETURN)
            error = 0;
        goto out;
    }

    if (TAILQ_EMPTY(&so->so_comp))
        so->so_options |= SO_ACCEPTCONN;
    /*
     * POSIX: The implementation may have an upper limit on the length of
     * the listen queue-either global or per accepting socket. If backlog
     * exceeds this limit, the length of the listen queue is set to the
     * limit.
     *
     * If listen() is called with a backlog argument value that is less
     * than 0, the function behaves as if it had been called with a backlog
     * argument value of 0.
     *
     * A backlog argument of 0 may allow the socket to accept connections,
     * in which case the length of the listen queue may be set to an
     * implementation-defined minimum value.
     */
    if (backlog <= 0 || backlog > somaxconn)
        backlog = somaxconn;

    so->so_qlimit = backlog;
out:
    socket_unlock(so, 1);
    return (error);
}
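/*
 * Illustrative note (added commentary, not part of the original source):
 * with the clamping above and the default somaxconn of SOMAXCONN (128),
 * a caller issuing listen(s, -1) or listen(s, 10000) ends up with
 * so_qlimit == somaxconn, while listen(s, 5) keeps so_qlimit at 5.
 */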
void
sofreelastref(struct socket *so, int dealloc)
{
    struct socket *head = so->so_head;

    /* Assume socket is locked */

    /* Remove any filters - may be called more than once */

    if ((!(so->so_flags & SOF_PCBCLEARING)) ||
        ((so->so_state & SS_NOFDREF) == 0)) {
        selthreadclear(&so->so_snd.sb_sel);
        selthreadclear(&so->so_rcv.sb_sel);
        so->so_rcv.sb_flags &= ~SB_UPCALL;
        so->so_snd.sb_flags &= ~SB_UPCALL;
        return;
    }
    if (head != NULL) {
        socket_lock(head, 1);
        if (so->so_state & SS_INCOMP) {
            TAILQ_REMOVE(&head->so_incomp, so, so_list);
        } else if (so->so_state & SS_COMP) {
            /*
             * We must not decommission a socket that's
             * on the accept(2) queue. If we do, then
             * accept(2) may hang after select(2) indicated
             * that the listening socket was ready.
             */
            selthreadclear(&so->so_snd.sb_sel);
            selthreadclear(&so->so_rcv.sb_sel);
            so->so_rcv.sb_flags &= ~SB_UPCALL;
            so->so_snd.sb_flags &= ~SB_UPCALL;
            socket_unlock(head, 1);
            return;
        } else {
            panic("sofree: not queued");
        }
        so->so_state &= ~SS_INCOMP;
        socket_unlock(head, 1);
    }

    selthreadclear(&so->so_snd.sb_sel);
    sbrelease(&so->so_snd);

    /* 3932268: disable upcall */
    so->so_rcv.sb_flags &= ~SB_UPCALL;
    so->so_snd.sb_flags &= ~SB_UPCALL;

    if (dealloc)
        sodealloc(so);
}
void
soclose_wait_locked(struct socket *so)
{
    lck_mtx_t *mutex_held;

    if (so->so_proto->pr_getlock != NULL)
        mutex_held = (*so->so_proto->pr_getlock)(so, 0);
    else
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

    /* Double check here and return if there's no outstanding upcall */
    if (!(so->so_flags & SOF_UPCALLINUSE))
        return;

    so->so_flags |= SOF_CLOSEWAIT;
    (void) msleep((caddr_t)&so->so_upcall, mutex_held, (PZERO - 1),
        "soclose_wait_locked", NULL);
    lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
    so->so_flags &= ~SOF_CLOSEWAIT;
}
/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
    int error = 0;
    lck_mtx_t *mutex_held;
    struct timespec ts;

    if (so->so_usecount == 0) {
        panic("soclose: so=%p refcount=0\n", so);
    }

    sflt_notify(so, sock_evt_closing, NULL);
    if ((so->so_options & SO_ACCEPTCONN)) {
        struct socket *sp, *sonext;

        /*
         * We do not want new connection to be added
         * to the connection queues
         */
        so->so_options &= ~SO_ACCEPTCONN;

        for (sp = TAILQ_FIRST(&so->so_incomp); sp != NULL; sp = sonext) {
            sonext = TAILQ_NEXT(sp, so_list);

            /*
             * skip sockets thrown away by tcpdropdropblreq
             * they will get cleanup by the garbage collection.
             * otherwise, remove the incomp socket from the queue
             * and let soabort trigger the appropriate cleanup.
             */
            if (sp->so_flags & SOF_OVERFLOW)
                continue;

            if (so->so_proto->pr_getlock != NULL) {
                /*
                 * lock ordering for consistency with the rest of the
                 * stack, we lock the socket first and then grab the head.
                 */
                socket_unlock(so, 0);
                socket_lock(sp, 1);
                socket_lock(so, 0);
            }

            TAILQ_REMOVE(&so->so_incomp, sp, so_list);

            if (sp->so_state & SS_INCOMP) {
                sp->so_state &= ~SS_INCOMP;
            }

            socket_unlock(sp, 1);
        }

        while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
            /* Dequeue from so_comp since sofree() won't do it */
            TAILQ_REMOVE(&so->so_comp, sp, so_list);

            if (so->so_proto->pr_getlock != NULL) {
                socket_unlock(so, 0);
                socket_lock(sp, 1);
            }

            if (sp->so_state & SS_COMP) {
                sp->so_state &= ~SS_COMP;
            }

            if (so->so_proto->pr_getlock != NULL) {
                socket_unlock(sp, 1);
                socket_lock(so, 0);
            }
        }
    }
->so_pcb
== 0) {
939 /* 3915887: mark the socket as ready for dealloc */
940 so
->so_flags
|= SOF_PCBCLEARING
;
943 if (so
->so_state
& SS_ISCONNECTED
) {
944 if ((so
->so_state
& SS_ISDISCONNECTING
) == 0) {
945 error
= sodisconnectlocked(so
);
949 if (so
->so_options
& SO_LINGER
) {
950 if ((so
->so_state
& SS_ISDISCONNECTING
) &&
951 (so
->so_state
& SS_NBIO
))
953 if (so
->so_proto
->pr_getlock
!= NULL
)
954 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, 0);
956 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
957 while (so
->so_state
& SS_ISCONNECTED
) {
958 ts
.tv_sec
= (so
->so_linger
/100);
959 ts
.tv_nsec
= (so
->so_linger
% 100) *
960 NSEC_PER_USEC
* 1000 * 10;
961 error
= msleep((caddr_t
)&so
->so_timeo
,
962 mutex_held
, PSOCK
| PCATCH
, "soclose", &ts
);
965 * It's OK when the time fires,
966 * don't report an error
968 if (error
== EWOULDBLOCK
)
976 if (so
->so_usecount
== 0)
977 panic("soclose: usecount is zero so=%p\n", so
);
978 if (so
->so_pcb
&& !(so
->so_flags
& SOF_PCBCLEARING
)) {
979 int error2
= (*so
->so_proto
->pr_usrreqs
->pru_detach
)(so
);
983 if (so
->so_usecount
<= 0)
984 panic("soclose: usecount is zero so=%p\n", so
);
986 if (so
->so_pcb
&& so
->so_state
& SS_NOFDREF
)
987 panic("soclose: NOFDREF");
988 so
->so_state
|= SS_NOFDREF
;
990 so
->so_proto
->pr_domain
->dom_refs
--;
int
soclose(struct socket *so)
{
    int error = 0;

    socket_lock(so, 1);

    if (so->so_flags & SOF_UPCALLINUSE)
        soclose_wait_locked(so);

    if (so->so_retaincnt == 0) {
        error = soclose_locked(so);
    } else {
        /*
         * if the FD is going away, but socket is
         * retained in kernel remove its reference
         */
        so->so_usecount--;
        if (so->so_usecount < 2)
            panic("soclose: retaincnt non null and so=%p "
                "usecount=%d\n", so, so->so_usecount);
    }
    socket_unlock(so, 1);
    return (error);
}
/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
    int error;

#ifdef MORE_LOCKING_DEBUG
    lck_mtx_t *mutex_held;

    if (so->so_proto->pr_getlock != NULL)
        mutex_held = (*so->so_proto->pr_getlock)(so, 0);
    else
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

    if ((so->so_flags & SOF_ABORTED) == 0) {
        so->so_flags |= SOF_ABORTED;
        error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
        if (error)
            return (error);
    }
    return (0);
}
int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
    int error;

    if (dolock)
        socket_lock(so, 1);

    if ((so->so_state & SS_NOFDREF) == 0)
        panic("soaccept: !NOFDREF");
    so->so_state &= ~SS_NOFDREF;
    error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

    if (dolock)
        socket_unlock(so, 1);
    return (error);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
    return (soacceptlock(so, nam, 1));
}
int
soacceptfilter(struct socket *so)
{
    struct sockaddr *local = NULL, *remote = NULL;
    struct socket_filter_entry *filter;
    int error = 0, filtered = 0;
    struct socket *head = so->so_head;

    /*
     * There's no need to hold the lock; this socket
     * has not been made visible to the filter(s).
     */
    if ((sock_getaddr(so, &remote, 1) != 0) ||
        sock_getaddr(so, &local, 0) != 0) {
        so->so_state &= ~(SS_NOFDREF | SS_COMP);
        sofree(so);
        /* Out of resources; try it again next time */
        error = ECONNABORTED;
        goto done;
    }

    /*
     * At this point, we have a reference on the listening socket
     * so we know it won't be going away. Do the same for the newly
     * accepted socket while we invoke the accept callback routine.
     */
    socket_lock(so, 1);
    for (filter = so->so_filt; filter != NULL && error == 0;
        filter = filter->sfe_next_onsocket) {
        if (filter->sfe_filter->sf_filter.sf_accept != NULL) {
            if (filtered == 0) {
                filtered = 1;
                sflt_use(so);
                socket_unlock(so, 0);
            }
            error = filter->sfe_filter->sf_filter.
                sf_accept(filter->sfe_cookie,
                head, so, local, remote);
        }
    }
    if (filtered != 0) {
        socket_lock(so, 0);
        sflt_unuse(so);
    }

    /*
     * If we get EJUSTRETURN from one of the filters, mark this socket
     * as inactive and return it anyway. This newly accepted socket
     * will be disconnected later before we hand it off to the caller.
     */
    if (error == EJUSTRETURN) {
        error = 0;
        so->so_flags |= SOF_DEFUNCT;
        /* Prevent data from being appended to the socket buffers */
        so->so_snd.sb_flags |= SB_DROP;
        so->so_rcv.sb_flags |= SB_DROP;
    }

    if (error != 0) {
        /*
         * This may seem like a duplication to the above error
         * handling part when we return ECONNABORTED, except
         * the following is done while holding the lock since
         * the socket has been exposed to the filter(s) earlier.
         */
        so->so_state &= ~(SS_NOFDREF | SS_COMP);
        sofree(so);
        socket_unlock(so, 1);
        /* Propagate socket filter's error code to the caller */
    } else {
        socket_unlock(so, 1);
    }
done:
    /* Callee checks for NULL pointer */
    sock_freeaddr(remote);
    sock_freeaddr(local);

    return (error);
}
/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
    int error;
    struct proc *p = current_proc();

    if (dolock)
        socket_lock(so, 1);

    /*
     * If this is a listening socket or if this is a previously-accepted
     * socket that has been marked as inactive, reject the connect request.
     */
    if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
        if (dolock)
            socket_unlock(so, 1);
        return (EOPNOTSUPP);
    }

    if ((so->so_restrictions & SO_RESTRICT_DENYOUT) != 0) {
        if (dolock)
            socket_unlock(so, 1);
        return (EPERM);
    }

    /*
     * If protocol is connection-based, can only connect once.
     * Otherwise, if connected, try to disconnect first.
     * This allows user to disconnect by connecting to, e.g.,
     * a null address.
     */
    if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
        ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
        (error = sodisconnectlocked(so)))) {
        error = EISCONN;
    } else {
        /*
         * Run connect filter before calling protocol:
         * - non-blocking connect returns before completion;
         */
        struct socket_filter_entry *filter;
        int filtered = 0;

        error = 0;
        for (filter = so->so_filt; filter && (error == 0);
            filter = filter->sfe_next_onsocket) {
            if (filter->sfe_filter->sf_filter.sf_connect_out) {
                if (filtered == 0) {
                    filtered = 1;
                    sflt_use(so);
                    socket_unlock(so, 0);
                }
                error = filter->sfe_filter->sf_filter.
                    sf_connect_out(filter->sfe_cookie, so, nam);
            }
        }
        if (filtered != 0) {
            socket_lock(so, 0);
            sflt_unuse(so);
        }

        if (error) {
            if (error == EJUSTRETURN)
                error = 0;
            if (dolock)
                socket_unlock(so, 1);
            return (error);
        }

        error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p);
    }
    if (dolock)
        socket_unlock(so, 1);
    return (error);
}
int
soconnect(struct socket *so, struct sockaddr *nam)
{
    return (soconnectlock(so, nam, 1));
}
/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
    int error;

    socket_lock(so1, 1);
    if (so2->so_proto->pr_lock)
        socket_lock(so2, 1);

    error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

    socket_unlock(so1, 1);
    if (so2->so_proto->pr_lock)
        socket_unlock(so2, 1);
    return (error);
}
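/*
 * Background note (added commentary, not part of the original source): in
 * BSD-derived stacks soconnect2() is the in-kernel primitive used to wire
 * two sockets directly to each other without a network address, most
 * visibly from the socketpair(2) path for local (AF_UNIX) sockets, which
 * is why only the AF_UNIX <pru_connect2> returns are enumerated in the
 * comment above.
 */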
int
sodisconnectlocked(struct socket *so)
{
    int error;

    if ((so->so_state & SS_ISCONNECTED) == 0) {
        error = ENOTCONN;
        goto bad;
    }
    if (so->so_state & SS_ISDISCONNECTING) {
        error = EALREADY;
        goto bad;
    }

    error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);

    if (error == 0)
        sflt_notify(so, sock_evt_disconnected, NULL);
bad:
    return (error);
}

/* Locking version */
int
sodisconnect(struct socket *so)
{
    int error;

    socket_lock(so, 1);
    error = sodisconnectlocked(so);
    socket_unlock(so, 1);
    return (error);
}
#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_DONTWAIT : M_WAIT)
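/*
 * Illustrative note (added commentary, not part of the original source):
 * the macro above is how MSG_DONTWAIT propagates into the send and receive
 * paths; sblock(..., SBLOCKWAIT(flags)) is asked not to sleep (M_DONTWAIT),
 * so a non-blocking caller gets EWOULDBLOCK back from sosendcheck() or
 * soreceive() instead of waiting for the socket buffer lock.
 */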
/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * Returns:	0			Success
 *	sblock:EWOULDBLOCK
 */
static int
sosendcheck(struct socket *so, struct sockaddr *addr, long resid, long clen,
    long atomic, int flags, int *sblocked)
{
    int error = 0;
    long space;
    int assumelock = 0;

restart:
    if (*sblocked == 0) {
        if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
            so->so_send_filt_thread != 0 &&
            so->so_send_filt_thread == current_thread()) {
            /*
             * We're being called recursively from a filter,
             * allow this to continue. Radar 4150520.
             * Don't set sblocked because we don't want
             * to perform an unlock later.
             */
            assumelock = 1;
        } else {
            error = sblock(&so->so_snd, SBLOCKWAIT(flags));
            if (error)
                return (error);
            *sblocked = 1;
        }
    }

    /*
     * If a send attempt is made on a previously-accepted socket
     * that has been marked as inactive (disconnected), reject
     * the request.
     */
    if (so->so_flags & SOF_DEFUNCT)
        return (ENOTCONN);

    if (so->so_state & SS_CANTSENDMORE)
        return (EPIPE);

    if (so->so_error) {
        error = so->so_error;
        so->so_error = 0;
        return (error);
    }

    if ((so->so_state & SS_ISCONNECTED) == 0) {
        if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
            if ((so->so_state & SS_ISCONFIRMING) == 0 &&
                !(resid == 0 && clen != 0))
                return (ENOTCONN);
        } else if (addr == 0 && !(flags&MSG_HOLD)) {
            return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
                ENOTCONN : EDESTADDRREQ);
        }
    }
    space = sbspace(&so->so_snd);
    if (flags & MSG_OOB)
        space += 1024;
    if ((atomic && resid > so->so_snd.sb_hiwat) ||
        clen > so->so_snd.sb_hiwat)
        return (EMSGSIZE);
    if (space < resid + clen &&
        (atomic || space < (long)so->so_snd.sb_lowat || space < clen)) {
        if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
            assumelock) {
            return (EWOULDBLOCK);
        }
        sbunlock(&so->so_snd, 1);
        error = sbwait(&so->so_snd);
        if (error)
            return (error);
        goto restart;
    }

    return (0);
}
/*
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not).  Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 *
 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
 *	point at the mbuf chain being constructed and go from there.
 *
 * Returns:	0			Success
 *	sosendcheck:EWOULDBLOCK
 *	sosendcheck:???			[value from so_error]
 *	<pru_send>:ECONNRESET[TCP]
 *	<pru_send>:EINVAL[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:EADDRINUSE[TCP]
 *	<pru_send>:EADDRNOTAVAIL[TCP]
 *	<pru_send>:EAFNOSUPPORT[TCP]
 *	<pru_send>:EACCES[TCP]
 *	<pru_send>:EAGAIN[TCP]
 *	<pru_send>:EPERM[TCP]
 *	<pru_send>:EMSGSIZE[TCP]
 *	<pru_send>:EHOSTUNREACH[TCP]
 *	<pru_send>:ENETUNREACH[TCP]
 *	<pru_send>:ENETDOWN[TCP]
 *	<pru_send>:ENOMEM[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_send>:EINVAL[AF_UNIX]
 *	<pru_send>:EOPNOTSUPP[AF_UNIX]
 *	<pru_send>:EPIPE[AF_UNIX]
 *	<pru_send>:ENOTCONN[AF_UNIX]
 *	<pru_send>:EISCONN[AF_UNIX]
 *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
 *	<sf_data_out>:???		[whatever a filter author chooses]
 *
 * Notes:	Other <pru_send> returns depend on the protocol family; all
 *		<sf_data_out> returns depend on what the filter author causes
 *		their filter to return.
 */
1468 sosend(struct socket
*so
, struct sockaddr
*addr
, struct uio
*uio
,
1469 struct mbuf
*top
, struct mbuf
*control
, int flags
)
1472 register struct mbuf
*m
, *freelist
= NULL
;
1473 register long space
, len
, resid
;
1474 int clen
= 0, error
, dontroute
, mlen
, sendflags
;
1475 int atomic
= sosendallatonce(so
) || top
;
1477 struct proc
*p
= current_proc();
1480 // LP64todo - fix this!
1481 resid
= uio_resid(uio
);
1483 resid
= top
->m_pkthdr
.len
;
1485 KERNEL_DEBUG((DBG_FNC_SOSEND
| DBG_FUNC_START
), so
, resid
,
1486 so
->so_snd
.sb_cc
, so
->so_snd
.sb_lowat
, so
->so_snd
.sb_hiwat
);
1489 if (so
->so_type
!= SOCK_STREAM
&& (flags
& MSG_OOB
) != 0) {
1491 socket_unlock(so
, 1);
	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid. On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
	 * type sockets since that's an error.
	 */
1505 if (resid
< 0 || (so
->so_type
== SOCK_STREAM
&& (flags
& MSG_EOR
))) {
1507 socket_unlock(so
, 1);
1512 (flags
& MSG_DONTROUTE
) && (so
->so_options
& SO_DONTROUTE
) == 0 &&
1513 (so
->so_proto
->pr_flags
& PR_ATOMIC
);
1515 OSIncrementAtomic(&p
->p_stats
->p_ru
.ru_msgsnd
);
1517 clen
= control
->m_len
;
1520 error
= sosendcheck(so
, addr
, resid
, clen
, atomic
, flags
,
1526 space
= sbspace(&so
->so_snd
) - clen
+ ((flags
& MSG_OOB
) ?
1530 struct socket_filter_entry
*filter
;
1532 boolean_t recursive
;
1536 * Data is prepackaged in "top".
1539 if (flags
& MSG_EOR
)
1540 top
->m_flags
|= M_EOR
;
1546 bytes_to_copy
= min(resid
, space
);
1548 if (sosendminchain
> 0) {
1551 chainlength
= sosendmaxchain
;
1555 * Attempt to use larger than system page-size
1556 * clusters for large writes only if there is
1557 * a jumbo cluster pool and if the socket is
1558 * marked accordingly.
1560 jumbocl
= sosendjcl
&& njcl
> 0 &&
1561 ((so
->so_flags
& SOF_MULTIPAGES
) ||
1562 sosendjcl_ignore_capab
);
1564 socket_unlock(so
, 0);
1568 int hdrs_needed
= (top
== 0) ? 1 : 0;
				/*
				 * try to maintain a local cache of mbuf
				 * clusters needed to complete this
				 * write.  the list is further limited to
				 * the number that are currently needed
				 * to fill the socket.  this mechanism
				 * allows a large number of mbufs/
				 * clusters to be grabbed under a single
				 * mbuf lock... if we can't get any
				 * clusters, then fall back to trying
				 * for mbufs.  if we fail early (or
				 * miscalculate the number needed), make
				 * sure to release any clusters we
				 * haven't yet consumed.
				 */
1585 if (freelist
== NULL
&&
1586 bytes_to_copy
> NBPG
&& jumbocl
) {
1588 bytes_to_copy
/ M16KCLBYTES
;
1590 if ((bytes_to_copy
-
1591 (num_needed
* M16KCLBYTES
))
1596 m_getpackets_internal(
1597 (unsigned int *)&num_needed
,
1598 hdrs_needed
, M_WAIT
, 0,
1601 * Fall back to 4K cluster size
1602 * if allocation failed
1606 if (freelist
== NULL
&&
1607 bytes_to_copy
> MCLBYTES
) {
1609 bytes_to_copy
/ NBPG
;
1611 if ((bytes_to_copy
-
1612 (num_needed
* NBPG
)) >=
1617 m_getpackets_internal(
1618 (unsigned int *)&num_needed
,
1619 hdrs_needed
, M_WAIT
, 0,
1622 * Fall back to cluster size
1623 * if allocation failed
1627 if (freelist
== NULL
&&
1628 bytes_to_copy
> MINCLSIZE
) {
1630 bytes_to_copy
/ MCLBYTES
;
1632 if ((bytes_to_copy
-
1633 (num_needed
* MCLBYTES
)) >=
1638 m_getpackets_internal(
1639 (unsigned int *)&num_needed
,
1640 hdrs_needed
, M_WAIT
, 0,
1643 * Fall back to a single mbuf
1644 * if allocation failed
1648 if (freelist
== NULL
) {
1656 if (freelist
== NULL
) {
1662 * For datagram protocols,
1663 * leave room for protocol
1664 * headers in first mbuf.
1666 if (atomic
&& top
== 0 &&
1667 bytes_to_copy
< MHLEN
) {
1673 freelist
= m
->m_next
;
1676 if ((m
->m_flags
& M_EXT
))
1677 mlen
= m
->m_ext
.ext_size
;
1678 else if ((m
->m_flags
& M_PKTHDR
))
1680 MHLEN
- m_leadingspace(m
);
1683 len
= min(mlen
, bytes_to_copy
);
1689 error
= uiomove(mtod(m
, caddr_t
),
1692 // LP64todo - fix this!
1693 resid
= uio_resid(uio
);
1697 top
->m_pkthdr
.len
+= len
;
1702 if (flags
& MSG_EOR
)
1703 top
->m_flags
|= M_EOR
;
1706 bytes_to_copy
= min(resid
, space
);
1708 } while (space
> 0 &&
1709 (chainlength
< sosendmaxchain
|| atomic
||
1710 resid
< MINCLSIZE
));
1718 if (flags
& (MSG_HOLD
|MSG_SEND
)) {
1719 /* Enqueue for later, go away if HOLD */
1720 register struct mbuf
*mb1
;
1721 if (so
->so_temp
&& (flags
& MSG_FLUSH
)) {
1722 m_freem(so
->so_temp
);
1726 so
->so_tail
->m_next
= top
;
1733 if (flags
& MSG_HOLD
) {
1740 so
->so_options
|= SO_DONTROUTE
;
1742 /* Compute flags here, for pru_send and NKEs */
1743 sendflags
= (flags
& MSG_OOB
) ? PRUS_OOB
:
1745 * If the user set MSG_EOF, the protocol
1746 * understands this flag and nothing left to
1747 * send then use PRU_SEND_EOF instead of PRU_SEND.
1749 ((flags
& MSG_EOF
) &&
1750 (so
->so_proto
->pr_flags
& PR_IMPLOPCL
) &&
1753 /* If there is more to send set PRUS_MORETOCOME */
1754 (resid
> 0 && space
> 0) ? PRUS_MORETOCOME
: 0;
1757 * Socket filter processing
1759 recursive
= (so
->so_send_filt_thread
!= NULL
);
1762 for (filter
= so
->so_filt
; filter
&& (error
== 0);
1763 filter
= filter
->sfe_next_onsocket
) {
1764 if (filter
->sfe_filter
->sf_filter
.sf_data_out
) {
1766 if (filtered
== 0) {
1768 so
->so_send_filt_thread
=
1771 socket_unlock(so
, 0);
1773 (sendflags
& MSG_OOB
) ?
1774 sock_data_filt_flag_oob
: 0;
1776 error
= filter
->sfe_filter
->sf_filter
.
1777 sf_data_out(filter
->sfe_cookie
, so
,
1778 addr
, &top
, &control
, so_flags
);
1784 * At this point, we've run at least one
1785 * filter. The socket is unlocked as is
1786 * the socket buffer. Clear the recorded
1787 * filter thread only when we are outside
1788 * of a filter's context. This allows for
1789 * a filter to issue multiple inject calls
1790 * from its sf_data_out callback routine.
1795 so
->so_send_filt_thread
= 0;
1797 if (error
== EJUSTRETURN
) {
1808 * End Socket filter processing
1811 if (error
== EJUSTRETURN
) {
1812 /* A socket filter handled this data */
1815 error
= (*so
->so_proto
->pr_usrreqs
->pru_send
)
1816 (so
, sendflags
, top
, addr
, control
, p
);
1819 if (flags
& MSG_SEND
)
1823 so
->so_options
&= ~SO_DONTROUTE
;
1831 } while (resid
&& space
> 0);
1836 sbunlock(&so
->so_snd
, 0); /* will unlock socket */
1838 socket_unlock(so
, 1);
1845 m_freem_list(freelist
);
1847 KERNEL_DEBUG(DBG_FNC_SOSEND
| DBG_FUNC_END
, so
, resid
, so
->so_snd
.sb_cc
,
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 *
 * Returns:	0			Success
 *	sblock:EWOULDBLOCK
 *	sodelayed_copy:EFAULT
 *	<pru_rcvoob>:EINVAL[TCP]
 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
 *	<pr_domain->dom_externalize>:???
 *
 * Notes:	Additional return values from calls through <pru_rcvoob> and
 *		<pr_domain->dom_externalize> depend on protocols other than
 *		TCP or AF_UNIX, which are documented above.
 */
1891 soreceive(struct socket
*so
, struct sockaddr
**psa
, struct uio
*uio
,
1892 struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
1894 register struct mbuf
*m
, **mp
, *ml
= NULL
;
1895 register int flags
, len
, error
, offset
;
1896 struct protosw
*pr
= so
->so_proto
;
1897 struct mbuf
*nextrecord
;
1899 // LP64todo - fix this!
1900 int orig_resid
= uio_resid(uio
);
1901 struct mbuf
*free_list
;
1902 int delayed_copy_len
;
1905 struct proc
*p
= current_proc();
1907 // LP64todo - fix this!
1908 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_START
, so
, uio_resid(uio
),
1909 so
->so_rcv
.sb_cc
, so
->so_rcv
.sb_lowat
, so
->so_rcv
.sb_hiwat
);
1913 #ifdef MORE_LOCKING_DEBUG
1914 if (so
->so_usecount
== 1)
1915 panic("soreceive: so=%x no other reference on socket\n", so
);
1923 flags
= *flagsp
&~ MSG_EOR
;
1928 * If a recv attempt is made on a previously-accepted socket
1929 * that has been marked as inactive (disconnected), reject
1932 if (so
->so_flags
& SOF_DEFUNCT
) {
1933 struct sockbuf
*sb
= &so
->so_rcv
;
1936 * This socket should have been disconnected and flushed
1937 * prior to being returned from accept; there should be
1938 * no data on its receive list, so panic otherwise.
1940 sb_empty_assert(sb
, __func__
);
1941 socket_unlock(so
, 1);
1946 * When SO_WANTOOBFLAG is set we try to get out-of-band data
1947 * regardless of the flags argument. Here is the case were
1948 * out-of-band data is not inline.
1950 if ((flags
& MSG_OOB
) ||
1951 ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
1952 (so
->so_options
& SO_OOBINLINE
) == 0 &&
1953 (so
->so_oobmark
|| (so
->so_state
& SS_RCVATMARK
)))) {
1954 m
= m_get(M_WAIT
, MT_DATA
);
1956 socket_unlock(so
, 1);
1957 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
,
1958 ENOBUFS
, 0, 0, 0, 0);
1961 error
= (*pr
->pr_usrreqs
->pru_rcvoob
)(so
, m
, flags
& MSG_PEEK
);
1964 socket_unlock(so
, 0);
1966 // LP64todo - fix this!
1967 error
= uiomove(mtod(m
, caddr_t
),
1968 (int)min(uio_resid(uio
), m
->m_len
), uio
);
1970 } while (uio_resid(uio
) && error
== 0 && m
);
1976 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0) {
1977 if (error
== EWOULDBLOCK
|| error
== EINVAL
) {
1979 * Let's try to get normal data:
1980 * EWOULDBLOCK: out-of-band data not
1981 * receive yet. EINVAL: out-of-band data
1986 } else if (error
== 0 && flagsp
) {
1990 socket_unlock(so
, 1);
1991 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
1998 *mp
= (struct mbuf
*)0;
1999 if (so
->so_state
& SS_ISCONFIRMING
&& uio_resid(uio
))
2000 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, 0);
2003 free_list
= (struct mbuf
*)0;
2004 delayed_copy_len
= 0;
2006 #ifdef MORE_LOCKING_DEBUG
2007 if (so
->so_usecount
<= 1)
2008 printf("soreceive: sblock so=%p ref=%d on socket\n",
2009 so
, so
->so_usecount
);
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 */
2023 if ((so
->so_state
& (SS_NOFDREF
| SS_CANTRCVMORE
)) ==
2024 (SS_NOFDREF
| SS_CANTRCVMORE
)) {
2025 socket_unlock(so
, 1);
2029 error
= sblock(&so
->so_rcv
, SBLOCKWAIT(flags
));
2031 socket_unlock(so
, 1);
2032 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2037 m
= so
->so_rcv
.sb_mb
;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
2049 if (m
== 0 || (((flags
& MSG_DONTWAIT
) == 0 &&
2050 so
->so_rcv
.sb_cc
< uio_resid(uio
)) &&
2051 (so
->so_rcv
.sb_cc
< so
->so_rcv
.sb_lowat
||
2052 ((flags
& MSG_WAITALL
) && uio_resid(uio
) <= so
->so_rcv
.sb_hiwat
)) &&
2053 m
->m_nextpkt
== 0 && (pr
->pr_flags
& PR_ATOMIC
) == 0)) {
2055 * Panic if we notice inconsistencies in the socket's
2056 * receive list; both sb_mb and sb_cc should correctly
2057 * reflect the contents of the list, otherwise we may
2058 * end up with false positives during select() or poll()
2059 * which could put the application in a bad state.
2061 if (m
== NULL
&& so
->so_rcv
.sb_cc
!= 0)
2062 panic("soreceive corrupted so_rcv: m %p cc %lu",
2063 m
, so
->so_rcv
.sb_cc
);
2068 error
= so
->so_error
;
2069 if ((flags
& MSG_PEEK
) == 0)
2073 if (so
->so_state
& SS_CANTRCVMORE
) {
2079 for (; m
; m
= m
->m_next
)
2080 if (m
->m_type
== MT_OOBDATA
|| (m
->m_flags
& M_EOR
)) {
2081 m
= so
->so_rcv
.sb_mb
;
2084 if ((so
->so_state
& (SS_ISCONNECTED
|SS_ISCONNECTING
)) == 0 &&
2085 (so
->so_proto
->pr_flags
& PR_CONNREQUIRED
)) {
2089 if (uio_resid(uio
) == 0)
2091 if ((so
->so_state
& SS_NBIO
) ||
2092 (flags
& (MSG_DONTWAIT
|MSG_NBIO
))) {
2093 error
= EWOULDBLOCK
;
2096 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 1");
2097 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 1");
2098 sbunlock(&so
->so_rcv
, 1);
2099 #if EVEN_MORE_LOCKING_DEBUG
2101 printf("Waiting for socket data\n");
2104 error
= sbwait(&so
->so_rcv
);
2105 #if EVEN_MORE_LOCKING_DEBUG
2107 printf("SORECEIVE - sbwait returned %d\n", error
);
2109 if (so
->so_usecount
< 1)
2110 panic("soreceive: after 2nd sblock so=%p ref=%d on "
2111 "socket\n", so
, so
->so_usecount
);
2113 socket_unlock(so
, 1);
2114 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, error
,
2123 uio
->uio_procp
->p_stats
->p_ru
.ru_msgrcv
++;
2124 #else /* __APPLE__ */
2127 * This should be uio->uio-procp; however, some callers of this
2128 * function use auto variables with stack garbage, and fail to
2129 * fill out the uio structure properly.
2132 OSIncrementAtomic(&p
->p_stats
->p_ru
.ru_msgrcv
);
2133 #endif /* __APPLE__ */
2134 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1");
2135 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1");
2136 nextrecord
= m
->m_nextpkt
;
2137 if ((pr
->pr_flags
& PR_ADDR
) && m
->m_type
== MT_SONAME
) {
2138 KASSERT(m
->m_type
== MT_SONAME
, ("receive 1a"));
2139 #if CONFIG_MACF_SOCKET_SUBSET
2141 * Call the MAC framework for policy checking if we're in
2142 * the user process context and the socket isn't connected.
2144 if (p
!= kernproc
&& !(so
->so_state
& SS_ISCONNECTED
)) {
2145 struct mbuf
*m0
= m
;
2147 * Dequeue this record (temporarily) from the receive
2148 * list since we're about to drop the socket's lock
2149 * where a new record may arrive and be appended to
2150 * the list. Upon MAC policy failure, the record
2151 * will be freed. Otherwise, we'll add it back to
2152 * the head of the list. We cannot rely on SB_LOCK
2153 * because append operation uses the socket's lock.
2156 m
->m_nextpkt
= NULL
;
2157 sbfree(&so
->so_rcv
, m
);
2159 } while (m
!= NULL
);
2161 so
->so_rcv
.sb_mb
= nextrecord
;
2162 SB_EMPTY_FIXUP(&so
->so_rcv
);
2163 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1a");
2164 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1a");
2165 socket_unlock(so
, 0);
2166 if (mac_socket_check_received(proc_ucred(p
), so
,
2167 mtod(m
, struct sockaddr
*)) != 0) {
2169 * MAC policy failure; free this record and
2170 * process the next record (or block until
2171 * one is available). We have adjusted sb_cc
2172 * and sb_mbcnt above so there is no need to
2173 * call sbfree() again.
2177 } while (m
!= NULL
);
2179 * Clear SB_LOCK but don't unlock the socket.
2180 * Process the next record or wait for one.
2183 sbunlock(&so
->so_rcv
, 1);
2188 * Re-adjust the socket receive list and re-enqueue
2189 * the record in front of any packets which may have
2190 * been appended while we dropped the lock.
2192 for (m
= m0
; m
->m_next
!= NULL
; m
= m
->m_next
)
2193 sballoc(&so
->so_rcv
, m
);
2194 sballoc(&so
->so_rcv
, m
);
2195 if (so
->so_rcv
.sb_mb
== NULL
) {
2196 so
->so_rcv
.sb_lastrecord
= m0
;
2197 so
->so_rcv
.sb_mbtail
= m
;
2200 nextrecord
= m
->m_nextpkt
= so
->so_rcv
.sb_mb
;
2201 so
->so_rcv
.sb_mb
= m
;
2202 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 1b");
2203 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 1b");
2205 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2208 *psa
= dup_sockaddr(mtod(m
, struct sockaddr
*),
2210 if ((*psa
== 0) && (flags
& MSG_NEEDSA
)) {
2211 error
= EWOULDBLOCK
;
2215 if (flags
& MSG_PEEK
) {
2218 sbfree(&so
->so_rcv
, m
);
2219 if (m
->m_next
== 0 && so
->so_rcv
.sb_cc
!= 0)
2220 panic("soreceive: about to create invalid "
2222 MFREE(m
, so
->so_rcv
.sb_mb
);
2223 m
= so
->so_rcv
.sb_mb
;
2225 m
->m_nextpkt
= nextrecord
;
2227 so
->so_rcv
.sb_mb
= nextrecord
;
2228 SB_EMPTY_FIXUP(&so
->so_rcv
);
2234 * Process one or more MT_CONTROL mbufs present before any data mbufs
2235 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2236 * just copy the data; if !MSG_PEEK, we call into the protocol to
2237 * perform externalization.
2239 if (m
!= NULL
&& m
->m_type
== MT_CONTROL
) {
2240 struct mbuf
*cm
= NULL
, *cmn
;
2241 struct mbuf
**cme
= &cm
;
2242 struct sockbuf
*sb_rcv
= &so
->so_rcv
;
2245 * Externalizing the control messages would require us to
2246 * drop the socket's lock below. Once we re-acquire the
2247 * lock, the mbuf chain might change. In order to preserve
2248 * consistency, we unlink all control messages from the
2249 * first mbuf chain in one shot and link them separately
2250 * onto a different chain.
2253 if (flags
& MSG_PEEK
) {
2254 if (controlp
!= NULL
) {
2255 *controlp
= m_copy(m
, 0, m
->m_len
);
2256 controlp
= &(*controlp
)->m_next
;
2260 m
->m_nextpkt
= NULL
;
2262 sb_rcv
->sb_mb
= m
->m_next
;
2265 cme
= &(*cme
)->m_next
;
2268 } while (m
!= NULL
&& m
->m_type
== MT_CONTROL
);
2270 if (!(flags
& MSG_PEEK
)) {
2271 if (sb_rcv
->sb_mb
!= NULL
) {
2272 sb_rcv
->sb_mb
->m_nextpkt
= nextrecord
;
2274 sb_rcv
->sb_mb
= nextrecord
;
2275 SB_EMPTY_FIXUP(sb_rcv
);
2277 if (nextrecord
== NULL
)
2278 sb_rcv
->sb_lastrecord
= m
;
2281 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive ctl");
2282 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive ctl");
2284 while (cm
!= NULL
) {
2289 cmsg_type
= mtod(cm
, struct cmsghdr
*)->cmsg_type
;
2292 * Call the protocol to externalize SCM_RIGHTS message
2293 * and return the modified message to the caller upon
2294 * success. Otherwise, all other control messages are
2295 * returned unmodified to the caller. Note that we
2296 * only get into this loop if MSG_PEEK is not set.
2298 if (pr
->pr_domain
->dom_externalize
!= NULL
&&
2299 cmsg_type
== SCM_RIGHTS
) {
2301 * Release socket lock: see 3903171. This
2302 * would also allow more records to be appended
2303 * to the socket buffer. We still have SB_LOCK
2304 * set on it, so we can be sure that the head
2305 * of the mbuf chain won't change.
2307 socket_unlock(so
, 0);
2308 error
= (*pr
->pr_domain
->dom_externalize
)(cm
);
2314 if (controlp
!= NULL
&& error
== 0) {
2316 controlp
= &(*controlp
)->m_next
;
2324 if (sb_rcv
->sb_mb
!= NULL
)
2325 nextrecord
= sb_rcv
->sb_mb
->m_nextpkt
;
2331 if (!(flags
& MSG_PEEK
)) {
2333 * We get here because m points to an mbuf following
2334 * any MT_SONAME or MT_CONTROL mbufs which have been
2335 * processed above. In any case, m should be pointing
2336 * to the head of the mbuf chain, and the nextrecord
2337 * should be either NULL or equal to m->m_nextpkt.
2338 * See comments above about SB_LOCK.
2340 if (m
!= so
->so_rcv
.sb_mb
|| m
->m_nextpkt
!= nextrecord
)
2341 panic("soreceive: post-control !sync so=%p "
2342 "m=%p nextrecord=%p\n", so
, m
, nextrecord
);
2344 if (nextrecord
== NULL
)
2345 so
->so_rcv
.sb_lastrecord
= m
;
2348 if (type
== MT_OOBDATA
)
2351 if (!(flags
& MSG_PEEK
)) {
2352 so
->so_rcv
.sb_mb
= nextrecord
;
2353 SB_EMPTY_FIXUP(&so
->so_rcv
);
2356 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 2");
2357 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 2");
2362 if (!(flags
& MSG_PEEK
) && uio_resid(uio
) > sorecvmincopy
)
2369 while (m
&& (uio_resid(uio
) - delayed_copy_len
) > 0 && error
== 0) {
2370 if (m
->m_type
== MT_OOBDATA
) {
2371 if (type
!= MT_OOBDATA
)
2373 } else if (type
== MT_OOBDATA
) {
		/*
		 * Make sure to always set MSG_OOB event when getting
		 * out of band data inline.
		 */
2380 if ((so
->so_options
& SO_WANTOOBFLAG
) != 0 &&
2381 (so
->so_options
& SO_OOBINLINE
) != 0 &&
2382 (so
->so_state
& SS_RCVATMARK
) != 0) {
2385 so
->so_state
&= ~SS_RCVATMARK
;
2386 // LP64todo - fix this!
2387 len
= uio_resid(uio
) - delayed_copy_len
;
2388 if (so
->so_oobmark
&& len
> so
->so_oobmark
- offset
)
2389 len
= so
->so_oobmark
- offset
;
2390 if (len
> m
->m_len
- moff
)
2391 len
= m
->m_len
- moff
;
2393 * If mp is set, just pass back the mbufs.
2394 * Otherwise copy them out via the uio, then free.
2395 * Sockbuf must be consistent here (points to current mbuf,
2396 * it points to next record) when we drop priority;
2397 * we must note any additions to the sockbuf when we
2398 * block interrupts again.
2401 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive uiomove");
2402 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive uiomove");
2403 if (can_delay
&& len
== m
->m_len
) {
				/*
				 * only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it worthwhile
				 * to drop and retake the lock... can_delay
				 * reflects the state of the 2 latter
				 * constraints; moff should always be zero
				 */
2413 delayed_copy_len
+= len
;
2415 if (delayed_copy_len
) {
2416 error
= sodelayed_copy(so
, uio
,
2417 &free_list
, &delayed_copy_len
);
2423 * can only get here if MSG_PEEK is not
2424 * set therefore, m should point at the
2425 * head of the rcv queue; if it doesn't,
2426 * it means something drastically
2427 * changed while we were out from behind
2428 * the lock in sodelayed_copy. perhaps
2429 * a RST on the stream. in any event,
2430 * the stream has been interrupted. it's
2431 * probably best just to return whatever
2432 * data we've moved and let the caller
2435 if (m
!= so
->so_rcv
.sb_mb
) {
2439 socket_unlock(so
, 0);
2440 error
= uiomove(mtod(m
, caddr_t
) + moff
,
2448 uio_setresid(uio
, (uio_resid(uio
) - len
));
2450 if (len
== m
->m_len
- moff
) {
2451 if (m
->m_flags
& M_EOR
)
2453 if (flags
& MSG_PEEK
) {
2457 nextrecord
= m
->m_nextpkt
;
2458 sbfree(&so
->so_rcv
, m
);
2459 m
->m_nextpkt
= NULL
;
2464 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
2465 *mp
= (struct mbuf
*)0;
2467 if (free_list
== NULL
)
2472 so
->so_rcv
.sb_mb
= m
= m
->m_next
;
2476 m
->m_nextpkt
= nextrecord
;
2477 if (nextrecord
== NULL
)
2478 so
->so_rcv
.sb_lastrecord
= m
;
2480 so
->so_rcv
.sb_mb
= nextrecord
;
2481 SB_EMPTY_FIXUP(&so
->so_rcv
);
2483 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 3");
2484 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 3");
2487 if (flags
& MSG_PEEK
) {
2491 *mp
= m_copym(m
, 0, len
, M_WAIT
);
2494 so
->so_rcv
.sb_cc
-= len
;
2497 if (so
->so_oobmark
) {
2498 if ((flags
& MSG_PEEK
) == 0) {
2499 so
->so_oobmark
-= len
;
2500 if (so
->so_oobmark
== 0) {
2501 so
->so_state
|= SS_RCVATMARK
;
2503 * delay posting the actual event until
2504 * after any delayed copy processing
2512 if (offset
== so
->so_oobmark
)
2516 if (flags
& MSG_EOR
)
2519 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2520 * (for non-atomic socket), we must not quit until
2521 * "uio->uio_resid == 0" or an error termination.
2522 * If a signal/timeout occurs, return with a short
2523 * count but without error. Keep sockbuf locked
2524 * against other readers.
2526 while (flags
& (MSG_WAITALL
|MSG_WAITSTREAM
) && m
== 0 &&
2527 (uio_resid(uio
) - delayed_copy_len
) > 0 &&
2528 !sosendallatonce(so
) && !nextrecord
) {
2529 if (so
->so_error
|| so
->so_state
& SS_CANTRCVMORE
)
2533 * Depending on the protocol (e.g. TCP), the following
2534 * might cause the socket lock to be dropped and later
2535 * be reacquired, and more data could have arrived and
2536 * have been appended to the receive socket buffer by
2537 * the time it returns. Therefore, we only sleep in
2538 * sbwait() below if and only if the socket buffer is
2539 * empty, in order to avoid a false sleep.
2541 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
&&
2542 (((struct inpcb
*)so
->so_pcb
)->inp_state
!=
2544 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
2546 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive sbwait 2");
2547 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive sbwait 2");
2549 if (so
->so_rcv
.sb_mb
== NULL
&& sbwait(&so
->so_rcv
)) {
2554 * have to wait until after we get back from the sbwait
2555 * to do the copy because we will drop the lock if we
2556 * have enough data that has been delayed... by dropping
2557 * the lock we open up a window allowing the netisr
2558 * thread to process the incoming packets and to change
2559 * the state of this socket... we're issuing the sbwait
2560 * because the socket is empty and we're expecting the
2561 * netisr thread to wake us up when more packets arrive;
2562 * if we allow that processing to happen and then sbwait
2563 * we could stall forever with packets sitting in the
2564 * socket if no further packets arrive from the remote
2567 * we want to copy before we've collected all the data
2568 * to satisfy this request to allow the copy to overlap
2569 * the incoming packet processing on an MP system
2571 if (delayed_copy_len
> sorecvmincopy
&&
2572 (delayed_copy_len
> (so
->so_rcv
.sb_hiwat
/ 2))) {
2573 error
= sodelayed_copy(so
, uio
,
2574 &free_list
, &delayed_copy_len
);
2579 m
= so
->so_rcv
.sb_mb
;
2581 nextrecord
= m
->m_nextpkt
;
2585 #ifdef MORE_LOCKING_DEBUG
2586 if (so
->so_usecount
<= 1)
2587 panic("soreceive: after big while so=%p ref=%d on socket\n",
2588 so
, so
->so_usecount
);
2591 if (m
&& pr
->pr_flags
& PR_ATOMIC
) {
2593 if (so
->so_options
& SO_DONTTRUNC
) {
2594 flags
|= MSG_RCVMORE
;
2598 if ((flags
& MSG_PEEK
) == 0)
2599 (void) sbdroprecord(&so
->so_rcv
);
2606 * pru_rcvd below (for TCP) may cause more data to be received
2607 * if the socket lock is dropped prior to sending the ACK; some
2608 * legacy OpenTransport applications don't handle this well
2609 * (if it receives less data than requested while MSG_HAVEMORE
2610 * is set), and so we set the flag now based on what we know
2611 * prior to calling pru_rcvd.
2613 if ((so
->so_options
& SO_WANTMORE
) && so
->so_rcv
.sb_cc
> 0)
2614 flags
|= MSG_HAVEMORE
;
2616 if ((flags
& MSG_PEEK
) == 0) {
2618 so
->so_rcv
.sb_mb
= nextrecord
;
2620 * First part is an inline SB_EMPTY_FIXUP(). Second
2621 * part makes sure sb_lastrecord is up-to-date if
2622 * there is still data in the socket buffer.
2624 if (so
->so_rcv
.sb_mb
== NULL
) {
2625 so
->so_rcv
.sb_mbtail
= NULL
;
2626 so
->so_rcv
.sb_lastrecord
= NULL
;
2627 } else if (nextrecord
->m_nextpkt
== NULL
) {
2628 so
->so_rcv
.sb_lastrecord
= nextrecord
;
2631 SBLASTRECORDCHK(&so
->so_rcv
, "soreceive 4");
2632 SBLASTMBUFCHK(&so
->so_rcv
, "soreceive 4");
2633 if (pr
->pr_flags
& PR_WANTRCVD
&& so
->so_pcb
)
2634 (*pr
->pr_usrreqs
->pru_rcvd
)(so
, flags
);
2637 if (delayed_copy_len
) {
2638 error
= sodelayed_copy(so
, uio
, &free_list
, &delayed_copy_len
);
2644 m_freem_list((struct mbuf
*)free_list
);
2645 free_list
= (struct mbuf
*)0;
2648 postevent(so
, 0, EV_OOB
);
2650 if (orig_resid
== uio_resid(uio
) && orig_resid
&&
2651 (flags
& MSG_EOR
) == 0 && (so
->so_state
& SS_CANTRCVMORE
) == 0) {
2652 sbunlock(&so
->so_rcv
, 1);
2659 #ifdef MORE_LOCKING_DEBUG
2660 if (so
->so_usecount
<= 1)
2661 panic("soreceive: release so=%p ref=%d on socket\n",
2662 so
, so
->so_usecount
);
2664 if (delayed_copy_len
) {
2665 error
= sodelayed_copy(so
, uio
, &free_list
, &delayed_copy_len
);
2668 m_freem_list((struct mbuf
*)free_list
);
2670 sbunlock(&so
->so_rcv
, 0); /* will unlock socket */
2672 // LP64todo - fix this!
2673 KERNEL_DEBUG(DBG_FNC_SORECEIVE
| DBG_FUNC_END
, so
, uio_resid(uio
),
2674 so
->so_rcv
.sb_cc
, 0, error
);
/*
 * Returns:	0			Success
 */
sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,

	socket_unlock(so, 0);

	while (m && error == 0) {

		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);

	m_freem_list(*free_list);

	*free_list = (struct mbuf *)NULL;
/*
 * Returns:	0			Success
 *
 *	<pru_shutdown>:EINVAL
 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:EHOSTUNREACH[TCP]
 *	<pru_shutdown>:ENETUNREACH[TCP]
 *	<pru_shutdown>:ENETDOWN[TCP]
 *	<pru_shutdown>:ENOMEM[TCP]
 *	<pru_shutdown>:EACCES[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:???[TCP]	[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_shutdown>:???	[other protocol families]
 */
soshutdown(struct socket *so, int how)

	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {

		error = soshutdownlock(so, how);

	socket_unlock(so, 1);

soshutdownlock(struct socket *so, int how)

	struct protosw *pr = so->so_proto;

	sflt_notify(so, sock_evt_shutdown, &how);

	if (how != SHUT_WR) {
		if ((so->so_state & SS_CANTRCVMORE) != 0) {
			/* read already shut down */

		postevent(so, 0, EV_RCLOSED);

	if (how != SHUT_RD) {
		if ((so->so_state & SS_CANTSENDMORE) != 0) {
			/* write already shut down */

		error = (*pr->pr_usrreqs->pru_shutdown)(so);
		postevent(so, 0, EV_WCLOSED);

	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
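
/*
 * A minimal user-space sketch (example only, never compiled here): a
 * connected stream socket reaches soshutdownlock() through the shutdown(2)
 * system call.  SHUT_WR stops further sends while the receive side stays
 * readable until the peer's EOF; `fd` is assumed to be a connected socket.
 */
#if 0	/* illustrative example, not part of the kernel build */
#include <sys/socket.h>
#include <unistd.h>

static void
half_close(int fd)
{
	char buf[512];
	ssize_t n;

	(void) shutdown(fd, SHUT_WR);		/* no more writes; peer sees EOF */
	while ((n = read(fd, buf, sizeof (buf))) > 0)
		;				/* drain whatever the peer still sends */
	(void) close(fd);
}
#endif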
sorflush(struct socket *so)

	register struct sockbuf *sb = &so->so_rcv;
	register struct protosw *pr = so->so_proto;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);

		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	sflt_notify(so, sock_evt_flush_read, NULL);

	sb->sb_flags |= SB_NOINTR;
	(void) sblock(sb, M_WAIT);

	selthreadclear(&sb->sb_sel);

	bzero((caddr_t)sb, sizeof (*sb));
	sb->sb_so = so;	/* reestablish link to socket */
	if (asb.sb_flags & SB_KNOTE) {
		sb->sb_sel.si_note = asb.sb_sel.si_note;
		sb->sb_flags = SB_KNOTE;

	if (asb.sb_flags & SB_DROP)
		sb->sb_flags |= SB_DROP;
	if (asb.sb_flags & SB_UNIX)
		sb->sb_flags |= SB_UNIX;
	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		boolean_t unp = (pr->pr_domain->dom_dispose == unp_dispose);
		/*
		 * Currently AF_UNIX domain uses a global domain mutex;
		 * unp_dispose() may end up calling soclose() on another
		 * AF_UNIX socket and therefore the lock must not be held
		 */
		socket_unlock(so, 0);
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Returns:	0			Success
 */
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)

		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_p != 0)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
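
/*
 * A minimal sketch of how a protocol-level pr_ctloutput() routine typically
 * uses sooptcopyin() (example only, not part of the kernel build; the handler
 * name and the TTL semantics below are hypothetical, not taken from this file).
 */
#if 0	/* illustrative example */
static int
example_ctloutput_set_ttl(struct socket *so, struct sockopt *sopt)
{
	int optval, error;

	/* require at least sizeof (int); any extra user bytes are ignored */
	error = sooptcopyin(sopt, &optval, sizeof (optval), sizeof (optval));
	if (error != 0)
		return (error);
	if (optval < 0 || optval > 255)
		return (EINVAL);
	/* ... store optval in the protocol control block ... */
	return (0);
}
#endif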
/*
 * sooptcopyin_timeval
 *	Copy in a timeval value into tv_p, and take into account whether the
 *	calling process is 64-bit or 32-bit.  Moved the sanity checking
 *	code here so that we can verify the 64-bit tv_sec value before we lose
 *	the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
 */
sooptcopyin_timeval(struct sockopt *sopt, struct timeval * tv_p)

	if (proc_is64bit(sopt->sopt_p)) {
		struct timeval64	tv64;

		if (sopt->sopt_valsize < sizeof(tv64)) {

		sopt->sopt_valsize = sizeof(tv64);
		error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));

		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX
		    || tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {

		tv_p->tv_sec = tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;

		if (sopt->sopt_valsize < sizeof(*tv_p)) {

		sopt->sopt_valsize = sizeof(*tv_p);
		if (sopt->sopt_p != 0) {
			error = copyin(sopt->sopt_val, tv_p, sizeof(*tv_p));

			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), tv_p,

		if (tv_p->tv_sec < 0 || tv_p->tv_sec > LONG_MAX
		    || tv_p->tv_usec < 0 || tv_p->tv_usec >= 1000000) {
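
/*
 * A minimal user-space sketch (example only, not part of the kernel build):
 * the timeval validated above arrives via setsockopt(2) with SO_RCVTIMEO or
 * SO_SNDTIMEO; a tv_usec outside [0, 1000000) is rejected (EDOM, per the
 * sosetopt() return list below).
 */
#if 0	/* illustrative example */
#include <sys/socket.h>
#include <sys/time.h>

static int
set_recv_timeout(int fd, long seconds)
{
	struct timeval tv = { .tv_sec = seconds, .tv_usec = 0 };

	/* tv_usec must stay in range or the kernel refuses the option */
	return (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)));
}
#endif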
/*
 * Returns:	0			Success
 *
 *	sooptcopyin:EINVAL
 *	sooptcopyin:EFAULT
 *	sooptcopyin_timeval:EINVAL
 *	sooptcopyin_timeval:EFAULT
 *	sooptcopyin_timeval:EDOM
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	sflt_attach_private:???	[whatever a filter author chooses]
 *	<sf_setoption>:???	[whatever a filter author chooses]
 *
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
 *		<sf_setoption> returns depend on what the filter author causes
 *		their filter to return.
 */
sosetopt(struct socket *so, struct sockopt *sopt)

	struct socket_filter_entry *filter;

#if CONFIG_MACF_SOCKET
#endif /* MAC_SOCKET */

	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE))
	    == (SS_CANTRCVMORE | SS_CANTSENDMORE)) {
		/* the socket has been shutdown, no more sockopt's */

	if (sopt->sopt_dir != SOPT_SET) {
		sopt->sopt_dir = SOPT_SET;

	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_setoption) {
			if (filtered == 0) {

				socket_unlock(so, 0);

			error = filter->sfe_filter->sf_filter.
			    sf_setoption(filter->sfe_cookie, so, sopt);

	if (filtered != 0) {

		if (error == EJUSTRETURN)

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			socket_unlock(so, 1);

		error = ENOPROTOOPT;

	switch (sopt->sopt_name) {

		error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));

		so->so_linger = (sopt->sopt_name == SO_LINGER) ?
		    l.l_linger : l.l_linger * hz;

			so->so_options |= SO_LINGER;

			so->so_options &= ~SO_LINGER;

	case SO_USELOOPBACK:
	case SO_WANTOOBFLAG:
		error = sooptcopyin(sopt, &optval, sizeof (optval),

			so->so_options |= sopt->sopt_name;

			so->so_options &= ~sopt->sopt_name;

		error = sooptcopyin(sopt, &optval, sizeof (optval),

		/*
		 * Values < 1 make no sense for any of these
		 * options, so disallow them.
		 */

		switch (sopt->sopt_name) {

			if (sbreserve(sopt->sopt_name == SO_SNDBUF ?
			    &so->so_snd : &so->so_rcv,
			    (u_long) optval) == 0) {

			if (sopt->sopt_name == SO_SNDBUF)
				so->so_snd.sb_flags |= SB_USRSIZE;

				so->so_rcv.sb_flags |= SB_USRSIZE;

			/*
			 * Make sure the low-water is never greater than
			 */
			so->so_snd.sb_lowat =
			    (optval > so->so_snd.sb_hiwat) ?
			    so->so_snd.sb_hiwat : optval;

			so->so_rcv.sb_lowat =
			    (optval > so->so_rcv.sb_hiwat) ?
			    so->so_rcv.sb_hiwat : optval;

		error = sooptcopyin_timeval(sopt, &tv);

		switch (sopt->sopt_name) {

			so->so_snd.sb_timeo = tv;

			so->so_rcv.sb_timeo = tv;

		error = sooptcopyin(sopt, &nke, sizeof (nke),

		error = sflt_attach_private(so, NULL,

		error = sooptcopyin(sopt, &optval, sizeof (optval),

			so->so_flags |= SOF_NOSIGPIPE;

			so->so_flags &= ~SOF_NOSIGPIPE;

		error = sooptcopyin(sopt, &optval, sizeof (optval),

			so->so_flags |= SOF_NOADDRAVAIL;

			so->so_flags &= ~SOF_NOADDRAVAIL;

	case SO_REUSESHAREUID:
		error = sooptcopyin(sopt, &optval, sizeof (optval),

			so->so_flags |= SOF_REUSESHAREUID;

			so->so_flags &= ~SOF_REUSESHAREUID;

#ifdef __APPLE_API_PRIVATE
	case SO_NOTIFYCONFLICT:
		if (kauth_cred_issuser(kauth_cred_get()) == 0) {

		error = sooptcopyin(sopt, &optval, sizeof (optval),

			so->so_flags |= SOF_NOTIFYCONFLICT;

			so->so_flags &= ~SOF_NOTIFYCONFLICT;

	case SO_RESTRICTIONS:
		if (kauth_cred_issuser(kauth_cred_get()) == 0) {

		error = sooptcopyin(sopt, &optval, sizeof (optval),

		so->so_restrictions = (optval & (SO_RESTRICT_DENYIN |
		    SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET));

#if CONFIG_MACF_SOCKET
		if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
		    sizeof (extmac))) != 0)

		error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),

#endif /* MAC_SOCKET */

		error = ENOPROTOOPT;

	if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) {
		(void) ((*so->so_proto->pr_ctloutput)(so, sopt));

	socket_unlock(so, 1);
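
/*
 * A minimal user-space sketch (example only, not part of the kernel build):
 * the SO_LINGER case above is reached via setsockopt(2).  With l_onoff set,
 * close(2) waits up to l_linger for unsent data to drain (or resets the
 * connection if l_linger is 0); `fd` is assumed to be a connected socket.
 */
#if 0	/* illustrative example */
#include <sys/socket.h>

static int
enable_linger(int fd, int seconds)
{
	struct linger l = { .l_onoff = 1, .l_linger = seconds };

	return (setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof (l)));
}
#endif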
/* Helper routines for getsockopt */
sooptcopyout(struct sockopt *sopt, void *buf, size_t len)

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != 0)
			error = copyout(buf, sopt->sopt_val, valsize);

			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
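
/*
 * A minimal user-space sketch (example only, not part of the kernel build):
 * because sooptcopyout() silently truncates to the caller's buffer and
 * reports only what was copied, getsockopt(2) callers should size the buffer
 * for the full option value.
 */
#if 0	/* illustrative example */
#include <sys/socket.h>

static int
query_send_space(int fd, int *bytes)
{
	socklen_t len = sizeof (*bytes);	/* too-small buffers are truncated, not rejected */

	return (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, bytes, &len));
}
#endif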
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval * tv_p)

	struct timeval64	tv64;

	if (proc_is64bit(sopt->sopt_p)) {
		len = sizeof(struct timeval64);
		tv64.tv_sec = tv_p->tv_sec;
		tv64.tv_usec = tv_p->tv_usec;

		len = sizeof(struct timeval);

	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != 0)
			error = copyout(val, sopt->sopt_val, valsize);

			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
/*
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	<sf_getoption>:???
 */
sogetopt(struct socket *so, struct sockopt *sopt)

	struct socket_filter_entry *filter;

#if CONFIG_MACF_SOCKET
#endif /* MAC_SOCKET */

	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;

	for (filter = so->so_filt; filter && (error == 0);
	    filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_getoption) {
			if (filtered == 0) {

				socket_unlock(so, 0);

			error = filter->sfe_filter->sf_filter.
			    sf_getoption(filter->sfe_cookie, so, sopt);

	if (filtered != 0) {

		if (error == EJUSTRETURN)

			socket_unlock(so, 1);

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto && so->so_proto->pr_ctloutput) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			socket_unlock(so, 1);

		socket_unlock(so, 1);
		return (ENOPROTOOPT);

	switch (sopt->sopt_name) {

		l.l_onoff = so->so_options & SO_LINGER;
		l.l_linger = (sopt->sopt_name == SO_LINGER) ?
		    so->so_linger : so->so_linger / hz;
		error = sooptcopyout(sopt, &l, sizeof (l));

	case SO_USELOOPBACK:
	case SO_WANTOOBFLAG:
		optval = so->so_options & sopt->sopt_name;

		error = sooptcopyout(sopt, &optval, sizeof (optval));

		optval = so->so_type;

		if (so->so_proto->pr_flags & PR_ATOMIC) {

			m1 = so->so_rcv.sb_mb;

				if (m1->m_type == MT_DATA || m1->m_type == MT_HEADER ||
				    m1->m_type == MT_OOBDATA)
					pkt_total += m1->m_len;

			optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

		optval = so->so_snd.sb_cc;

		optval = so->so_error;

		optval = so->so_snd.sb_hiwat;

		optval = so->so_rcv.sb_hiwat;

		optval = so->so_snd.sb_lowat;

		optval = so->so_rcv.sb_lowat;

		tv = (sopt->sopt_name == SO_SNDTIMEO ?
		    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

		error = sooptcopyout_timeval(sopt, &tv);

		optval = (so->so_flags & SOF_NOSIGPIPE);

		optval = (so->so_flags & SOF_NOADDRAVAIL);

	case SO_REUSESHAREUID:
		optval = (so->so_flags & SOF_REUSESHAREUID);

#ifdef __APPLE_API_PRIVATE
	case SO_NOTIFYCONFLICT:
		optval = (so->so_flags & SOF_NOTIFYCONFLICT);

	case SO_RESTRICTIONS:
		optval = so->so_restrictions & (SO_RESTRICT_DENYIN |
		    SO_RESTRICT_DENYOUT | SO_RESTRICT_DENYSET);

#if CONFIG_MACF_SOCKET
		if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
		    sizeof (extmac))) != 0 ||
		    (error = mac_socket_label_get(proc_ucred(
		    sopt->sopt_p), so, &extmac)) != 0)

		error = sooptcopyout(sopt, &extmac, sizeof (extmac));

#endif /* MAC_SOCKET */

#if CONFIG_MACF_SOCKET
		if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
		    sizeof (extmac))) != 0 ||
		    (error = mac_socketpeer_label_get(proc_ucred(
		    sopt->sopt_p), so, &extmac)) != 0)

		error = sooptcopyout(sopt, &extmac, sizeof (extmac));

#endif /* MAC_SOCKET */

		error = ENOPROTOOPT;

	socket_unlock(so, 1);
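
/*
 * A minimal user-space sketch (example only, not part of the kernel build):
 * the SO_NREAD computation above (size of the first datagram for PR_ATOMIC
 * sockets, sb_cc - sb_ctl otherwise) is what a Darwin getsockopt(2) caller
 * observes.
 */
#if 0	/* illustrative example */
#include <sys/socket.h>

static int
bytes_ready(int fd, int *nread)
{
	socklen_t len = sizeof (*nread);

	return (getsockopt(fd, SOL_SOCKET, SO_NREAD, nread, &len));
}
#endif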
/* XXX; prepare mbuf for (__FreeBSD__ < 3) routines. */
soopt_getm(struct sockopt *sopt, struct mbuf **mp)

	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;

	if (sopt_size > MAX_SOOPTGETM_SIZE)

	MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);

	if (sopt_size > MLEN) {
		MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
		if ((m->m_flags & M_EXT) == 0) {

		m->m_len = min(MCLBYTES, sopt_size);

		m->m_len = min(MLEN, sopt_size);

	sopt_size -= m->m_len;

		MGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT, MT_DATA);

		if (sopt_size > MLEN) {
			MCLGET(m, sopt->sopt_p ? M_WAIT : M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {

			m->m_len = min(MCLBYTES, sopt_size);

			m->m_len = min(MLEN, sopt_size);

		sopt_size -= m->m_len;
/* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)

	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL)

	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != NULL) {

			error = copyin(sopt->sopt_val, mtod(m, char *),

			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
			    mtod(m, char *), m->m_len);

		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;

	if (m != NULL) /* should have been allocated large enough at ip6_sooptmcopyin() */
		panic("soopt_mcopyin");
/* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)

	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL)

	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != NULL) {

			error = copyout(mtod(m, char *), sopt->sopt_val,

			bcopy(mtod(m, char *),
			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);

		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;

		/* enough soopt buffer should be given from user-land */

	sopt->sopt_valsize = valsize;
sohasoutofband(struct socket *so)

	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0)
		proc_signal(so->so_pgid, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);
sopoll(struct socket *so, int events, __unused kauth_cred_t cred, void * wql)

	struct proc *p = current_proc();

	if (events & (POLLIN | POLLRDNORM))

			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
		/*
		 * Darwin sets the flag first,
		 * BSD calls selrecord first
		 */
		so->so_rcv.sb_flags |= SB_SEL;
		selrecord(p, &so->so_rcv.sb_sel, wql);

	if (events & (POLLOUT | POLLWRNORM)) {
		/*
		 * Darwin sets the flag first,
		 * BSD calls selrecord first
		 */
		so->so_snd.sb_flags |= SB_SEL;
		selrecord(p, &so->so_snd.sb_sel, wql);

	socket_unlock(so, 1);
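
/*
 * A minimal user-space sketch (example only, not part of the kernel build):
 * sopoll() is reached from poll(2)/select(2); POLLPRI reports pending
 * out-of-band data per the so_oobmark / SS_RCVATMARK test above.
 */
#if 0	/* illustrative example */
#include <poll.h>

static int
wait_readable(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };

	return (poll(&pfd, 1, timeout_ms));	/* > 0 when readable or at the OOB mark */
}
#endif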
soo_kqfilter(__unused struct fileproc *fp, struct knote *kn,
    __unused struct proc *p)

	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

#if CONFIG_MACF_SOCKET
	if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
		socket_unlock(so, 1);

#endif /* MAC_SOCKET */

	switch (kn->kn_filter) {

		if (so->so_options & SO_ACCEPTCONN)
			kn->kn_fop = &solisten_filtops;

			kn->kn_fop = &soread_filtops;

		kn->kn_fop = &sowrite_filtops;

		socket_unlock(so, 1);

	if (KNOTE_ATTACH(&sb->sb_sel.si_note, kn))
		sb->sb_flags |= SB_KNOTE;
	socket_unlock(so, 1);
filt_sordetach(struct knote *kn)

	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if (so->so_rcv.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
			so->so_rcv.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);
filt_soread(struct knote *kn, long hint)

	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)

	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (so->so_oobmark) {
		if (kn->kn_flags & EV_OOBAND) {
			kn->kn_data -= so->so_oobmark;
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);

		kn->kn_data = so->so_oobmark;
		kn->kn_flags |= EV_OOBAND;

		if (so->so_state & SS_CANTRCVMORE) {
			kn->kn_flags |= EV_EOF;
			kn->kn_fflags = so->so_error;
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);

	if (so->so_state & SS_RCVATMARK) {
		if (kn->kn_flags & EV_OOBAND) {
			if ((hint & SO_FILT_HINT_LOCKED) == 0)
				socket_unlock(so, 1);

		kn->kn_flags |= EV_OOBAND;
	} else if (kn->kn_flags & EV_OOBAND) {

		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	if (so->so_error) {	/* temporary udp error */
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);

	return ((kn->kn_flags & EV_OOBAND) ||
	    kn->kn_data >= ((kn->kn_sfflags & NOTE_LOWAT) ?
	    kn->kn_sdata : so->so_rcv.sb_lowat));
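
/*
 * A minimal user-space sketch (example only, not part of the kernel build):
 * the final test in filt_soread() honours NOTE_LOWAT, so a kevent(2) caller
 * can require a minimum number of buffered bytes before the knote fires.
 */
#if 0	/* illustrative example */
#include <sys/event.h>

static int
watch_read_lowat(int kq, int fd, int min_bytes)
{
	struct kevent kev;

	/* fire only once at least min_bytes are queued in so_rcv */
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, min_bytes, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}
#endif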
filt_sowdetach(struct knote *kn)

	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if (so->so_snd.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
			so->so_snd.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);
filt_sowrite(struct knote *kn, long hint)

	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)

	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	if (so->so_error) {	/* temporary udp error */
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	if (((so->so_state & SS_ISCONNECTED) == 0) &&
	    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
		if ((hint & SO_FILT_HINT_LOCKED) == 0)
			socket_unlock(so, 1);

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= kn->kn_sdata);
	return (kn->kn_data >= so->so_snd.sb_lowat);
filt_solisten(struct knote *kn, long hint)

	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)

	kn->kn_data = so->so_qlen;
	isempty = ! TAILQ_EMPTY(&so->so_comp);
	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);
socket_lock(struct socket *so, int refcount)

	int error = 0, lr_saved;

	lr_saved = (unsigned int) __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);

#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);

		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);

		so->lock_lr[so->next_lock_lr] = (u_int32_t)lr_saved;
		so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
socket_unlock(struct socket *so, int refcount)

	int error = 0, lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = (unsigned int) __builtin_return_address(0);

	if (so->so_proto == NULL)
		panic("socket_unlock null so_proto so=%p\n", so);

	if (so && so->so_proto->pr_unlock) {
		error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);

		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

		so->unlock_lr[so->next_unlock_lr] = (u_int32_t)lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;

		if (so->so_usecount <= 0)
			panic("socket_unlock: bad refcount so=%p "
			    "value=%d\n", so, so->so_usecount);

		if (so->so_usecount == 0) {
			sofreelastref(so, 1);

		lck_mtx_unlock(mutex_held);
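
/*
 * A minimal in-kernel sketch (example only, not part of the kernel build;
 * the helper name is hypothetical): a typical consumer brackets work on a
 * socket with socket_lock(so, 1) / socket_unlock(so, 1) so the use count
 * taken at lock time is dropped at unlock time, and the last reference is
 * released through sofreelastref() as shown above.
 */
#if 0	/* illustrative example */
static void
example_touch_socket(struct socket *so)
{
	socket_lock(so, 1);		/* lock and take a reference */
	/* ... inspect or modify socket state here ... */
	socket_unlock(so, 1);		/* drop the reference and unlock */
}
#endif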
/* Called with socket locked, will unlock socket */
sofree(struct socket *so)

	lck_mtx_t *mutex_held;
	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);

		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
soreference(struct socket *so)

	socket_lock(so, 1);	/* locks & take one reference on socket */
	socket_unlock(so, 0);	/* unlock only */

sodereference(struct socket *so)

	socket_unlock(so, 1);
/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters.  Caller must ensure to hold
 * the socket lock.
 */
somultipages(struct socket *so, boolean_t set)

		so->so_flags |= SOF_MULTIPAGES;

		so->so_flags &= ~SOF_MULTIPAGES;