/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.9 2001/07/26 18:53:02 peter Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/ev.h>
#include <kern/locks.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <sys/kdebug.h>
#define DBG_FNC_SBDROP		NETDBG_CODE(DBG_NETSOCK, 4)
#define DBG_FNC_SBAPPEND	NETDBG_CODE(DBG_NETSOCK, 5)
static int sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);
/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* XXX should be static */

static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of the connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of the disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
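/*
 * Illustrative sketch (not from the original file): the sequences above,
 * written out for a hypothetical connection-oriented protocol:
 *
 *	soisconnecting(so);	// connect() issued, handshake in progress
 *	...handshake completes...
 *	soisconnected(so);	// SS_ISCONNECTED set, sleepers on so_timeo woken
 *	...teardown begins...
 *	soisdisconnecting(so);	// no more data may be sent or received
 *	soisdisconnected(so);	// connection to the peer fully severed
 *
 * A connectionless protocol would instead call only soisconnected() and
 * soisdisconnected(), as noted above.
 */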
void
soisconnecting(so)
	register struct socket *so;
{

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
void
soisconnected(so)
	struct socket *so;
{
	struct socket *head = so->so_head;

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	sflt_notify(so, sock_evt_connected, NULL);

	if (head && (so->so_state & SS_INCOMP)) {
		so->so_state &= ~SS_INCOMP;
		so->so_state |= SS_COMP;
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(so, 0);
			socket_lock(head, 1);
		}
		postevent(head, 0, EV_RCONN);
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		sorwakeup(head);
		wakeup_one((caddr_t)&head->so_timeo);
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(head, 1);
			socket_lock(so, 0);
		}
	} else {
		postevent(so, 0, EV_WCONN);
		wakeup((caddr_t)&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}
void
soisdisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
void
soisdisconnected(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
/*
 * Return a random connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * This may be used in conjunction with protocol specific queue
 * congestion routines.
 */
struct socket *
sodropablereq(head)
	register struct socket *head;
{
	struct socket *so, *sonext = NULL;
	unsigned int i, j, qlen;
	static int rnd;
	static struct timeval old_runtime;
	static unsigned int cur_cnt, old_cnt;
	struct timeval tv;

	microtime(&tv);
	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
		old_runtime = tv;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	so = TAILQ_FIRST(&head->so_incomp);
	if (!so)
		return (so);

	qlen = head->so_incqlen;
	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so) {
//			if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				socket_lock(so, 1);
				sonext = TAILQ_NEXT(so, so_list);
//				in_pcb_check_state(so->so_pcb, WNT_RELEASE, 0);
				socket_unlock(so, 1);
				so = sonext;
//			}
		}
	}

//	if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) == WNT_STOPUSING)
//		return (NULL);
//	else
		return (so);
}
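/*
 * Worked example of the drop-index computation above (a sketch, not
 * kernel code): rnd advances through a 16-bit linear congruential
 * sequence, so ((qlen + 1) * rnd) >> 16 yields a roughly uniform
 * index in [0, qlen].  With qlen == 7 and rnd == 0x8000:
 *
 *	j = ((7 + 1) * 0x8000) >> 16;	// j == 4, the fifth entry
 *
 * When j lands one past the end of so_incomp, so becomes NULL and the
 * caller must drop the new request instead; that is the "one in qlen"
 * chance described above.
 */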
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 */
static struct socket *
sonewconn_internal(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	int error = 0;
	register struct socket *so;
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock != NULL)
		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
	else
		mutex_held = head->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (head->so_qlen > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = soalloc(1, head->so_proto->pr_domain->dom_family, head->so_type);
	if (so == NULL)
		return ((struct socket *)0);
	/* check if head was closed during the soalloc */
	if (head->so_proto == NULL) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_uid = head->so_uid;
	so->so_usecount = 1;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	TAILQ_INIT(&so->so_evlist);

	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sflt_termsock(so);
		sodealloc(so);
		return ((struct socket *)0);
	}

	/*
	 * Must be done with head unlocked to avoid deadlock
	 * for protocols with per-socket mutexes.
	 */
	if (head->so_proto->pr_unlock)
		socket_unlock(head, 0);
	if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) {
		sflt_termsock(so);
		sodealloc(so);
		if (head->so_proto->pr_unlock)
			socket_lock(head, 0);
		return ((struct socket *)0);
	}
	if (head->so_proto->pr_unlock)
		socket_lock(head, 0);
	so->so_proto->pr_domain->dom_refs++;

	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_state |= SS_COMP;
	} else {
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_state |= SS_INCOMP;
		head->so_incqlen++;
	}
	head->so_qlen++;

	/* Attach socket filters for this protocol */
	sflt_initsock(so);

	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
	}
	return (so);
}
struct socket *
sonewconn(
	struct socket *head,
	int connstatus,
	const struct sockaddr *from)
{
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	for (filter = head->so_filt; filter && (error == 0);
	     filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_connect_in) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(head);
				socket_unlock(head, 0);
			}
			error = filter->sfe_filter->sf_filter.sf_connect_in(
			    filter->sfe_cookie, head, from);
		}
	}
	if (filtered != 0) {
		socket_lock(head, 0);
		sflt_unuse(head);
	}

	if (error)
		return (NULL);

	return sonewconn_internal(head, connstatus);
}
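/*
 * Illustrative usage (a sketch; identifiers other than sonewconn and
 * soisconnected are hypothetical): a stream protocol's input path
 * might admit an embryonic connection like
 *
 *	struct socket *so2 = sonewconn(head, 0, (const struct sockaddr *)sin);
 *	if (so2 == NULL)
 *		goto drop;	// queue full or a filter rejected the connection
 *
 * and later, when the handshake completes, call soisconnected(so2) to
 * move so2 from head->so_incomp to head->so_comp for accept().
 */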
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
void
socantsendmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTSENDMORE;
	sflt_notify(so, sock_evt_cantsendmore, NULL);
	sowwakeup(so);
}

void
socantrcvmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTRCVMORE;
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
	sorwakeup(so);
}
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(sb)
	struct sockbuf *sb;
{
	int error = 0, lr_saved;
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	struct timespec ts;

	lr_saved = (unsigned int) __builtin_return_address(0);

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	sb->sb_flags |= SB_WAIT;

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);
	ts.tv_sec = sb->sb_timeo.tv_sec;
	ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
	error = msleep((caddr_t)&sb->sb_cc, mutex_held,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    &ts);

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);

	if ((so->so_state & SS_DRAINING))
		error = EBADF;

	return (error);
}
/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(sb)
	register struct sockbuf *sb;
{
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	int error = 0;

	if (so == NULL)
		panic("sb_lock: null so back pointer sb=%x\n", sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		if (so->so_proto->pr_getlock != NULL)
			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		else
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		if (so->so_usecount < 1)
			panic("sb_lock: so=%x refcount=%d\n", so, so->so_usecount);

		error = msleep((caddr_t)&sb->sb_flags, mutex_held,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sblock", 0);
		if (so->so_usecount < 1)
			panic("sb_lock: 2 so=%x refcount=%d\n", so, so->so_usecount);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(so, sb)
	register struct socket *so;
	register struct sockbuf *sb;
{
	struct proc *p = current_proc();

	sb->sb_flags &= ~SB_SEL;
	selwakeup(&sb->sb_sel);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC) {
		if (so->so_pgid < 0)
			gsignal(-so->so_pgid, SIGIO);
		else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
			psignal(p, SIGIO);
	}
	if (sb->sb_flags & SB_KNOTE) {
		KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
	}
	if (sb->sb_flags & SB_UPCALL) {
		socket_unlock(so, 0);
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
		socket_lock(so, 0);
	}
}
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field. The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
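/*
 * For example (a sketch of typical protocol attach code; the two
 * space constants are protocol-chosen defaults):
 *
 *	error = soreserve(so, tcp_sendspace, tcp_recvspace);
 *	if (error)
 *		return (error);
 *
 * After this the send and receive high-water marks are set and the
 * low-water marks have sane defaults (see soreserve() below).
 */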
int
soreserve(so, sndcc, rcvcc)
	register struct socket *so;
	u_long sndcc, rcvcc;
{
	if (sbreserve(&so->so_snd, sndcc) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
bad:
	return (ENOBUFS);
}
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(sb, cc)
	struct sockbuf *sb;
	u_long cc;
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
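/*
 * Worked example of the limits above (assuming SB_MAX == 256*1024,
 * MSIZE == 256 and MCLBYTES == 2048): the largest acceptable cc is
 * sb_max * 2048 / 2304, i.e. roughly 8/9 of sb_max, and reserving
 * cc == 8192 bytes yields sb_mbmax = min(8192 * 8, sb_max) == 65536.
 */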
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/* WARNING needs to do selthreadclear() before calling this */
void
sbrelease(sb)
	struct sockbuf *sb;
{
	sbflush(sb);
	sb->sb_hiwat = 0;
	sb->sb_mbmax = 0;
}
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
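/*
 * Typical usage (a sketch): a protocol input routine checks for room
 * with sbspace() and wakes any reader only if data was appended:
 *
 *	if (sbspace(&so->so_rcv) < m->m_pkthdr.len) {
 *		m_freem(m);
 *		return;
 *	}
 *	if (sbappend(&so->so_rcv, m))
 *		sorwakeup(so);
 */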
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
int
sbappend(sb, m)
	struct sockbuf *sb;
	struct mbuf *m;
{
	register struct mbuf *n, *sb_first;
	int result = 0;
	int error = 0;
	int filtered = 0;

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_START), sb, m->m_len, 0, 0, 0);

	if (m == 0)
		return 0;

again:
	sb_first = n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				result = sbappendrecord(sb, m); /* XXXXXX!!!! */
				KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);
				return result;
			}
		} while (n->m_next && (n = n->m_next));
	}

	if (!filtered && (sb->sb_flags & SB_RECV) != 0) {
		error = sflt_data_in(sb->sb_so, NULL, &m, NULL, 0, &filtered);
		if (error) {
			/* no data was appended, caller should not call sowakeup */
			return 0;
		}

		/*
		 * If we ran any filters, the socket lock was dropped. n and
		 * sb_first cached data from the socket buffer. This cache is
		 * not valid since we dropped the lock. We must start over.
		 * Since filtered is set we won't run through the filters a
		 * second time. We just set n and sb_first again.
		 */
		if (filtered)
			goto again;
	}

	result = sbcompress(sb, m, n);

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);

	return result;
}
void
sbcheck(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m;
	register struct mbuf *n = 0;
	register u_long len = 0, mbcnt = 0;
	lck_mtx_t *mutex_held;

	if (sb->sb_so->so_proto->pr_getlock != NULL)
		mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
	else
		mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
	}
}
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
int
sbappendrecord(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;
	int result = 0;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_record, NULL);
		if (error != 0) {
			if (error != EJUSTRETURN)
				m_freem(m0);
			return 0;
		}
	}

	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
int
sbinsertoob(sb, m0)
	struct sockbuf *sb;
	struct mbuf *m0;
{
	struct mbuf *m;
	struct mbuf **mp;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_oob, NULL);

		if (error) {
			if (error != EJUSTRETURN) {
				m_freem(m0);
			}
			return 0;
		}
	}

	for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
		m = *mp;
again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			m = m->m_next;
			if (m)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
static int
sbappendaddr_internal(sb, asa, m0, control)
	register struct sockbuf *sb;
	struct sockaddr *asa;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");

	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = m;
	} else
		sb->sb_mb = m;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendaddr(
	struct sockbuf *sb,
	struct sockaddr *asa,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrorfree");

	/* Call socket data in filters */
	if ((sb->sb_flags & SB_RECV) != 0) {
		int error;
		error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendaddr_internal(sb, asa, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
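/*
 * Illustrative datagram delivery (a sketch in the style of udp_input;
 * udp_in and opts stand in for the sender address and control mbufs):
 *
 *	int error;
 *	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&udp_in,
 *	    m, opts, &error) != 0)
 *		sorwakeup(so);
 *
 * On failure the mbufs have already been freed and error holds the
 * reason (e.g. ENOBUFS).
 */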
static int
sbappendcontrol_internal(sb, m0, control)
	struct sockbuf *sb;
	struct mbuf *control, *m0;
{
	register struct mbuf *m, *n;
	int space = 0;

	if (control == 0)
		panic("sbappendcontrol");

	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */
	for (m = control; m; m = m->m_next)
		sballoc(sb, m);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = control;
	} else
		sb->sb_mb = control;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendcontrol(
	struct sockbuf *sb,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (sb->sb_flags & SB_RECV) {
		int error;
		error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendcontrol_internal(sb, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
static int
sbcompress(sb, m, n)
	register struct sockbuf *sb;
	register struct mbuf *m, *n;
{
	register int eor = 0;
	register struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	postevent(0, sb, EV_RWBYTES);
	return 1;
}
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(sb)
	register struct sockbuf *sb;
{
	if (sb->sb_so == NULL)
		panic("sbflush sb->sb_so already null sb=%x\n", sb);
	(void)sblock(sb, M_WAIT);
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever. Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_so == NULL)
		panic("sbflush: cc %ld || mb %p || mbcnt %ld sb_so=%x",
		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt, sb->sb_so);

	postevent(0, sb, EV_RWBYTES);
	sbunlock(sb, 1);	/* keep socket locked */
}
/*
 * Drop data from (the front of) a sockbuf.
 * use m_freem_list to free the mbuf structures
 * under a single lock... this is done by pruning
 * the top of the tree from the body by keeping track
 * of where we get to in the tree and then zeroing the
 * two pertinent pointers m_nextpkt and m_next
 * the socket buffer is then updated to point at the new
 * top of the tree and the pruned area is released via
 * m_freem_list.
 */
void
sbdrop(sb, len)
	register struct sockbuf *sb;
	register int len;
{
	register struct mbuf *m, *free_list, *ml;
	struct mbuf *next, *last;

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	free_list = last = m;
	ml = (struct mbuf *)0;

	while (len > 0) {
		if (m == 0) {
			if (next == 0) {
				/*
				 * temporarily replacing this panic with printf
				 * because it occurs occasionally when closing
				 * a socket when there is no harm in ignoring
				 * it. This problem will be investigated further.
				 */
				/* panic("sbdrop"); */
				printf("sbdrop - count not zero\n");
				len = 0;
				/*
				 * zero the counts. if we have no mbufs,
				 * we have no data (PR-2986815)
				 */
				sb->sb_cc = 0;
				sb->sb_mbcnt = 0;
				break;
			}
			m = last = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	if (ml) {
		ml->m_next = (struct mbuf *)0;
		last->m_nextpkt = (struct mbuf *)0;
		m_freem_list(free_list);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;

	postevent(0, sb, EV_RWBYTES);

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
}
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
			m = mn;
		} while (m);
	}
	postevent(0, sb, EV_RWBYTES);
}
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(p, size, type, level)
	caddr_t p;
	register int size;
	int type, level;
{
	register struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN)
		return ((struct mbuf *) NULL);
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	/* XXX check size? */
	(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
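/*
 * Example use (a sketch; ip is a hypothetical pointer to a received
 * IP header): building a control record that carries the packet's TTL
 * for delivery alongside a datagram, in the style of ip_savecontrol():
 *
 *	struct mbuf *opts;
 *	opts = sbcreatecontrol((caddr_t)&ip->ip_ttl, sizeof(u_char),
 *	    IP_RECVTTL, IPPROTO_IP);
 *
 * A NULL return means the request would not fit in a single mbuf or
 * no mbuf was available.
 */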
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_abort_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
		    struct ifnet *ifp, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_detach_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{
	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
		 struct sockaddr *addr, struct mbuf *control,
		 struct proc *p)
{
	return EOPNOTSUPP;
}
/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}
int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr,
		   struct uio *uio, struct mbuf *top,
		   struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int
pru_soreceive_notsupp(struct socket *so,
		      struct sockaddr **paddr,
		      struct uio *uio, struct mbuf **mp0,
		      struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_shutdown_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_sosend(struct socket *so, struct sockaddr *addr,
	   struct uio *uio, struct mbuf *top,
	   struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int
pru_soreceive(struct socket *so,
	      struct sockaddr **paddr,
	      struct uio *uio, struct mbuf **mp0,
	      struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_sopoll_notsupp(__unused struct socket *so, __unused int events,
		   __unused kauth_cred_t cred, __unused void *wql)
{
	return EOPNOTSUPP;
}
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */
int
sb_notify(struct sockbuf *sb)
{
	return ((sb->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)) != 0);
}
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematic if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.  Should use "lmin" but it doesn't exist now.
 */
long
sbspace(struct sockbuf *sb)
{
	return ((long) imin((int)(sb->sb_hiwat - sb->sb_cc),
	    (int)(sb->sb_mbmax - sb->sb_mbcnt)));
}
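/*
 * For example (a sketch): with sb_hiwat == 8192, sb_cc == 1024,
 * sb_mbmax == 65536 and sb_mbcnt == 60000, sbspace() returns
 * imin(7168, 5536) == 5536; the mbuf-accounting limit, not the byte
 * count, is what binds here.
 */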
/* do we have to send all at once on a socket? */
int
sosendallatonce(struct socket *so)
{
	return (so->so_proto->pr_flags & PR_ATOMIC);
}
/* can we read something from so? */
int
soreadable(struct socket *so)
{
	return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) ||
	    so->so_comp.tqh_first || so->so_error);
}
/* can we write something to so? */
int
sowriteable(struct socket *so)
{
	return ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) ||
	     (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) ||
	    so->so_error);
}
/* adjust counters in sb reflecting allocation of m */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc += m->m_len;
	sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}
/* adjust counters in sb reflecting freeing of m */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc -= m->m_len;
	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	return (sb->sb_flags & SB_LOCK ?
	    ((wf == M_WAIT) ? sb_lock(sb) : EWOULDBLOCK) :
	    (sb->sb_flags |= SB_LOCK), 0);
}
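/*
 * Typical pairing (a sketch): soreceive-style code takes the buffer
 * lock around its scan of so_rcv and drops it when done, e.g.
 *
 *	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
 *	if (error)
 *		return (error);
 *	...consume data...
 *	sbunlock(&so->so_rcv, 1);	// keep the socket itself locked
 */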
/* release lock on sockbuf sb */
void
sbunlock(struct sockbuf *sb, int keeplocked)
{
	struct socket *so = sb->sb_so;
	int lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = (unsigned int) __builtin_return_address(0);

	sb->sb_flags &= ~SB_LOCK;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	if (keeplocked == 0)
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		if (so->so_usecount < 0)
			panic("sbunlock: b4 wakeup so=%x ref=%d lr=%x sb_flags=%x\n",
			    sb->sb_so, so->so_usecount, lr_saved, sb->sb_flags);

		wakeup((caddr_t)&(sb)->sb_flags);
	}
	if (keeplocked == 0) {	/* unlock on exit */
		so->so_usecount--;
		if (so->so_usecount < 0)
			panic("sbunlock: unlock on exit so=%x lr=%x sb_flags=%x\n",
			    so, so->so_usecount, lr_saved, sb->sb_flags);
		so->unlock_lr[so->next_unlock_lr] = (void *)lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
		lck_mtx_unlock(mutex_held);
	}
}
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv);
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd);
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(sa, canwait)
	struct sockaddr *sa;
	int canwait;
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	if (so->so_proto) {
		xso->xso_protocol = so->so_proto->pr_protocol;
		xso->xso_family = so->so_proto->pr_domain->dom_family;
	} else
		xso->xso_protocol = xso->xso_family = 0;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_uid;
}
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = (u_long)(sb->sb_timeo.tv_sec * hz) +
	    sb->sb_timeo.tv_usec / tick;
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0)
		xsb->sb_timeo = 1;
}
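/*
 * Worked example of the conversion above (assuming hz == 100, so
 * tick == 10000 us): a sb_timeo of { tv_sec = 2, tv_usec = 500000 }
 * exports as 2 * 100 + 500000 / 10000 == 250 ticks, while a timeout
 * of less than one tick is rounded up to 1 so it is not mistaken for
 * "no timeout".
 */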
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
    &sb_max, 0, "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
    &maxsockets, 0, "Maximum number of sockets available");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
    &nmbclusters, 0, "");