/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
/* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.9 2001/07/26 18:53:02 peter Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/ev.h>
#include <kern/locks.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <sys/kdebug.h>
#define	DBG_FNC_SBDROP		NETDBG_CODE(DBG_NETSOCK, 4)
#define	DBG_FNC_SBAPPEND	NETDBG_CODE(DBG_NETSOCK, 5)

static int sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* XXX should be static */

static u_long sb_efficiency = 8;	/* parameter for sbreserve() */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
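
/*
 * Illustrative sketch (not compiled): how a protocol might drive the state
 * transitions described above.  The example_* functions and the handshake
 * details are hypothetical; only the soisconnecting()/soisconnected()/
 * soisdisconnecting()/soisdisconnected() calls mirror this file.
 */
#if 0
static void
example_active_open(struct socket *so)
{
	soisconnecting(so);	/* connect() in progress */
	/* ... protocol exchanges its handshake with the peer ... */
	soisconnected(so);	/* established; wakes connect()/accept() sleepers */
}

static void
example_teardown(struct socket *so)
{
	soisdisconnecting(so);	/* disconnect() in progress; no more I/O */
	/* ... protocol drains data and notifies the peer ... */
	soisdisconnected(so);	/* connection to the peer totally severed */
}
#endif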
void
soisconnecting(so)
	register struct socket *so;
{

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
void
soisconnected(so)
	struct socket *so;
{
	struct socket *head = so->so_head;

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	sflt_notify(so, sock_evt_connected, NULL);

	if (head && (so->so_state & SS_INCOMP)) {
		so->so_state &= ~SS_INCOMP;
		so->so_state |= SS_COMP;
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(so, 0);
			socket_lock(head, 1);
		}
		postevent(head, 0, EV_RCONN);
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		sorwakeup(head);
		wakeup_one((caddr_t)&head->so_timeo);
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(head, 1);
			socket_lock(so, 0);
		}
	} else {
		postevent(so, 0, EV_WCONN);
		wakeup((caddr_t)&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}
void
soisdisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
void
soisdisconnected(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
/*
 * Return a random connection that has not been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no dropable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * This may be used in conjunction with protocol specific queue
 * congestion routines.
 */
struct socket *
sodropablereq(head)
	register struct socket *head;
{
	struct socket *so, *sonext = NULL;
	unsigned int i, j, qlen;
	static int rnd;
	static struct timeval old_runtime;
	static unsigned int cur_cnt, old_cnt;
	struct timeval tv;

	microtime(&tv);
	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
		old_runtime = tv;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	so = TAILQ_FIRST(&head->so_incomp);
	if (!so)
		return (NULL);

	qlen = head->so_incqlen;
	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so) {
//			if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				socket_lock(so, 1);
				sonext = TAILQ_NEXT(so, so_list);
//				in_pcb_check_state(so->so_pcb, WNT_RELEASE, 0);
				socket_unlock(so, 1);
				so = sonext;
		}
	}

//	if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) == WNT_STOPUSING)
//		return (NULL);
//	else
		return (so);
}
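
/*
 * Illustrative sketch (not compiled here): the index computation above is a
 * 16-bit linear congruential generator whose output is scaled to [0, qlen]
 * by fixed-point multiplication, avoiding a division.  A standalone
 * userspace demonstration, with hypothetical values:
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned int rnd = 1, qlen = 10, i;

	for (i = 0; i < 5; i++) {
		unsigned int j;

		rnd = (314159 * rnd + 66329) & 0xffff; /* uniform in [0, 65535] */
		j = ((qlen + 1) * rnd) >> 16;          /* scaled to [0, qlen] */
		printf("j = %u\n", j);
	}
	return 0;
}
#endif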
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 */
static struct socket *
sonewconn_internal(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	int error = 0;
	register struct socket *so;
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock != NULL)
		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
	else
		mutex_held = head->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (head->so_qlen > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = soalloc(1, head->so_proto->pr_domain->dom_family, head->so_type);
	if (so == NULL)
		return ((struct socket *)0);
	/* check if head was closed during the soalloc */
	if (head->so_proto == NULL) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_uid = head->so_uid;
	so->so_usecount = 1;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	TAILQ_INIT(&so->so_evlist);

	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sflt_termsock(so);
		sodealloc(so);
		return ((struct socket *)0);
	}

	/*
	 * Must be done with head unlocked to avoid deadlock for protocol with per socket mutexes.
	 */
	if (head->so_proto->pr_unlock)
		socket_unlock(head, 0);
	if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) {
		sflt_termsock(so);
		sodealloc(so);
		if (head->so_proto->pr_unlock)
			socket_lock(head, 0);
		return ((struct socket *)0);
	}
	if (head->so_proto->pr_unlock)
		socket_lock(head, 0);
	so->so_proto->pr_domain->dom_refs++;

	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_state |= SS_COMP;
	} else {
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_state |= SS_INCOMP;
		head->so_incqlen++;
	}
	head->so_qlen++;

	/* Attach socket filters for this protocol */
	sflt_initsock(so);

	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
	}
	return (so);
}


struct socket *
sonewconn(
	struct socket *head,
	int connstatus,
	const struct sockaddr *from)
{
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	for (filter = head->so_filt; filter && (error == 0);
	     filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_connect_in) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(head);
				socket_unlock(head, 0);
			}
			error = filter->sfe_filter->sf_filter.sf_connect_in(
			    filter->sfe_cookie, head, from);
		}
	}
	if (filtered != 0) {
		socket_lock(head, 0);
		sflt_unuse(head);
	}

	if (error) {
		return NULL;
	}

	return sonewconn_internal(head, connstatus);
}
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTSENDMORE;
	sflt_notify(so, sock_evt_cantsendmore, NULL);
	sowwakeup(so);
}

void
socantrcvmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTRCVMORE;
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
	sorwakeup(so);
}
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(sb)
	struct sockbuf *sb;
{
	int error = 0, lr_saved;
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	struct timespec ts;

	lr_saved = (unsigned int) __builtin_return_address(0);

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	sb->sb_flags |= SB_WAIT;

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);
	ts.tv_sec = sb->sb_timeo.tv_sec;
	ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
	error = msleep((caddr_t)&sb->sb_cc, mutex_held,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    &ts);

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);

	if ((so->so_state & SS_DRAINING)) {
		error = EBADF;
	}

	return (error);
}
/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(sb)
	register struct sockbuf *sb;
{
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	int error = 0;

	if (so == NULL)
		panic("sb_lock: null so back pointer sb=%x\n", sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		if (so->so_proto->pr_getlock != NULL)
			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		else
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		if (so->so_usecount < 1)
			panic("sb_lock: so=%x refcount=%d\n", so, so->so_usecount);
		error = msleep((caddr_t)&sb->sb_flags, mutex_held,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sblock", 0);
		if (so->so_usecount < 1)
			panic("sb_lock: 2 so=%x refcount=%d\n", so, so->so_usecount);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(so, sb)
	register struct socket *so;
	register struct sockbuf *sb;
{
	struct proc *p = current_proc();

	sb->sb_flags &= ~SB_SEL;
	selwakeup(&sb->sb_sel);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC) {
		if (so->so_pgid < 0)
			gsignal(-so->so_pgid, SIGIO);
		else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
			psignal(p, SIGIO);
	}
	if (sb->sb_flags & SB_KNOTE) {
		KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
	}
	if (sb->sb_flags & SB_UPCALL) {
		socket_unlock(so, 0);
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
		socket_lock(so, 0);
	}
}
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
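
/*
 * Illustrative sketch (not compiled): the reserve/release lifecycle the
 * comment above describes, as a hypothetical protocol attach/detach pair.
 * The 8192-byte sizes are placeholders; note the selthreadclear() warning
 * ahead of sbrelease() further below.
 */
#if 0
static int
example_attach(struct socket *so)
{
	/* commit send and receive buffer space before first use */
	return soreserve(so, 8192 /* sndcc */, 8192 /* rcvcc */);
}

static void
example_detach(struct socket *so)
{
	/* return the space when the socket is destroyed */
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
	selthreadclear(&so->so_rcv.sb_sel);
	sbrelease(&so->so_rcv);
}
#endif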
int
soreserve(so, sndcc, rcvcc)
	register struct socket *so;
	u_long sndcc, rcvcc;
{

	if (sbreserve(&so->so_snd, sndcc) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
bad:
	return (ENOBUFS);
}
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(sb, cc)
	struct sockbuf *sb;
	u_long cc;
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/*  WARNING needs to do selthreadclear() before calling this */
void
sbrelease(sb)
	struct sockbuf *sb;
{
	sbflush(sb);
	sb->sb_hiwat = 0;
	sb->sb_mbmax = 0;
}
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
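
/*
 * Illustrative sketch (not compiled): how a datagram-style protocol might
 * use the routines described above to queue an incoming packet.  The
 * example_input name is hypothetical; the calls follow this file.
 */
#if 0
static void
example_input(struct socket *so, struct sockaddr *from, struct mbuf *m)
{
	int error;

	/* sender's name + data become one new record; space is checked inside */
	if (sbappendaddr(&so->so_rcv, from, m, NULL, &error) != 0)
		sorwakeup(so);	/* notify readers that data arrived */
	/* on failure, sbappendaddr() has already freed m and set error */
}
#endif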
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
int
sbappend(sb, m)
	struct sockbuf *sb;
	struct mbuf *m;
{
	register struct mbuf *n, *sb_first;
	int result = 0;
	int error = 0;
	int filtered = 0;

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_START), sb, m->m_len, 0, 0, 0);

	if (m == 0)
		return 0;

again:
	sb_first = n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				result = sbappendrecord(sb, m); /* XXXXXX!!!! */
				KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);
				return result;
			}
		} while (n->m_next && (n = n->m_next));
	}

	if (!filtered && (sb->sb_flags & SB_RECV) != 0) {
		error = sflt_data_in(sb->sb_so, NULL, &m, NULL, 0, &filtered);
		if (error) {
			/* no data was appended, caller should not call sowakeup */
			return 0;
		}

		/*
		 * If we ran any filters, the socket lock was dropped.  n and
		 * sb_first cached data from the socket buffer.  This cache is
		 * not valid since we dropped the lock.  We must start over.
		 * Since filtered is set we won't run through the filters a
		 * second time.  We just set n and sb_first again.
		 */
		if (filtered)
			goto again;
	}

	result = sbcompress(sb, m, n);

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);

	return result;
}
void
sbcheck(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m;
	register struct mbuf *n = 0;
	register u_long len = 0, mbcnt = 0;
	lck_mtx_t *mutex_held;

	if (sb->sb_so->so_proto->pr_getlock != NULL)
		mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
	else
		mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
	}
}
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
int
sbappendrecord(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;
	int result = 0;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_record, NULL);
		if (error != 0) {
			if (error != EJUSTRETURN)
				m_freem(m0);
			return 0;
		}
	}

	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
int
sbinsertoob(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;
	register struct mbuf **mp;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_oob, NULL);

		if (error) {
			if (error != EJUSTRETURN) {
				m_freem(m0);
			}
			return 0;
		}
	}

	for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
		m = *mp;
again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;	/* WANT next train */

		case MT_CONTROL:
			m = m->m_next;
			if (m)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
static int
sbappendaddr_internal(sb, asa, m0, control)
	register struct sockbuf *sb;
	struct sockaddr *asa;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");

	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = m;
	} else
		sb->sb_mb = m;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendaddr(
	struct sockbuf *sb,
	struct sockaddr *asa,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrorfree");

	/* Call socket data in filters */
	if ((sb->sb_flags & SB_RECV) != 0) {
		int error;
		error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendaddr_internal(sb, asa, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
static int
sbappendcontrol_internal(sb, m0, control)
	struct sockbuf *sb;
	struct mbuf *control, *m0;
{
	register struct mbuf *m, *n;
	int space = 0;

	if (control == 0)
		panic("sbappendcontrol");

	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */
	for (m = control; m; m = m->m_next)
		sballoc(sb, m);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = control;
	} else
		sb->sb_mb = control;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendcontrol(
	struct sockbuf *sb,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (sb->sb_flags & SB_RECV) {
		int error;
		error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendcontrol_internal(sb, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
static int
sbcompress(sb, m, n)
	register struct sockbuf *sb;
	register struct mbuf *m, *n;
{
	register int eor = 0;
	register struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	postevent(0, sb, EV_RWBYTES);
	return 1;
}
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(sb)
	register struct sockbuf *sb;
{
	if (sb->sb_so == NULL)
		panic("sbflush sb->sb_so already null sb=%x\n", sb);
	(void)sblock(sb, M_WAIT);
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever. Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_so == NULL)
		panic("sbflush: cc %ld || mb %p || mbcnt %ld sb_so=%x",
		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt, sb->sb_so);

	postevent(0, sb, EV_RWBYTES);
	sbunlock(sb, 1);	/* keep socket locked */
}
/*
 * Drop data from (the front of) a sockbuf.
 * use m_freem_list to free the mbuf structures
 * under a single lock... this is done by pruning
 * the top of the tree from the body by keeping track
 * of where we get to in the tree and then zeroing the
 * two pertinent pointers m_nextpkt and m_next
 * the socket buffer is then updated to point at the new
 * top of the tree and the pruned area is released via
 * m_freem_list.
 */
void
sbdrop(sb, len)
	register struct sockbuf *sb;
	register int len;
{
	register struct mbuf *m, *free_list, *ml;
	struct mbuf *next, *last;

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	free_list = last = m;
	ml = (struct mbuf *)0;

	while (len > 0) {
		if (m == 0) {
			if (next == 0) {
				/*
				 * Temporarily replacing this panic with printf,
				 * because it occurs occasionally when closing
				 * a socket and there is no harm in ignoring
				 * it.  This problem will be investigated
				 * further.
				 */
				/* panic("sbdrop"); */
				printf("sbdrop - count not zero\n");
				len = 0;
				/* zero the counts. if we have no mbufs, we have no data (PR-2986815) */
				sb->sb_cc = 0;
				sb->sb_mbcnt = 0;
				break;
			}
			m = last = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	if (ml) {
		ml->m_next = (struct mbuf *)0;
		last->m_nextpkt = (struct mbuf *)0;
		m_freem_list(free_list);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;

	postevent(0, sb, EV_RWBYTES);

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
}
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
			m = mn;
		} while (m);
	}
	postevent(0, sb, EV_RWBYTES);
}
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(p, size, type, level)
	caddr_t p;
	register int size;
	int type, level;
{
	register struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN)
		return ((struct mbuf *) NULL);
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	/* XXX check size? */
	(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
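
/*
 * Illustrative sketch (not compiled): attaching ancillary data built with
 * sbcreatecontrol() to a receive buffer.  The example_* name is
 * hypothetical and the SCM_TIMESTAMP/SOL_SOCKET pair is just one plausible
 * level/type choice.
 */
#if 0
static void
example_queue_with_control(struct socket *so, struct mbuf *data)
{
	struct timeval tv;
	struct mbuf *control;
	int error;

	microtime(&tv);
	control = sbcreatecontrol((caddr_t)&tv, sizeof(tv),
	    SCM_TIMESTAMP, SOL_SOCKET);
	/* control record + data become one new record in so_rcv */
	if (sbappendcontrol(&so->so_rcv, data, control, &error) != 0)
		sorwakeup(so);
}
#endif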
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_abort_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
		    struct ifnet *ifp, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_detach_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{
	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
		 struct sockaddr *addr, struct mbuf *control,
		 struct proc *p)
{
	return EOPNOTSUPP;
}

/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}

int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr,
		   struct uio *uio, struct mbuf *top,
		   struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int
pru_soreceive_notsupp(struct socket *so,
		      struct sockaddr **paddr,
		      struct uio *uio, struct mbuf **mp0,
		      struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_shutdown_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_sosend(struct socket *so, struct sockaddr *addr,
	   struct uio *uio, struct mbuf *top,
	   struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int
pru_soreceive(struct socket *so,
	      struct sockaddr **paddr,
	      struct uio *uio, struct mbuf **mp0,
	      struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_sopoll_notsupp(__unused struct socket *so, __unused int events,
		   __unused kauth_cred_t cred, __unused void *wql)
{
	return EOPNOTSUPP;
}
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */

int
sb_notify(struct sockbuf *sb)
{
	return ((sb->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)) != 0);
}
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematical if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.  Should use "lmin" but it doesn't exist now.
 */
long
sbspace(struct sockbuf *sb)
{
	return ((long) imin((int)(sb->sb_hiwat - sb->sb_cc),
	    (int)(sb->sb_mbmax - sb->sb_mbcnt)));
}
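
/*
 * Illustrative sketch (not compiled): the conventional admission check a
 * protocol performs with sbspace() before appending, per the comment ahead
 * of the sbappend() routines.  The example_* name is hypothetical and m is
 * assumed to carry a packet header.
 */
#if 0
static int
example_can_queue(struct socket *so, struct mbuf *m)
{
	/* sbspace() already folds in both byte count and mbuf overhead limits */
	return (sbspace(&so->so_rcv) >= m->m_pkthdr.len);
}
#endif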
/* do we have to send all at once on a socket? */
int
sosendallatonce(struct socket *so)
{
	return (so->so_proto->pr_flags & PR_ATOMIC);
}

/* can we read something from so? */
int
soreadable(struct socket *so)
{
	return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) ||
	    so->so_comp.tqh_first || so->so_error);
}

/* can we write something to so? */
int
sowriteable(struct socket *so)
{
	return ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) ||
	     (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) ||
	    so->so_error);
}
/* adjust counters in sb reflecting allocation of m */

void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc += m->m_len;
	sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}

/* adjust counters in sb reflecting freeing of m */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc -= m->m_len;
	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	if (sb->sb_flags & SB_LOCK)
		return ((wf == M_WAIT) ? sb_lock(sb) : EWOULDBLOCK);
	sb->sb_flags |= SB_LOCK;
	return (0);
}
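
/*
 * Illustrative sketch (not compiled): the usual pairing of sblock() and
 * sbunlock() around an operation that must see a stable buffer chain, as
 * sbflush() above does.  The example_* name is hypothetical.
 */
#if 0
static void
example_locked_op(struct sockbuf *sb)
{
	(void)sblock(sb, M_WAIT);	/* may sleep until SB_LOCK is released */
	/* ... walk or modify sb->sb_mb safely here ... */
	sbunlock(sb, 1);		/* keep the socket mutex held */
}
#endif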
/* release lock on sockbuf sb */
void
sbunlock(struct sockbuf *sb, int keeplocked)
{
	struct socket *so = sb->sb_so;
	int lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = (unsigned int) __builtin_return_address(0);

	sb->sb_flags &= ~SB_LOCK;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	if (keeplocked == 0)
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		if (so->so_usecount < 0)
			panic("sbunlock: b4 wakeup so=%x ref=%d lr=%x sb_flags=%x\n",
			    sb->sb_so, so->so_usecount, lr_saved, sb->sb_flags);

		wakeup((caddr_t)&(sb)->sb_flags);
	}
	if (keeplocked == 0) {	/* unlock on exit */
		so->so_usecount--;
		if (so->so_usecount < 0)
			panic("sbunlock: unlock on exit so=%x ref=%d lr=%x sb_flags=%x\n",
			    so, so->so_usecount, lr_saved, sb->sb_flags);
		so->unlock_lr[so->next_unlock_lr] = (void *)lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
		lck_mtx_unlock(mutex_held);
	}
}
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv);
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd);
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(sa, canwait)
	struct sockaddr *sa;
	int canwait;
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	if (so->so_proto) {
		xso->xso_protocol = so->so_proto->pr_protocol;
		xso->xso_family = so->so_proto->pr_domain->dom_family;
	} else
		xso->xso_protocol = xso->xso_family = 0;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_uid;
}
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = (u_long)(sb->sb_timeo.tv_sec * hz) +
	    sb->sb_timeo.tv_usec / tick;
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0)
		xsb->sb_timeo = 1;
}
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
    &sb_max, 0, "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
    &maxsockets, 0, "Maximum number of sockets available");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
    &nmbclusters, 0, "");