/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.9 2001/07/26 18:53:02 peter Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/ev.h>
#include <kern/locks.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <sys/kdebug.h>
#define	DBG_FNC_SBDROP		NETDBG_CODE(DBG_NETSOCK, 4)
#define	DBG_FNC_SBAPPEND	NETDBG_CODE(DBG_NETSOCK, 5)

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* XXX should be static */

static u_long sb_efficiency = 8;	/* parameter for sbreserve() */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
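
/*
 * Illustrative sketch (not part of the build): the state-flag sequence a
 * connection-oriented protocol drives on the active side, per the comment
 * above.  The function name is hypothetical.
 */
#if 0
static void
example_active_connect(struct socket *so)
{
	soisconnecting(so);	/* connect() in progress */
	/* ... protocol completes its handshake with the peer ... */
	soisconnected(so);	/* connection established */
	/* ... later, teardown begins ... */
	soisdisconnecting(so);	/* disconnect() in progress */
	soisdisconnected(so);	/* connection to the peer fully severed */
}
#endif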
void
soisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
void
soisconnected(so)
	struct socket *so;
{
	struct socket *head = so->so_head;

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	sflt_notify(so, sock_evt_connected, NULL);

	if (head && (so->so_state & SS_INCOMP)) {
		so->so_state &= ~SS_INCOMP;
		so->so_state |= SS_COMP;
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(so, 0);
			socket_lock(head, 1);
		}
		postevent(head, 0, EV_RCONN);
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		sorwakeup(head);
		wakeup_one((caddr_t)&head->so_timeo);
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(head, 1);
			socket_lock(so, 0);
		}
	} else {
		postevent(so, 0, EV_WCONN);
		wakeup((caddr_t)&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}
void
soisdisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
void
soisdisconnected(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
/*
 * Return a random connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * This may be used in conjunction with protocol specific queue
 * congestion routines.
 */
struct socket *
sodropablereq(head)
	register struct socket *head;
{
	struct socket *so, *sonext = NULL;
	unsigned int i, j, qlen;
	static int rnd;
	static struct timeval old_runtime;
	static unsigned int cur_cnt, old_cnt;
	struct timeval tv;

	microtime(&tv);
	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
		old_runtime = tv;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	so = TAILQ_FIRST(&head->so_incomp);
	if (!so)
		return (NULL);

	qlen = head->so_incqlen;
	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so) {
//			if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				socket_lock(so, 1);
				sonext = TAILQ_NEXT(so, so_list);
//				in_pcb_check_state(so->so_pcb, WNT_RELEASE, 0);
				socket_unlock(so, 1);
				so = sonext;
//			}
		}

//		if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) == WNT_STOPUSING)
//			return (NULL);
	}

	return (so);
}
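
/*
 * Illustrative sketch (not built): how the linear-congruential step above
 * maps a 16-bit pseudo-random value onto a queue index in [0, qlen].  When
 * the index equals qlen the walk runs off the end of the list and the
 * caller sees NULL -- the "no droppable request" case described above.
 */
#if 0
static unsigned int
example_random_index(unsigned int qlen)
{
	static int rnd;

	rnd = (314159 * rnd + 66329) & 0xffff;	/* 16-bit LCG step */
	return (((qlen + 1) * rnd) >> 16);	/* scaled into [0, qlen] */
}
#endif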
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 */
static struct socket *
sonewconn_internal(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	int error = 0;
	register struct socket *so;
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock != NULL)
		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
	else
		mutex_held = head->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (head->so_qlen > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = soalloc(1, head->so_proto->pr_domain->dom_family, head->so_type);
	if (so == NULL)
		return ((struct socket *)0);
	/* check if head was closed during the soalloc */
	if (head->so_proto == NULL) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid  = head->so_pgid;
	so->so_uid = head->so_uid;

	so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	TAILQ_INIT(&so->so_evlist);

	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	/*
	 * Must be done with head unlocked to avoid deadlock
	 * for protocols with per socket mutexes.
	 */
	if (head->so_proto->pr_unlock)
		socket_unlock(head, 0);
	if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) {
		sodealloc(so);
		if (head->so_proto->pr_unlock)
			socket_lock(head, 0);
		return ((struct socket *)0);
	}
	if (head->so_proto->pr_unlock)
		socket_lock(head, 0);
	so->so_proto->pr_domain->dom_refs++;

	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_state |= SS_COMP;
	} else {
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_state |= SS_INCOMP;
		head->so_incqlen++;
	}
	head->so_qlen++;

	/* Attach socket filters for this protocol */
	sflt_initsock(so);

	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
	}
	return (so);
}
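
/*
 * Illustrative sketch (not built): the admission check at the top of
 * sonewconn_internal().  With listen(s, 5) the queue limit is 5, so new
 * connections are refused once the queue length exceeds 3 * 5 / 2 = 7.
 */
#if 0
static int
example_backlog_full(struct socket *head)
{
	return (head->so_qlen > 3 * head->so_qlimit / 2);
}
#endif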
struct socket *
sonewconn(
	struct socket *head,
	int connstatus,
	const struct sockaddr *from)
{
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	for (filter = head->so_filt; filter && (error == 0);
	     filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_connect_in) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(head);
				socket_unlock(head, 0);
			}
			error = filter->sfe_filter->sf_filter.sf_connect_in(
			    filter->sfe_cookie, head, from);
		}
	}
	if (filtered != 0) {
		socket_lock(head, 0);
		sflt_unuse(head);
	}

	if (error)
		return (NULL);

	return sonewconn_internal(head, connstatus);
}
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */

void
socantsendmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTSENDMORE;
	sflt_notify(so, sock_evt_cantsendmore, NULL);
	sowwakeup(so);
}

void
socantrcvmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTRCVMORE;
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
	sorwakeup(so);
}
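
/*
 * Illustrative sketch (not built): how a protocol's PRU_SHUTDOWN handler
 * would typically use socantsendmore(), and how end-of-data from the peer
 * maps to socantrcvmore().  The function names are hypothetical.
 */
#if 0
static int
example_pru_shutdown(struct socket *so)
{
	socantsendmore(so);	/* user called shutdown(s, SHUT_WR) */
	/* ... protocol emits its end-of-data marker (e.g. TCP FIN) ... */
	return (0);
}

static void
example_peer_done_sending(struct socket *so)
{
	/* peer will send no more; data already queued is still readable */
	socantrcvmore(so);
}
#endif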
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(sb)
	struct sockbuf *sb;
{
	int error = 0, lr, lr_saved;
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	struct timespec ts;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	sb->sb_flags |= SB_WAIT;

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);
	ts.tv_sec = sb->sb_timeo.tv_sec;
	ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
	error = msleep((caddr_t)&sb->sb_cc, mutex_held,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    &ts);

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);

	if ((so->so_state & SS_DRAINING)) {
		error = EBADF;
	}

	return (error);
}
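
/*
 * Illustrative sketch (not built): the timeval-to-timespec conversion used
 * above when arming the msleep() timeout, assuming the usual
 * microseconds-to-nanoseconds scaling.
 */
#if 0
static struct timespec
example_sb_timeout(struct timeval tv)
{
	struct timespec ts;

	ts.tv_sec = tv.tv_sec;
	ts.tv_nsec = tv.tv_usec * 1000;	/* microseconds -> nanoseconds */
	return (ts);
}
#endif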
/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(sb)
	register struct sockbuf *sb;
{
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	int error = 0, lr, lr_saved;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so == NULL)
		panic("sb_lock: null so back pointer sb=%x\n", sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		if (so->so_proto->pr_getlock != NULL)
			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		else
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		if (so->so_usecount < 1)
			panic("sb_lock: so=%x refcount=%d\n", so, so->so_usecount);
		error = msleep((caddr_t)&sb->sb_flags, mutex_held,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sblock", 0);
		if (so->so_usecount < 1)
			panic("sb_lock: 2 so=%x refcount=%d\n", so, so->so_usecount);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(so, sb)
	register struct socket *so;
	register struct sockbuf *sb;
{
	struct proc *p = current_proc();

	sb->sb_flags &= ~SB_SEL;
	selwakeup(&sb->sb_sel);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC) {
		if (so->so_pgid < 0)
			gsignal(-so->so_pgid, SIGIO);
		else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
			psignal(p, SIGIO);
	}
	if (sb->sb_flags & SB_KNOTE) {
		KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
	}
	if (sb->sb_flags & SB_UPCALL) {
		socket_unlock(so, 0);
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
		socket_lock(so, 0);
	}
}
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field. The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
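
/*
 * Illustrative sketch (not built): the record layout soreceive() expects
 * for a datagram with a sender name -- an MT_SONAME mbuf chained to the
 * data via m_next, with successive records linked through m_nextpkt.
 */
#if 0
static void
example_record_layout(struct mbuf *name, struct mbuf *data,
    struct mbuf *prev_record)
{
	name->m_type = MT_SONAME;	/* record begins with the sender's name */
	name->m_next = data;		/* data follows within the same record */
	data->m_next = NULL;
	name->m_nextpkt = NULL;
	if (prev_record)
		prev_record->m_nextpkt = name;	/* records chain via m_nextpkt */
}
#endif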
int
soreserve(so, sndcc, rcvcc)
	register struct socket *so;
	u_long sndcc, rcvcc;
{
	if (sbreserve(&so->so_snd, sndcc) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
bad:
	return (ENOBUFS);
}
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(sb, cc)
	struct sockbuf *sb;
	u_long cc;
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
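
/*
 * Illustrative sketch (not built): the limit arithmetic in sbreserve().
 * Assuming the historical values MSIZE = 256, MCLBYTES = 2048 and
 * sb_max = 256 * 1024, the largest acceptable reservation is
 * 262144 * 2048 / 2304, about 233,000 bytes, and sb_mbmax is cc scaled
 * by sb_efficiency (8) but clamped at sb_max.  The real constants come
 * from the platform headers.
 */
#if 0
static int
example_sbreserve_math(u_long cc)
{
	u_long mbmax;

	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);			/* request too large */
	mbmax = min(cc * sb_efficiency, sb_max);/* mbuf accounting ceiling */
	return (mbmax != 0);
}
#endif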
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/* WARNING needs to do selthreadclear() before calling this */
void
sbrelease(sb)
	struct sockbuf *sb;
{
	sbflush(sb);
	sb->sb_hiwat = 0;
	sb->sb_mbmax = 0;
}
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer, and the
 * data is then removed from the socket buffer with sbdrop() or
 * sbdroprecord() when it is acknowledged by the peer.
 */
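
/*
 * Illustrative sketch (not built): the usual check-then-append pattern a
 * protocol input path follows before queueing data for the reader,
 * assuming m carries a packet header.
 */
#if 0
static int
example_queue_input(struct socket *so, struct mbuf *m)
{
	if (sbspace(&so->so_rcv) < m->m_pkthdr.len) {
		m_freem(m);		/* no room: drop, let the peer retry */
		return (ENOBUFS);
	}
	if (sbappend(&so->so_rcv, m))	/* append to the last record */
		sorwakeup(so);		/* wake readers only if data landed */
	return (0);
}
#endif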
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * with the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
int
sbappend(sb, m)
	struct sockbuf *sb;
	struct mbuf *m;
{
	register struct mbuf *n, *sb_first;
	int result = 0;
	int error = 0;
	int filtered = 0;

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_START), sb, m->m_len, 0, 0, 0);

	if (m == 0)
		return 0;

again:
	sb_first = n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				result = sbappendrecord(sb, m); /* XXXXXX!!!! */
				KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);
				return result;
			}
		} while (n->m_next && (n = n->m_next));
	}

	if (!filtered && (sb->sb_flags & SB_RECV) != 0) {
		error = sflt_data_in(sb->sb_so, NULL, &m, NULL, 0, &filtered);
		if (error) {
			/* no data was appended, caller should not call sowakeup */
			return 0;
		}

		/*
		 * If we called any filters, the socket lock was dropped.
		 * n and sb_first cached data from the socket buffer.  This
		 * cache is not valid since we dropped the lock.  We must
		 * start over.  Since filtered is set we won't run through
		 * the filters a second time.  We just set n and sb_first
		 * again.
		 */
		if (filtered)
			goto again;
	}

	result = sbcompress(sb, m, n);

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);

	return result;
}
void
sbcheck(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m;
	register struct mbuf *n = 0;
	register u_long len = 0, mbcnt = 0;
	lck_mtx_t *mutex_held;

	if (sb->sb_so->so_proto->pr_getlock != NULL)
		mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
	else
		mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
	}
}
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
int
sbappendrecord(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_record, NULL);
		if (error != 0) {
			if (error != EJUSTRETURN)
				m_freem(m0);
			return 0;
		}
	}

	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
int
sbinsertoob(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;
	register struct mbuf **mp;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_oob, NULL);

		if (error) {
			if (error != EJUSTRETURN) {
				m_freem(m0);
			}
			return 0;
		}
	}

	for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) {
	    again:
		m = *mp;
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			m = m->m_next;
			if (m)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
static int
sbappendaddr_internal(sb, asa, m0, control)
	register struct sockbuf *sb;
	struct sockaddr *asa;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");

	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = m;
	} else
		sb->sb_mb = m;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}

int
sbappendaddr(
	struct sockbuf *sb,
	struct sockaddr *asa,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrorfree");

	/* Call socket data in filters */
	if ((sb->sb_flags & SB_RECV) != 0) {
		int error;

		error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendaddr_internal(sb, asa, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
static int
sbappendcontrol_internal(sb, m0, control)
	struct sockbuf *sb;
	struct mbuf *control, *m0;
{
	register struct mbuf *m, *n;
	int space = 0;

	if (control == 0)
		panic("sbappendcontrol");

	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */
	for (m = control; m; m = m->m_next)
		sballoc(sb, m);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = control;
	} else
		sb->sb_mb = control;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}

int
sbappendcontrol(
	struct sockbuf *sb,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (sb->sb_flags & SB_RECV) {
		int error;

		error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendcontrol_internal(sb, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
int
sbcompress(sb, m, n)
	register struct sockbuf *sb;
	register struct mbuf *m, *n;
{
	register int eor = 0;
	register struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= M_EOR;
		else
			printf("semi-panic: sbcompress\n");
	}
	postevent(0, sb, EV_RWBYTES);
	return 1;
}
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(sb)
	register struct sockbuf *sb;
{
	if (sb->sb_so == NULL)
		panic("sbflush sb->sb_so already null sb=%x\n", sb);
	(void)sblock(sb, M_WAIT);
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever. Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_so == NULL)
		panic("sbflush: cc %ld || mb %p || mbcnt %ld sb_so=%x",
		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt, sb->sb_so);

	postevent(0, sb, EV_RWBYTES);
	sbunlock(sb, 1);	/* keep socket locked */
}
/*
 * Drop data from (the front of) a sockbuf.
 * Use m_freem_list to free the mbuf structures
 * under a single lock... this is done by pruning
 * the top of the tree from the body by keeping track
 * of where we get to in the tree and then zeroing the
 * two pertinent pointers m_nextpkt and m_next.
 * The socket buffer is then updated to point at the new
 * top of the tree and the pruned area is released via
 * m_freem_list.
 */
void
sbdrop(sb, len)
	register struct sockbuf *sb;
	register int len;
{
	register struct mbuf *m, *free_list, *ml;
	struct mbuf *next, *last;

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	free_list = last = m;
	ml = (struct mbuf *)0;

	while (len > 0) {
		if (m == 0) {
			if (next == 0) {
				/*
				 * Temporarily replacing this panic with a
				 * printf: it occurs occasionally when closing
				 * a socket, and there is no harm in ignoring
				 * it.  This problem will be investigated
				 * further.
				 */
				/* panic("sbdrop"); */
				printf("sbdrop - count not zero\n");
				len = 0;
				/*
				 * zero the counts. if we have no mbufs,
				 * we have no data (PR-2986815)
				 */
				sb->sb_cc = 0;
				sb->sb_mbcnt = 0;
				break;
			}
			m = last = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	if (ml) {
		ml->m_next = (struct mbuf *)0;
		last->m_nextpkt = (struct mbuf *)0;
		m_freem_list(free_list);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;

	postevent(0, sb, EV_RWBYTES);

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
}
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
			m = mn;
		} while (m);
	}
	postevent(0, sb, EV_RWBYTES);
}
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(p, size, type, level)
	caddr_t p;
	register int size;
	int type, level;
{
	register struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN)
		return ((struct mbuf *) NULL);
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	/* XXX check size? */
	(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
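
/*
 * Illustrative sketch (not built): wrapping a payload in a control mbuf.
 * The SOL_SOCKET/SCM_TIMESTAMP pairing is just one example of a level/type
 * combination a protocol might present to soreceive().
 */
#if 0
static struct mbuf *
example_make_timestamp_control(struct timeval *tv)
{
	/* Returns NULL if CMSG_SPACE(sizeof (*tv)) exceeds MLEN. */
	return (sbcreatecontrol((caddr_t)tv, sizeof (*tv),
	    SCM_TIMESTAMP, SOL_SOCKET));
}
#endif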
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_abort_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
		    struct ifnet *ifp, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_detach_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{
	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
		 struct sockaddr *addr, struct mbuf *control,
		 struct proc *p)
{
	return EOPNOTSUPP;
}

/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}

int	pru_sosend_notsupp(struct socket *so, struct sockaddr *addr,
		   struct uio *uio, struct mbuf *top,
		   struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int	pru_soreceive_notsupp(struct socket *so,
			 struct sockaddr **paddr,
			 struct uio *uio, struct mbuf **mp0,
			 struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_shutdown_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int	pru_sosend(struct socket *so, struct sockaddr *addr,
		   struct uio *uio, struct mbuf *top,
		   struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int	pru_soreceive(struct socket *so,
		      struct sockaddr **paddr,
		      struct uio *uio, struct mbuf **mp0,
		      struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_sopoll_notsupp(__unused struct socket *so, __unused int events,
		   __unused kauth_cred_t cred, __unused void *wql)
{
	return EOPNOTSUPP;
}
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */
int
sb_notify(struct sockbuf *sb)
{
	return ((sb->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)) != 0);
}

/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematical if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.  Should use "lmin" but it doesn't exist now.
 */
long
sbspace(struct sockbuf *sb)
{
	return ((long)imin((int)(sb->sb_hiwat - sb->sb_cc),
	    (int)(sb->sb_mbmax - sb->sb_mbcnt)));
}
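
/*
 * Illustrative sketch (not built): sbspace() is the minimum of two
 * headrooms -- bytes (hiwat - cc) and mbuf accounting (mbmax - mbcnt).
 * For example, hiwat 8192 with 5000 queued and mbmax 65536 with 4096
 * charged yields min(3192, 61440) = 3192 bytes of space.
 */
#if 0
static long
example_space(u_long hiwat, u_long cc, u_long mbmax, u_long mbcnt)
{
	return ((long)imin((int)(hiwat - cc), (int)(mbmax - mbcnt)));
}
#endif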
/* do we have to send all at once on a socket? */
int
sosendallatonce(struct socket *so)
{
	return (so->so_proto->pr_flags & PR_ATOMIC);
}

/* can we read something from so? */
int
soreadable(struct socket *so)
{
	return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) ||
	    so->so_comp.tqh_first || so->so_error);
}

/* can we write something to so? */
int
sowriteable(struct socket *so)
{
	return ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) ||
	    (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) ||
	    so->so_error);
}
/* adjust counters in sb reflecting allocation of m */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc += m->m_len;
	sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}

/* adjust counters in sb reflecting freeing of m */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc -= m->m_len;
	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	if (sb->sb_flags & SB_LOCK)
		return ((wf == M_WAIT) ? sb_lock(sb) : EWOULDBLOCK);
	sb->sb_flags |= SB_LOCK;
	return (0);
}
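
/*
 * Illustrative sketch (not built): the sblock()/sbunlock() pairing a
 * caller uses to serialize access to a socket buffer, as sbflush() does
 * above.
 */
#if 0
static int
example_locked_drain(struct sockbuf *sb)
{
	int error;

	if ((error = sblock(sb, M_WAIT)) != 0)
		return (error);		/* interrupted before taking the lock */
	sbdrop(sb, (int)sb->sb_cc);	/* work while holding SB_LOCK */
	sbunlock(sb, 1);		/* 1: keep the socket mutex held */
	return (0);
}
#endif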
/* release lock on sockbuf sb */
void
sbunlock(struct sockbuf *sb, int keeplocked)
{
	struct socket *so = sb->sb_so;
	int lr, lr_saved;
	lck_mtx_t *mutex_held;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif
	sb->sb_flags &= ~SB_LOCK;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	if (keeplocked == 0)
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		if (so->so_usecount < 0)
			panic("sbunlock: b4 wakeup so=%x ref=%d lr=%x sb_flags=%x\n",
			    sb->sb_so, so->so_usecount, lr_saved, sb->sb_flags);

		wakeup((caddr_t)&(sb)->sb_flags);
	}
	if (keeplocked == 0) {	/* unlock on exit */
		so->so_usecount--;
		if (so->so_usecount < 0)
			panic("sbunlock: unlock on exit so=%x ref=%d lr=%x sb_flags=%x\n",
			    so, so->so_usecount, lr_saved, sb->sb_flags);
		so->reserved4 = lr_saved;
		lck_mtx_unlock(mutex_held);
	}
}
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv);
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd);
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(sa, canwait)
	struct sockaddr *sa;
	int canwait;
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	if (so->so_proto) {
		xso->xso_protocol = so->so_proto->pr_protocol;
		xso->xso_family = so->so_proto->pr_domain->dom_family;
	} else
		xso->xso_protocol = xso->xso_family = 0;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_uid;
}
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = (u_long)(sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick;
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0)
		xsb->sb_timeo = 1;
}
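
/*
 * Illustrative sketch (not built): the timeval-to-ticks conversion above.
 * Assuming hz = 100 (tick = 10000 us), a timeout of 0 s / 5000 us computes
 * to 0 ticks and is rounded up to 1 so a sub-tick timeout is not reported
 * as "no timeout".  The hz/tick values are platform configuration.
 */
#if 0
static u_long
example_timeo_ticks(struct timeval tv)
{
	u_long ticks = (u_long)(tv.tv_sec * hz) + tv.tv_usec / tick;

	if (ticks == 0 && tv.tv_usec != 0)
		ticks = 1;	/* round sub-tick timeouts up to one tick */
	return (ticks);
}
#endif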
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
    &sb_max, 0, "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
    &maxsockets, 0, "Maximum number of sockets available");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "");