/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.9 2001/07/26 18:53:02 peter Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/ev.h>
#include <kern/locks.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <sys/kdebug.h>
#define DBG_FNC_SBDROP		NETDBG_CODE(DBG_NETSOCK, 4)
#define DBG_FNC_SBAPPEND	NETDBG_CODE(DBG_NETSOCK, 5)
/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* XXX should be static */

static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
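/*
 * A minimal user-level sketch of the two-queue handoff described above,
 * using the same <sys/queue.h> macros the kernel uses.  The struct and
 * queue names here are invented for illustration; in the kernel the
 * queues are so_incomp/so_comp and the entry field is so_list.
 */
#if 0
#include <sys/queue.h>

struct conn {
	TAILQ_ENTRY(conn) c_list;		/* plays the role of so_list */
};
static TAILQ_HEAD(connq, conn) incomp = TAILQ_HEAD_INITIALIZER(incomp);
static struct connq comp = TAILQ_HEAD_INITIALIZER(comp);

/* roughly what soisconnected() does for a queued passive connection */
static void
example_promote(struct conn *c)
{
	TAILQ_REMOVE(&incomp, c, c_list);	/* leave the in-progress queue */
	TAILQ_INSERT_TAIL(&comp, c, c_list);	/* now visible to accept() */
}
#endif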
void
soisconnecting(so)
	register struct socket *so;
{

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
void
soisconnected(so)
	struct socket *so;
{
	struct socket *head = so->so_head;

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	sflt_notify(so, sock_evt_connected, NULL);

	if (head && (so->so_state & SS_INCOMP)) {
		so->so_state &= ~SS_INCOMP;
		so->so_state |= SS_COMP;
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(so, 0);
			socket_lock(head, 1);
		}
		postevent(head, 0, EV_RCONN);
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		sorwakeup(head);
		wakeup_one((caddr_t)&head->so_timeo);
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(head, 1);
			socket_lock(so, 0);
		}
	} else {
		postevent(so, 0, EV_WCONN);
		wakeup((caddr_t)&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}
void
soisdisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
void
soisdisconnected(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
/*
 * Return a random connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * This may be used in conjunction with protocol specific queue
 * congestion routines.
 */
struct socket *
sodropablereq(head)
	register struct socket *head;
{
	struct socket *so, *sonext = NULL;
	unsigned int i, j, qlen;
	static int rnd;
	static struct timeval old_runtime;
	static unsigned int cur_cnt, old_cnt;
	struct timeval tv;

	microtime(&tv);
	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
		old_runtime = tv;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	so = TAILQ_FIRST(&head->so_incomp);
	if (!so)
		return (NULL);

	qlen = head->so_incqlen;
	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so) {
//			if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				socket_lock(so, 1);
				sonext = TAILQ_NEXT(so, so_list);
//				in_pcb_check_state(so->so_pcb, WNT_RELEASE, 0);
				socket_unlock(so, 1);
				so = sonext;
//			}
		}
	}

//	if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) == WNT_STOPUSING)
//		return (NULL);
//	else
		return (so);
}
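/*
 * A standalone sketch of the fixed-point random pick used above: rnd is
 * a 16-bit linear congruential generator, and ((qlen + 1) * rnd) >> 16
 * maps it to a roughly uniform index in [0, qlen].  About one pick in
 * qlen + 1 walks past the last entry and yields a null, which is the
 * "no droppable requests" case described in the comment.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned int rnd = 0, qlen = 10, i;

	for (i = 0; i < 5; i++) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		printf("pick index %u of 0..%u\n",
		    ((qlen + 1) * rnd) >> 16, qlen);
	}
	return 0;
}
#endif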
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 */
static struct socket *
sonewconn_internal(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	int error = 0;
	register struct socket *so;
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock != NULL)
		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
	else
		mutex_held = head->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (head->so_qlen > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = soalloc(1, head->so_proto->pr_domain->dom_family, head->so_type);
	if (so == NULL)
		return ((struct socket *)0);
	/* check if head was closed during the soalloc */
	if (head->so_proto == NULL) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_uid = head->so_uid;
	so->so_usecount = 1;

	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	/*
	 * Must be done with head unlocked to avoid deadlock with pcb list
	 */
	socket_unlock(head, 0);
	if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) {
		sodealloc(so);
		socket_lock(head, 0);
		return ((struct socket *)0);
	}
	socket_lock(head, 0);

	so->so_proto->pr_domain->dom_refs++;

	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_state |= SS_COMP;
	} else {
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_state |= SS_INCOMP;
		head->so_incqlen++;
	}
	head->so_qlen++;

	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);

	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
	}
	return (so);
}
struct socket *
sonewconn(
	struct socket *head,
	int connstatus,
	const struct sockaddr *from)
{
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	for (filter = head->so_filt; filter && (error == 0);
	     filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_connect_in) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(head);
				socket_unlock(head, 0);
			}
			error = filter->sfe_filter->sf_filter.sf_connect_in(
				filter->sfe_cookie, head, from);
		}
	}
	if (filtered != 0) {
		socket_lock(head, 0);
		sflt_unuse(head);
	}

	if (error) {
		return NULL;
	}

	return sonewconn_internal(head, connstatus);
}
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
void
socantsendmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTSENDMORE;
	sflt_notify(so, sock_evt_cantsendmore, NULL);
	sowwakeup(so);
}

void
socantrcvmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTRCVMORE;
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
	sorwakeup(so);
}
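/*
 * From user space, shutdown(2) with SHUT_WR is the usual way to reach
 * the socantsendmore() path described above; data already queued from
 * the peer can still be read afterwards.  A minimal sketch:
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void
example_half_close(int fd)
{
	char buf[512];

	shutdown(fd, SHUT_WR);			/* no more data will be sent */
	while (read(fd, buf, sizeof(buf)) > 0)
		;				/* ...but we may keep reading */
	close(fd);
}
#endif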
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(sb)
	struct sockbuf *sb;
{
	int error = 0, lr, lr_saved;
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	struct timespec ts;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	sb->sb_flags |= SB_WAIT;

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);
	ts.tv_sec = sb->sb_timeo.tv_sec;
	ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
	error = msleep((caddr_t)&sb->sb_cc, mutex_held,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    &ts);

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);

	if ((so->so_state & SS_DRAINING)) {
		error = EBADF;
	}

	return (error);
}
/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(sb)
	register struct sockbuf *sb;
{
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	int error = 0, lr, lr_saved;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so == NULL)
		panic("sb_lock: null so back pointer sb=%x\n", sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		if (so->so_proto->pr_getlock != NULL)
			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		else
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		if (so->so_usecount < 1)
			panic("sb_lock: so=%x refcount=%d\n", so, so->so_usecount);
		error = msleep((caddr_t)&sb->sb_flags, mutex_held,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sblock", 0);
		if (so->so_usecount < 1)
			panic("sb_lock: 2 so=%x refcount=%d\n", so, so->so_usecount);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(so, sb)
	register struct socket *so;
	register struct sockbuf *sb;
{
	struct proc *p = current_proc();

	sb->sb_flags &= ~SB_SEL;
	selwakeup(&sb->sb_sel);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC) {
		if (so->so_pgid < 0)
			gsignal(-so->so_pgid, SIGIO);
		else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
			psignal(p, SIGIO);
	}
	if (sb->sb_flags & SB_KNOTE) {
		KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
	}
	if (sb->sb_flags & SB_UPCALL) {
		socket_unlock(so, 0);
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
		socket_lock(so, 0);
	}
}
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field. The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
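/*
 * A sketch of walking the record structure described above: records are
 * chained through m_nextpkt, and the mbufs making up one record are
 * chained through m_next, so tallying the buffered data is a double loop.
 */
#if 0
static u_long
example_count_bytes(struct sockbuf *sb)
{
	struct mbuf *record, *m;
	u_long len = 0;

	for (record = sb->sb_mb; record; record = record->m_nextpkt)
		for (m = record; m; m = m->m_next)
			len += m->m_len;	/* name, rights, and data mbufs alike */
	return (len);
}
#endif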
int
soreserve(so, sndcc, rcvcc)
	register struct socket *so;
	u_long sndcc, rcvcc;
{

	if (sbreserve(&so->so_snd, sndcc) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
bad:
	return (ENOBUFS);
}
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(sb, cc)
	struct sockbuf *sb;
	u_long cc;
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
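/*
 * Worked numbers for the limits above, assuming the common values
 * MSIZE = 256, MCLBYTES = 2048, and sb_max = 262144: the cap allows
 * cc up to 262144 * 2048 / 2304 (about 233016 bytes), and a 64 KB
 * reservation gets sb_mbmax = min(65536 * 8, 262144) = 262144.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned long long sb_max = 262144, msize = 256, mclbytes = 2048;
	unsigned long long cc = 65536, sb_efficiency = 8;
	unsigned long long mbmax = cc * sb_efficiency;

	printf("max cc: %llu\n", sb_max * mclbytes / (msize + mclbytes));
	printf("sb_mbmax: %llu\n", mbmax < sb_max ? mbmax : sb_max);
	return 0;
}
#endif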
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/* WARNING needs to do selthreadclear() before calling this */
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and the data is then removed from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
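/*
 * A sketch of the send-buffer convention described above, as a
 * hypothetical reliable protocol might apply it.  proto_output() is an
 * invented stand-in for a protocol's real output routine; the sbspace()
 * check, m_copy() for transmission, and sbdrop() on acknowledgement are
 * the conventions this comment documents.
 */
#if 0
static int
example_send(struct socket *so, struct mbuf *m, int len)
{
	if (sbspace(&so->so_snd) < len)
		return (ENOBUFS);		/* caller must wait and retry */
	sbappend(&so->so_snd, m);		/* hold for retransmission */
	proto_output(so, m_copy(so->so_snd.sb_mb, 0, len));	/* hypothetical */
	return (0);
}

static void
example_acked(struct socket *so, int acked)
{
	sbdrop(&so->so_snd, acked);		/* release acknowledged bytes */
	sowwakeup(so);				/* blocked writers may proceed */
}
#endif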
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
int
sbappend(sb, m)
	struct sockbuf *sb;
	struct mbuf *m;
{
	register struct mbuf *n, *sb_first;
	int result = 0;
	int error = 0;
	int filtered = 0;

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_START), sb, m->m_len, 0, 0, 0);

	if (m == 0)
		return 0;

again:
	sb_first = n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				result = sbappendrecord(sb, m); /* XXXXXX!!!! */
				KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);
				return result;
			}
		} while (n->m_next && (n = n->m_next));
	}

	if (!filtered && (sb->sb_flags & SB_RECV) != 0) {
		error = sflt_data_in(sb->sb_so, NULL, &m, NULL, 0, &filtered);
		if (error) {
			/* no data was appended, caller should not call sowakeup */
			return 0;
		}

		/*
		 * If we ran any filters, the socket lock was dropped.  n and
		 * sb_first cached data from the socket buffer.  That cache is
		 * not valid since we dropped the lock, so we must start over.
		 * Since filtered is set we won't run through the filters a
		 * second time; we just set n and sb_first again.
		 */
		if (filtered)
			goto again;
	}

	result = sbcompress(sb, m, n);

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);

	return result;
}
void
sbcheck(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m;
	register struct mbuf *n = 0;
	register u_long len = 0, mbcnt = 0;
	lck_mtx_t *mutex_held;

	if (sb->sb_so->so_proto->pr_getlock != NULL)
		mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
	else
		mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
	}
}
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
int
sbappendrecord(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_record, NULL);
		if (error != 0) {
			if (error != EJUSTRETURN)
				m_freem(m0);
			return 0;
		}
	}

	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
int
sbinsertoob(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;
	register struct mbuf **mp;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_oob, NULL);

		if (error) {
			if (error != EJUSTRETURN) {
				m_freem(m0);
			}
			return 0;
		}
	}

	for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
		m = *mp;
	again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			m = m->m_next;
			if (m)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
static int
sbappendaddr_internal(sb, asa, m0, control)
	register struct sockbuf *sb;
	struct sockaddr *asa;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");

	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = m;
	} else
		sb->sb_mb = m;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendaddr(
	struct sockbuf *sb,
	struct sockaddr *asa,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrorfree");

	/* Call socket data in filters */
	if ((sb->sb_flags & SB_RECV) != 0) {
		int error;
		error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendaddr_internal(sb, asa, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
static int
sbappendcontrol_internal(sb, m0, control)
	struct sockbuf *sb;
	struct mbuf *control, *m0;
{
	register struct mbuf *m, *n;
	int space = 0;

	if (control == 0)
		panic("sbappendcontrol");

	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */
	for (m = control; m; m = m->m_next)
		sballoc(sb, m);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = control;
	} else
		sb->sb_mb = control;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendcontrol(
	struct sockbuf *sb,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (sb->sb_flags & SB_RECV) {
		int error;
		error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendcontrol_internal(sb, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
static int
sbcompress(sb, m, n)
	register struct sockbuf *sb;
	register struct mbuf *m, *n;
{
	register int eor = 0;
	register struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	postevent(0, sb, EV_RWBYTES);
	return 1;
}
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(sb)
	register struct sockbuf *sb;
{
	if (sb->sb_so == NULL)
		panic("sbflush sb->sb_so already null sb=%x\n", sb);
	(void)sblock(sb, M_WAIT);
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever.  Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_so == NULL)
		panic("sbflush: cc %ld || mb %p || mbcnt %ld sb_so=%x",
		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt, sb->sb_so);

	postevent(0, sb, EV_RWBYTES);
	sbunlock(sb, 1);	/* keep socket locked */
}
/*
 * Drop data from (the front of) a sockbuf.
 * use m_freem_list to free the mbuf structures
 * under a single lock... this is done by pruning
 * the top of the tree from the body by keeping track
 * of where we get to in the tree and then zeroing the
 * two pertinent pointers m_nextpkt and m_next
 * the socket buffer is then updated to point at the new
 * top of the tree and the pruned area is released via
 * m_freem_list.
 */
void
sbdrop(sb, len)
	register struct sockbuf *sb;
	register int len;
{
	register struct mbuf *m, *free_list, *ml;
	struct mbuf *next, *last;

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	free_list = last = m;
	ml = (struct mbuf *)0;

	while (len > 0) {
		if (m == 0) {
			if (next == 0) {
				/*
				 * Temporarily replacing this panic with printf
				 * because it occurs occasionally when closing
				 * a socket and there is no harm in ignoring it.
				 * This problem will be investigated further.
				 */
				/* panic("sbdrop"); */
				printf("sbdrop - count not zero\n");
				len = 0;
				/* zero the counts. if we have no mbufs, we have no data (PR-2986815) */
				sb->sb_cc = 0;
				sb->sb_mbcnt = 0;
				break;
			}
			m = last = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	if (ml) {
		ml->m_next = (struct mbuf *)0;
		last->m_nextpkt = (struct mbuf *)0;
		m_freem_list(free_list);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;

	postevent(0, sb, EV_RWBYTES);

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
}
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
			m = mn;
		} while (m);
	}
	postevent(0, sb, EV_RWBYTES);
}
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(p, size, type, level)
	caddr_t p;
	register int size;
	int type, level;
{
	register struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN)
		return ((struct mbuf *) NULL);
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	/* XXX check size? */
	(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
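/*
 * A sketch of the usual calling pattern: a protocol input path builds a
 * control mbuf and appends it with the payload.  The TTL delivery shown
 * here is invented for illustration; the cmsg level/type pair would be
 * whatever the protocol actually advertises.
 */
#if 0
static void
example_deliver(struct socket *so, struct mbuf *payload, u_char ttl)
{
	struct mbuf *control;

	control = sbcreatecontrol((caddr_t)&ttl, sizeof (ttl),
	    IP_RECVTTL, IPPROTO_IP);
	if (control != NULL)
		sbappendcontrol(&so->so_rcv, payload, control, NULL);
}
#endif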
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_abort_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
		    struct ifnet *ifp, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_detach_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{
	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
		 struct sockaddr *addr, struct mbuf *control,
		 struct proc *p)
{
	return EOPNOTSUPP;
}

/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}

int
pru_sosend_notsupp(struct socket *so, struct sockaddr *addr,
		   struct uio *uio, struct mbuf *top,
		   struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int
pru_soreceive_notsupp(struct socket *so,
		      struct sockaddr **paddr,
		      struct uio *uio, struct mbuf **mp0,
		      struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_shutdown_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_sosend(struct socket *so, struct sockaddr *addr,
	   struct uio *uio, struct mbuf *top,
	   struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int
pru_soreceive(struct socket *so,
	      struct sockaddr **paddr,
	      struct uio *uio, struct mbuf **mp0,
	      struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_sopoll_notsupp(__unused struct socket *so, __unused int events,
		   __unused kauth_cred_t cred, __unused void *wql)
{
	return EOPNOTSUPP;
}
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */
int
sb_notify(struct sockbuf *sb)
{
	return ((sb->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)) != 0);
}
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematical if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.  Should use "lmin" but it doesn't exist now.
 */
long
sbspace(struct sockbuf *sb)
{
	return ((long) imin((int)(sb->sb_hiwat - sb->sb_cc),
	    (int)(sb->sb_mbmax - sb->sb_mbcnt)));
}
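/*
 * Worked example of the "negative space" case the comment above warns
 * about: if sb_cc temporarily exceeds sb_hiwat (say cc = 3000 against
 * hiwat = 2048), the unsigned difference would be enormous, but the
 * cast to int recovers -952, so sbspace() reports no room rather than
 * a bogus large value.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned long hiwat = 2048, cc = 3000;

	printf("unsigned: %lu  signed: %d\n", hiwat - cc, (int)(hiwat - cc));
	return 0;
}
#endif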
/* do we have to send all at once on a socket? */
int
sosendallatonce(struct socket *so)
{
	return (so->so_proto->pr_flags & PR_ATOMIC);
}
/* can we read something from so? */
int
soreadable(struct socket *so)
{
	return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) ||
	    so->so_comp.tqh_first || so->so_error);
}
/* can we write something to so? */
int
sowriteable(struct socket *so)
{
	return ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) ||
	    (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) ||
	    so->so_error);
}
/* adjust counters in sb reflecting allocation of m */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc += m->m_len;
	sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}
/* adjust counters in sb reflecting freeing of m */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc -= m->m_len;
	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	if (sb->sb_flags & SB_LOCK)
		return ((wf == M_WAIT) ? sb_lock(sb) : EWOULDBLOCK);
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/* release lock on sockbuf sb */
void
sbunlock(struct sockbuf *sb, int keeplocked)
{
	struct socket *so = sb->sb_so;
	int lr, lr_saved;
	lck_mtx_t *mutex_held;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif
	sb->sb_flags &= ~SB_LOCK;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	if (keeplocked == 0)
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		if (so->so_usecount < 0)
			panic("sbunlock: b4 wakeup so=%x ref=%d lr=%x sb_flags=%x\n",
			    sb->sb_so, so->so_usecount, lr_saved, sb->sb_flags);

		wakeup((caddr_t)&(sb)->sb_flags);
	}
	if (keeplocked == 0) {	/* unlock on exit */
		so->so_usecount--;
		if (so->so_usecount < 0)
			panic("sbunlock: unlock on exit so=%x lr=%x sb_flags=%x\n",
			    so, so->so_usecount, lr_saved, sb->sb_flags);
		so->reserved4 = lr_saved;
		lck_mtx_unlock(mutex_held);
	}
}
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv);
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd);
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(sa, canwait)
	struct sockaddr *sa;
	int canwait;
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	if (so->so_proto) {
		xso->xso_protocol = so->so_proto->pr_protocol;
		xso->xso_family = so->so_proto->pr_domain->dom_family;
	} else
		xso->xso_protocol = xso->xso_family = 0;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_uid;
}
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = (u_long)(sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick;
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0)
		xsb->sb_timeo = 1;
}
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
    &sb_max, 0, "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
    &maxsockets, 0, "Maximum number of sockets available");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "");