/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.9 2001/07/26 18:53:02 peter Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/ev.h>
#include <kern/locks.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <sys/kdebug.h>

#define DBG_FNC_SBDROP		NETDBG_CODE(DBG_NETSOCK, 4)
#define DBG_FNC_SBAPPEND	NETDBG_CODE(DBG_NETSOCK, 5)

static int sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);

/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* XXX should be static */

static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
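/*
 * Illustrative sketch (not part of the original flow): how a
 * connection-oriented protocol typically drives the transitions described
 * above.  The function name example_proto_event() and the event numbering
 * are hypothetical; only soisconnecting(), soisconnected(),
 * soisdisconnecting() and soisdisconnected() are real entry points here.
 */
#if 0
static void
example_proto_event(struct socket *so, int event)
{
	switch (event) {
	case 0:		/* user called connect(); handshake started */
		soisconnecting(so);
		break;
	case 1:		/* handshake completed; wakes connect()/accept() waiters */
		soisconnected(so);
		break;
	case 2:		/* local close in progress */
		soisdisconnecting(so);
		break;
	case 3:		/* connection fully torn down */
		soisdisconnected(so);
		break;
	}
}
#endif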
void
soisconnecting(so)
	register struct socket *so;
{

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
void
soisconnected(so)
	register struct socket *so;
{
	struct socket *head = so->so_head;

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	sflt_notify(so, sock_evt_connected, NULL);

	if (head && (so->so_state & SS_INCOMP)) {
		so->so_state &= ~SS_INCOMP;
		so->so_state |= SS_COMP;
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(so, 0);
			socket_lock(head, 1);
		}
		postevent(head, 0, EV_RCONN);
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		sorwakeup(head);
		wakeup_one((caddr_t)&head->so_timeo);
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(head, 1);
			socket_lock(so, 0);
		}
	} else {
		postevent(so, 0, EV_WCONN);
		wakeup((caddr_t)&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}
void
soisdisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
void
soisdisconnected(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
/*
 * Return a random connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * This may be used in conjunction with protocol specific queue
 * congestion routines.
 */
struct socket *
sodropablereq(head)
	register struct socket *head;
{
	struct socket *so, *sonext = NULL;
	unsigned int i, j, qlen;
	static int rnd;
	static struct timeval old_runtime;
	static unsigned int cur_cnt, old_cnt;
	struct timeval tv;

	microtime(&tv);
	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
		old_runtime = tv;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	so = TAILQ_FIRST(&head->so_incomp);
	if (!so)
		return (NULL);

	qlen = head->so_incqlen;
	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so) {
//			if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				socket_lock(so, 1);
				sonext = TAILQ_NEXT(so, so_list);
//				in_pcb_check_state(so->so_pcb, WNT_RELEASE, 0);
				socket_unlock(so, 1);
				so = sonext;
//			}
		}
	}

//	if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) == WNT_STOPUSING)
//		return (NULL);
//	else
		return (so);
}
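/*
 * Worked example of the index computation above (illustrative only): rnd
 * is advanced with a 16-bit linear congruential step and then scaled into
 * the range [0, qlen].  With qlen = 9 and rnd = 0x8000, the candidate index
 * is ((9 + 1) * 0x8000) >> 16 = 5; an index that walks past the last entry
 * yields NULL, which is the "no droppable request" case described above.
 * The helper name below is hypothetical.
 */
#if 0
static unsigned int
example_drop_index(unsigned int qlen, int *rndp)
{
	/* same recurrence as sodropablereq(): 16-bit LCG, then scale */
	*rndp = (314159 * (*rndp) + 66329) & 0xffff;
	return (((qlen + 1) * (unsigned int)*rndp) >> 16);	/* in [0, qlen] */
}
#endif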
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 */
static struct socket *
sonewconn_internal(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	int error = 0;
	register struct socket *so;
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock != NULL)
		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
	else
		mutex_held = head->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (head->so_qlen > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = soalloc(1, head->so_proto->pr_domain->dom_family, head->so_type);
	if (so == NULL)
		return ((struct socket *)0);
	/* check if head was closed during the soalloc */
	if (head->so_proto == NULL) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid = head->so_pgid;
	so->so_uid = head->so_uid;
	so->so_usecount = 1;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

	so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	TAILQ_INIT(&so->so_evlist);

	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	/*
	 * Must be done with head unlocked to avoid deadlock for protocol with per socket mutexes.
	 */
	if (head->so_proto->pr_unlock)
		socket_unlock(head, 0);
	if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) {
		sodealloc(so);
		if (head->so_proto->pr_unlock)
			socket_lock(head, 0);
		return ((struct socket *)0);
	}
	if (head->so_proto->pr_unlock)
		socket_lock(head, 0);
	so->so_proto->pr_domain->dom_refs++;

	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_state |= SS_COMP;
	} else {
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_state |= SS_INCOMP;
		head->so_incqlen++;
	}
	head->so_qlen++;

	/* Attach socket filters for this protocol */
	sflt_initsock(so);

	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
	}
	return (so);
}
struct socket *
sonewconn(
	struct socket *head,
	int connstatus,
	const struct sockaddr *from)
{
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	for (filter = head->so_filt; filter && (error == 0);
	     filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_connect_in) {
			if (filtered == 0) {
				filtered = 1;
				socket_unlock(head, 0);
			}
			error = filter->sfe_filter->sf_filter.sf_connect_in(
				filter->sfe_cookie, head, from);
		}
	}
	if (filtered != 0) {
		socket_lock(head, 0);
	}

	if (error) {
		return NULL;
	}

	return sonewconn_internal(head, connstatus);
}
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
void
socantsendmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTSENDMORE;
	sflt_notify(so, sock_evt_cantsendmore, NULL);
	sowwakeup(so);
}

void
socantrcvmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTRCVMORE;
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
	sorwakeup(so);
}
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(sb)
	struct sockbuf *sb;
{
	int error = 0, lr_saved;
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	struct timespec ts;

	lr_saved = (unsigned int) __builtin_return_address(0);

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	sb->sb_flags |= SB_WAIT;

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);
	ts.tv_sec = sb->sb_timeo.tv_sec;
	ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
	error = msleep((caddr_t)&sb->sb_cc, mutex_held,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    &ts);

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);

	if ((so->so_state & SS_DRAINING)) {
		error = EBADF;
	}

	return (error);
}
/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(sb)
	register struct sockbuf *sb;
{
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	int error = 0;

	if (so == NULL)
		panic("sb_lock: null so back pointer sb=%x\n", sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		if (so->so_proto->pr_getlock != NULL)
			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		else
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		if (so->so_usecount < 1)
			panic("sb_lock: so=%x refcount=%d\n", so, so->so_usecount);

		error = msleep((caddr_t)&sb->sb_flags, mutex_held,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sblock", 0);
		if (so->so_usecount < 1)
			panic("sb_lock: 2 so=%x refcount=%d\n", so, so->so_usecount);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(so, sb)
	register struct socket *so;
	register struct sockbuf *sb;
{
	struct proc *p = current_proc();

	sb->sb_flags &= ~SB_SEL;
	selwakeup(&sb->sb_sel);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC) {
		if (so->so_pgid < 0)
			gsignal(-so->so_pgid, SIGIO);
		else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
			psignal(p, SIGIO);
	}
	if (sb->sb_flags & SB_KNOTE) {
		KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
	}
	if (sb->sb_flags & SB_UPCALL) {
		socket_unlock(so, 0);
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
		socket_lock(so, 0);
	}
}
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
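/*
 * Illustrative sketch of the reservation step described above (the attach
 * routine name and the 8K/16K sizes are hypothetical examples, not values
 * taken from this file): a protocol attach path commits send and receive
 * space with soreserve() before any data can be queued.
 */
#if 0
static int
example_proto_attach(struct socket *so)
{
	int error;

	error = soreserve(so, 8 * 1024, 16 * 1024);
	if (error)
		return (error);		/* ENOBUFS: request exceeded the sb_max scaling */

	/* ... allocate and link the protocol control block here ... */
	return (0);
}
#endif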
int
soreserve(so, sndcc, rcvcc)
	register struct socket *so;
	u_long sndcc, rcvcc;
{

	if (sbreserve(&so->so_snd, sndcc) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
bad:
	return (ENOBUFS);
}
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(sb, cc)
	struct sockbuf *sb;
	u_long cc;
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
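/*
 * Worked example of the scaling above (illustrative numbers): with
 * sb_efficiency = 8 and a request of cc = 8192 bytes, sb_mbmax becomes
 * min(8192 * 8, sb_max) = 65536, so roughly 8x the byte limit is allowed
 * in mbuf overhead before mbcnt, rather than cc, becomes the limiting
 * factor.  The request itself is rejected only if cc exceeds
 * sb_max * MCLBYTES / (MSIZE + MCLBYTES).
 */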
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/*  WARNING needs to do selthreadclear() before calling this */
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
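/*
 * Illustrative sketch of the datagram receive path described above (the
 * function name is hypothetical): the protocol hands the sender's address,
 * the data chain and optional control mbufs to sbappendaddr(), which builds
 * one new record and fails cleanly if the buffer is full.
 */
#if 0
static void
example_deliver_datagram(struct socket *so, struct sockaddr *from,
	struct mbuf *data, struct mbuf *control)
{
	int error = 0;

	if (sbappendaddr(&so->so_rcv, from, data, control, &error) != 0)
		sorwakeup(so);	/* a record was queued; wake any readers */
	/* on failure the mbufs have already been freed and error is set */
}
#endif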
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * with the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
int
sbappend(sb, m)
	struct sockbuf *sb;
	struct mbuf *m;
{
	register struct mbuf *n, *sb_first;
	int result = 0;
	int error = 0;
	int filtered = 0;

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_START), sb, m->m_len, 0, 0, 0);

	if (m == 0)
		return 0;

again:
	sb_first = n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				result = sbappendrecord(sb, m); /* XXXXXX!!!! */
				KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);
				return result;
			}
		} while (n->m_next && (n = n->m_next));
	}

	if (!filtered && (sb->sb_flags & SB_RECV) != 0) {
		error = sflt_data_in(sb->sb_so, NULL, &m, NULL, 0, &filtered);
		if (error) {
			/* no data was appended, caller should not call sowakeup */
			return 0;
		}

		/*
		 * If we ran any filters, the socket lock was dropped.  n and sb_first
		 * cached data from the socket buffer.  This cache is not valid
		 * since we dropped the lock.  We must start over.  Since filtered
		 * is set we won't run through the filters a second time.  We just
		 * set n and sb_first again.
		 */
		if (filtered)
			goto again;
	}

	result = sbcompress(sb, m, n);

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);

	return result;
}
void
sbcheck(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m;
	register struct mbuf *n = 0;
	register u_long len = 0, mbcnt = 0;
	lck_mtx_t *mutex_held;

	if (sb->sb_so->so_proto->pr_getlock != NULL)
		mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
	else
		mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
	}
}
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
int
sbappendrecord(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_record, NULL);
		if (error != 0) {
			if (error != EJUSTRETURN)
				m_freem(m0);
			return 0;
		}
	}

	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
int
sbinsertoob(sb, m0)
	struct sockbuf *sb;
	struct mbuf *m0;
{
	struct mbuf *m;
	struct mbuf **mp;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_oob, NULL);

		if (error) {
			if (error != EJUSTRETURN) {
				m_freem(m0);
			}
			return 0;
		}
	}

	for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
		m = *mp;
	    again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			m = m->m_next;
			if (m)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
static int
sbappendaddr_internal(sb, asa, m0, control)
	register struct sockbuf *sb;
	struct sockaddr *asa;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");

	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = m;
	} else
		sb->sb_mb = m;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendaddr(
	struct sockbuf *sb,
	struct sockaddr *asa,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrorfree");

	/* Call socket data in filters */
	if ((sb->sb_flags & SB_RECV) != 0) {
		int error;
		error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendaddr_internal(sb, asa, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
static int
sbappendcontrol_internal(sb, m0, control)
	struct sockbuf *sb;
	struct mbuf *control, *m0;
{
	register struct mbuf *m, *n;
	int space = 0;

	if (control == 0)
		panic("sbappendcontrol");

	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */
	for (m = control; m; m = m->m_next)
		sballoc(sb, m);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = control;
	} else
		sb->sb_mb = control;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendcontrol(
	struct sockbuf *sb,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (sb->sb_flags & SB_RECV) {
		int error;
		error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendcontrol_internal(sb, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
static int
sbcompress(sb, m, n)
	register struct sockbuf *sb;
	register struct mbuf *m, *n;
{
	register int eor = 0;
	register struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	postevent(0, sb, EV_RWBYTES);
	return 1;
}
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(sb)
	register struct sockbuf *sb;
{
	if (sb->sb_so == NULL)
		panic("sbflush sb->sb_so already null sb=%x\n", sb);
	(void)sblock(sb, M_WAIT);
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever. Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_so == NULL)
		panic("sbflush: cc %ld || mb %p || mbcnt %ld sb_so=%x", sb->sb_cc,
		    (void *)sb->sb_mb, sb->sb_mbcnt, sb->sb_so);

	postevent(0, sb, EV_RWBYTES);
	sbunlock(sb, 1);	/* keep socket locked */
}
/*
 * Drop data from (the front of) a sockbuf.
 * use m_freem_list to free the mbuf structures
 * under a single lock... this is done by pruning
 * the top of the tree from the body by keeping track
 * of where we get to in the tree and then zeroing the
 * two pertinent pointers m_nextpkt and m_next
 * the socket buffer is then updated to point at the new
 * top of the tree and the pruned area is released via
 * m_freem_list.
 */
void
sbdrop(sb, len)
	register struct sockbuf *sb;
	register int len;
{
	register struct mbuf *m, *free_list, *ml;
	struct mbuf *next, *last;

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	free_list = last = m;
	ml = (struct mbuf *)0;

	while (len > 0) {
		if (m == 0) {
			if (next == 0) {
				/* temporarily replacing this panic with printf because
				 * it occurs occasionally when closing a socket when there
				 * is no harm in ignoring it. This problem will be investigated
				 * further.
				 */
				/* panic("sbdrop"); */
				printf("sbdrop - count not zero\n");
				len = 0;
				/* zero the counts. if we have no mbufs, we have no data (PR-2986815) */
				sb->sb_cc = 0;
				sb->sb_mbcnt = 0;
				break;
			}
			m = last = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	if (ml) {
		ml->m_next = (struct mbuf *)0;
		last->m_nextpkt = (struct mbuf *)0;
		m_freem_list(free_list);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;

	postevent(0, sb, EV_RWBYTES);

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
}
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
			m = mn;
		} while (m);
	}
	postevent(0, sb, EV_RWBYTES);
}
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(p, size, type, level)
	caddr_t p;
	register int size;
	int type, level;
{
	register struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN)
		return ((struct mbuf *) NULL);
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	/* XXX check size? */
	(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
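/*
 * Illustrative sketch: building a control record the way a protocol would
 * before queueing it with sbappendaddr()/sbappendcontrol().  IP_RECVTTL and
 * IPPROTO_IP are used only as a familiar example; the helper name is
 * hypothetical.
 */
#if 0
static struct mbuf *
example_make_ttl_control(u_char ttl)
{
	/* one cmsghdr carrying a single byte of ancillary data */
	return sbcreatecontrol((caddr_t)&ttl, sizeof(ttl), IP_RECVTTL, IPPROTO_IP);
}
#endif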
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_abort_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
	struct ifnet *ifp, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_detach_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{
	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
	struct sockaddr *addr, struct mbuf *control,
	struct proc *p)
{
	return EOPNOTSUPP;
}

/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}

int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr,
	struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int pru_soreceive_notsupp(struct socket *so,
	struct sockaddr **paddr,
	struct uio *uio, struct mbuf **mp0,
	struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_shutdown_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int pru_sosend(struct socket *so, struct sockaddr *addr,
	struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int pru_soreceive(struct socket *so,
	struct sockaddr **paddr,
	struct uio *uio, struct mbuf **mp0,
	struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_sopoll_notsupp(__unused struct socket *so, __unused int events,
	__unused kauth_cred_t cred, __unused void *wql)
{
	return EOPNOTSUPP;
}
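/*
 * Illustrative sketch (hypothetical function name): a minimal protocol can
 * point unneeded pr_usrreqs slots at the *_notsupp stubs above so that
 * unsupported syscalls fail cleanly with EOPNOTSUPP.  Only a handful of
 * representative slots are shown; consult sys/protosw.h for the full set.
 */
#if 0
static void
example_fill_usrreqs(struct pr_usrreqs *pru)
{
	pru->pru_accept = pru_accept_notsupp;
	pru->pru_connect2 = pru_connect2_notsupp;
	pru->pru_rcvoob = pru_rcvoob_notsupp;
	pru->pru_sense = pru_sense_null;	/* benign default for fstat() */
	pru->pru_sopoll = pru_sopoll_notsupp;
}
#endif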
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */

int
sb_notify(struct sockbuf *sb)
{
	return ((sb->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)) != 0);
}
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematical if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.  Should use "lmin" but it doesn't exist now.
 */
long
sbspace(struct sockbuf *sb)
{
	return ((long) imin((int)(sb->sb_hiwat - sb->sb_cc),
	    (int)(sb->sb_mbmax - sb->sb_mbcnt)));
}
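/*
 * Worked example (illustrative numbers): with sb_hiwat = 32768,
 * sb_cc = 20000, sb_mbmax = 65536 and sb_mbcnt = 60000, the byte limit
 * leaves 12768 but the mbuf accounting leaves only 5536, so sbspace()
 * returns 5536; whichever limit is tighter governs how much more data
 * may be appended.
 */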
/* do we have to send all at once on a socket? */
int
sosendallatonce(struct socket *so)
{
	return (so->so_proto->pr_flags & PR_ATOMIC);
}

/* can we read something from so? */
int
soreadable(struct socket *so)
{
	return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) ||
	    so->so_comp.tqh_first || so->so_error);
}

/* can we write something to so? */
int
sowriteable(struct socket *so)
{
	return ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) ||
	     (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) ||
	    so->so_error);
}
/* adjust counters in sb reflecting allocation of m */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc += m->m_len;
	sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}

/* adjust counters in sb reflecting freeing of m */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc -= m->m_len;
	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	return (sb->sb_flags & SB_LOCK ?
	    ((wf == M_WAIT) ? sb_lock(sb) : EWOULDBLOCK) :
	    (sb->sb_flags |= SB_LOCK), 0);
}
/* release lock on sockbuf sb */
void
sbunlock(struct sockbuf *sb, int keeplocked)
{
	struct socket *so = sb->sb_so;
	int lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = (unsigned int) __builtin_return_address(0);

	sb->sb_flags &= ~SB_LOCK;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	if (keeplocked == 0)
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		if (so->so_usecount < 0)
			panic("sbunlock: b4 wakeup so=%x ref=%d lr=%x sb_flags=%x\n",
			    sb->sb_so, so->so_usecount, lr_saved, sb->sb_flags);

		wakeup((caddr_t)&(sb)->sb_flags);
	}
	if (keeplocked == 0) {	/* unlock on exit */
		so->so_usecount--;
		if (so->so_usecount < 0)
			panic("sbunlock: unlock on exit so=%x ref=%d lr=%x sb_flags=%x\n",
			    so, so->so_usecount, lr_saved, sb->sb_flags);
		so->unlock_lr[so->next_unlock_lr] = (void *)lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
		lck_mtx_unlock(mutex_held);
	}
}
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv);
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd);
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(sa, canwait)
	struct sockaddr *sa;
	int canwait;
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	if (so->so_proto) {
		xso->xso_protocol = so->so_proto->pr_protocol;
		xso->xso_family = so->so_proto->pr_domain->dom_family;
	} else
		xso->xso_protocol = xso->xso_family = 0;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_uid;
}
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = (u_long)(sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick;
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0)
		xsb->sb_timeo = 1;
}
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
    &sb_max, 0, "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
    &maxsockets, 0, "Maximum number of sockets available");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "");