/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.9 2001/07/26 18:53:02 peter Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>		/* needed by the mbuf/sockbuf routines below */
#include <sys/protosw.h>
#include <sys/stat.h>		/* struct stat, used by pru_sense_null() */
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <kern/locks.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <sys/kdebug.h>
#define DBG_FNC_SBDROP		NETDBG_CODE(DBG_NETSOCK, 4)
#define DBG_FNC_SBAPPEND	NETDBG_CODE(DBG_NETSOCK, 5)
/*
 * Primitive routines for operating on sockets and socket buffers.
 */

u_long	sb_max = SB_MAX;		/* XXX should be static */

static u_long sb_efficiency = 8;	/* parameter for sbreserve() */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
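
/*
 * Editor's sketch (not part of the original file): how a connection-
 * oriented protocol might drive the transitions described above.  The
 * myproto_* names and myproto_start_handshake() are hypothetical.
 */
#if 0
static int
myproto_connect(struct socket *so, struct sockaddr *nam)
{
	soisconnecting(so);		/* connect() now in progress */
	return (myproto_start_handshake(so, nam));
}

static void
myproto_handshake_done(struct socket *so)
{
	/* wakes connect(); on a passive socket, moves it to so_comp */
	soisconnected(so);
}
#endif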
void
soisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
void
soisconnected(so)
	struct socket *so;
{
	struct socket *head = so->so_head;

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	sflt_notify(so, sock_evt_connected, NULL);

	if (head && (so->so_state & SS_INCOMP)) {
		so->so_state &= ~SS_INCOMP;
		so->so_state |= SS_COMP;
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(so, 0);
			socket_lock(head, 1);
		}
		postevent(head, 0, EV_RCONN);
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		head->so_incqlen--;
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		sorwakeup(head);
		wakeup_one((caddr_t)&head->so_timeo);
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(head, 1);
			socket_lock(so, 0);
		}
	} else {
		postevent(so, 0, EV_WCONN);
		wakeup((caddr_t)&so->so_timeo);
		sorwakeup(so);
		sowwakeup(so);
	}
}
void
soisdisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
void
soisdisconnected(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
/*
 * Return a random connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * This may be used in conjunction with protocol specific queue
 * congestion routines.
 */
struct socket *
sodropablereq(head)
	register struct socket *head;
{
	struct socket *so, *sonext = NULL;
	unsigned int i, j, qlen;
	static int rnd;
	static struct timeval old_runtime;
	static unsigned int cur_cnt, old_cnt;
	struct timeval tv;

	microtime(&tv);
	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
		old_runtime = tv;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	so = TAILQ_FIRST(&head->so_incomp);
	if (!so)
		return (NULL);

	qlen = head->so_incqlen;
	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so) {
//			if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				socket_lock(so, 1);
				sonext = TAILQ_NEXT(so, so_list);
//				in_pcb_check_state(so->so_pcb, WNT_RELEASE, 0);
				socket_unlock(so, 1);
				so = sonext;
//			}
		}
	}

//	if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) == WNT_STOPUSING)
//		return (NULL);
//	else
		return (so);
}
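
/*
 * Editor's note on the arithmetic above: rnd is stepped as a 16-bit
 * linear congruential generator, so ((qlen + 1) * rnd) >> 16 maps it
 * roughly uniformly onto 0..qlen.  Walking j entries from the head of a
 * qlen-entry list yields NULL when j == qlen, which is the "drop the
 * new request instead" outcome described in the comment above.  For
 * example, qlen = 3 and rnd = 0x8000 give j = (4 * 0x8000) >> 16 = 2.
 */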
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SS_ISCONFIRMING, or SS_ISCONNECTED.
 */
static struct socket *
sonewconn_internal(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	int error = 0;
	register struct socket *so;
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock != NULL)
		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
	else
		mutex_held = head->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (head->so_qlen > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = soalloc(1, head->so_proto->pr_domain->dom_family, head->so_type);
	if (so == NULL)
		return ((struct socket *)0);
	/* check if head was closed during the soalloc */
	if (head->so_proto == NULL) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	so->so_head = head;
	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid  = head->so_pgid;
	so->so_uid = head->so_uid;

	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	/*
	 * Must be done with head unlocked to avoid deadlock
	 * for protocols with per socket mutexes.
	 */
	if (head->so_proto->pr_unlock)
		socket_unlock(head, 0);
	if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) {
		sodealloc(so);
		if (head->so_proto->pr_unlock)
			socket_lock(head, 0);
		return ((struct socket *)0);
	}
	if (head->so_proto->pr_unlock)
		socket_lock(head, 0);
	so->so_proto->pr_domain->dom_refs++;

	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_state |= SS_COMP;
	} else {
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_state |= SS_INCOMP;
		head->so_incqlen++;
	}
	head->so_qlen++;

	so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);

	if (connstatus) {
		so->so_state |= connstatus;
		sorwakeup(head);
		wakeup((caddr_t)&head->so_timeo);
	}
	return (so);
}
struct socket *
sonewconn(
	struct socket *head,
	int connstatus,
	const struct sockaddr *from)
{
	int error = 0;
	struct socket_filter_entry *filter;
	int filtered = 0;

	for (filter = head->so_filt; filter && (error == 0);
	     filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_connect_in) {
			if (filtered == 0) {
				filtered = 1;
				sflt_use(head);
				socket_unlock(head, 0);
			}
			error = filter->sfe_filter->sf_filter.sf_connect_in(
			    filter->sfe_cookie, head, from);
		}
	}
	if (filtered != 0) {
		socket_lock(head, 0);
		sflt_unuse(head);
	}

	if (error)
		return (NULL);

	return sonewconn_internal(head, connstatus);
}
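
/*
 * Editor's sketch (not part of the original file): a listening protocol
 * typically calls sonewconn() from its input path when a connection
 * request arrives.  struct myproto_req and myproto_req_input are
 * hypothetical.
 */
#if 0
struct myproto_req { struct sockaddr_in from; };

static void
myproto_req_input(struct socket *head, struct myproto_req *req)
{
	struct socket *so;

	/* queues the new socket on head->so_incomp */
	so = sonewconn(head, 0, (const struct sockaddr *)&req->from);
	if (so == NULL)
		return;		/* queue limit reached or a filter rejected it */
	/*
	 * ... run the protocol handshake; when it completes,
	 * soisconnected(so) moves the socket to head->so_comp ...
	 */
}
#endif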
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
void
socantsendmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTSENDMORE;
	sflt_notify(so, sock_evt_cantsendmore, NULL);
	sowwakeup(so);
}

void
socantrcvmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTRCVMORE;
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
	sorwakeup(so);
}
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(sb)
	struct sockbuf *sb;
{
	int error = 0, lr, lr_saved;
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	struct timespec ts;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	sb->sb_flags |= SB_WAIT;

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);
	ts.tv_sec = sb->sb_timeo.tv_sec;
	ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
	error = msleep((caddr_t)&sb->sb_cc, mutex_held,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    &ts);

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);

	if ((so->so_state & SS_DRAINING)) {
		error = EBADF;
	}

	return (error);
}
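
/*
 * Editor's sketch (not part of the original file): the canonical
 * sbwait() pattern used by receive paths such as soreceive(): sleep
 * until data arrives, rechecking the condition after each wakeup.
 */
#if 0
	while (so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
	    (so->so_state & SS_CANTRCVMORE) == 0 && so->so_error == 0) {
		error = sbwait(&so->so_rcv);	/* sleeps on sb_cc */
		if (error)
			break;			/* timeout or interrupt */
	}
#endif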
/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(sb)
	register struct sockbuf *sb;
{
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	int error = 0, lr, lr_saved;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so == NULL)
		panic("sb_lock: null so back pointer sb=%x\n", sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		if (so->so_proto->pr_getlock != NULL)
			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		else
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		if (so->so_usecount < 1)
			panic("sb_lock: so=%x refcount=%d\n", so, so->so_usecount);
		error = msleep((caddr_t)&sb->sb_flags, mutex_held,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sblock", 0);
		if (so->so_usecount < 1)
			panic("sb_lock: 2 so=%x refcount=%d\n", so, so->so_usecount);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(so, sb)
	register struct socket *so;
	register struct sockbuf *sb;
{
	struct proc *p = current_proc();

	sb->sb_flags &= ~SB_SEL;
	selwakeup(&sb->sb_sel);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC) {
		if (so->so_pgid < 0)
			gsignal(-so->so_pgid, SIGIO);
		else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
			psignal(p, SIGIO);
	}
	if (sb->sb_flags & SB_KNOTE) {
		KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
	}
	if (sb->sb_flags & SB_UPCALL) {
		socket_unlock(so, 0);
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
		socket_lock(so, 0);
	}
}
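
/*
 * Editor's sketch (not part of the original file): an in-kernel consumer
 * can have sowakeup() call it back by registering an upcall on the
 * receive buffer.  my_rcv_upcall, my_enable_upcall and my_ctx are
 * hypothetical.
 */
#if 0
static void my_rcv_upcall(struct socket *so, caddr_t arg, int waitf);

static void
my_enable_upcall(struct socket *so, caddr_t my_ctx)
{
	socket_lock(so, 1);
	so->so_upcall = my_rcv_upcall;
	so->so_upcallarg = my_ctx;
	so->so_rcv.sb_flags |= SB_UPCALL;
	socket_unlock(so, 1);
}
#endif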
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
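
/*
 * Editor's sketch (not part of the original file): the reserve/release
 * lifecycle described above, as a protocol attach routine might use it.
 * myproto_attach and the 8 KB sizes are hypothetical.
 */
#if 0
static int
myproto_attach(struct socket *so, int proto, struct proc *p)
{
	/* returns ENOBUFS if either reservation fails */
	return (soreserve(so, 8192, 8192));
}
#endif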
int
soreserve(so, sndcc, rcvcc)
	register struct socket *so;
	u_long sndcc, rcvcc;
{
	if (sbreserve(&so->so_snd, sndcc) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
bad:
	return (ENOBUFS);
}
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(sb, cc)
	struct sockbuf *sb;
	u_long cc;
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
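
/*
 * Editor's worked example of the limits above, assuming the traditional
 * MSIZE = 256 and MCLBYTES = 2048: the largest acceptable cc is
 * sb_max * 2048 / 2304, i.e. about 8/9 of sb_max, and with
 * sb_efficiency = 8 the mbuf-storage allowance is
 * sb_mbmax = min(8 * cc, sb_max).
 */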
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/* WARNING needs to do selthreadclear() before calling this */
void
sbrelease(sb)
	struct sockbuf *sb;
{
	sbflush(sb);
	sb->sb_hiwat = 0;
	sb->sb_mbmax = 0;
}
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
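
/*
 * Editor's sketch (not part of the original file): the usual pattern in
 * a protocol input path.  sbappend() returns 0 when a socket filter
 * swallowed the data, in which case the wakeup must be skipped.
 */
#if 0
	if (sbappend(&so->so_rcv, m))
		sorwakeup(so);
#endif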
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated
 * with the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
int
sbappend(sb, m)
	struct sockbuf *sb;
	struct mbuf *m;
{
	register struct mbuf *n, *sb_first;
	int result = 0;
	int error = 0;
	int filtered = 0;

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_START), sb, m->m_len, 0, 0, 0);

	if (m == 0)
		return 0;

again:
	sb_first = n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				result = sbappendrecord(sb, m); /* XXXXXX!!!! */
				KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);
				return result;
			}
		} while (n->m_next && (n = n->m_next));
	}

	if (!filtered && (sb->sb_flags & SB_RECV) != 0) {
		error = sflt_data_in(sb->sb_so, NULL, &m, NULL, 0, &filtered);
		if (error) {
			/* no data was appended, caller should not call sowakeup */
			return 0;
		}

		/*
		 * If we ran any filters, the socket lock was dropped.  n and
		 * sb_first cached data from the socket buffer.  This cache is
		 * not valid since we dropped the lock.  We must start over.
		 * Since filtered is set we won't run through the filters a
		 * second time.  We just set n and sb_first again.
		 */
		if (filtered)
			goto again;
	}

	result = sbcompress(sb, m, n);

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);

	return result;
}
/*
 * Check that sb's byte and mbuf counts agree with its mbuf chains.
 */
void
sbcheck(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m;
	register struct mbuf *n = 0;
	register u_long len = 0, mbcnt = 0;
	lck_mtx_t *mutex_held;

	if (sb->sb_so->so_proto->pr_getlock != NULL)
		mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
	else
		mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
	}
}
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
int
sbappendrecord(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_record, NULL);
		if (error != 0) {
			if (error != EJUSTRETURN)
				m_freem(m0);
			return 0;
		}
	}

	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
int
sbinsertoob(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;
	register struct mbuf **mp;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_oob, NULL);

		if (error) {
			if (error != EJUSTRETURN) {
				m_freem(m0);
			}
			return 0;
		}
	}

	for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
		m = *mp;
again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;	/* WANT next train */

		case MT_CONTROL:
			m = m->m_next;
			if (m)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
static int
sbappendaddr_internal(sb, asa, m0, control)
	register struct sockbuf *sb;
	struct sockaddr *asa;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");

	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = m;
	} else
		sb->sb_mb = m;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendaddr(
	struct sockbuf *sb,
	struct sockaddr *asa,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrorfree");

	/* Call socket data in filters */
	if ((sb->sb_flags & SB_RECV) != 0) {
		int error;
		error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendaddr_internal(sb, asa, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
static int
sbappendcontrol_internal(sb, m0, control)
	struct sockbuf *sb;
	struct mbuf *control, *m0;
{
	register struct mbuf *m, *n;
	int space = 0;

	if (control == 0)
		panic("sbappendcontrol");

	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;		/* concatenate data to control */
	for (m = control; m; m = m->m_next)
		sballoc(sb, m);
	n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = control;
	} else
		sb->sb_mb = control;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendcontrol(
	struct sockbuf *sb,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (sb->sb_flags & SB_RECV) {
		int error;
		error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendcontrol_internal(sb, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
int
sbcompress(sb, m, n)
	register struct sockbuf *sb;
	register struct mbuf *m, *n;
{
	register int eor = 0;
	register struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	postevent(0, sb, EV_RWBYTES);
	return 1;
}
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(sb)
	register struct sockbuf *sb;
{
	if (sb->sb_so == NULL)
		panic("sbflush sb->sb_so already null sb=%x\n", sb);
	(void)sblock(sb, M_WAIT);
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever. Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_so == NULL)
		panic("sbflush: cc %ld || mb %p || mbcnt %ld sb_so=%x",
		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt, sb->sb_so);

	postevent(0, sb, EV_RWBYTES);
	sbunlock(sb, 1);	/* keep socket locked */
}
/*
 * Drop data from (the front of) a sockbuf.  Use m_freem_list to free
 * the mbuf structures under a single lock.  This is done by pruning
 * the top of the tree from the body: we keep track of how far we get
 * in the tree, zero the two pertinent pointers (m_nextpkt and m_next),
 * point the socket buffer at the new top of the tree, and release the
 * pruned area via m_freem_list.
 */
void
sbdrop(sb, len)
	register struct sockbuf *sb;
	register int len;
{
	register struct mbuf *m, *free_list, *ml;
	struct mbuf *next, *last;

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	free_list = last = m;
	ml = (struct mbuf *)0;

	while (len > 0) {
		if (m == 0) {
			if (next == 0) {
				/*
				 * Temporarily replacing this panic with printf
				 * because it occurs occasionally when closing a
				 * socket, and there is no harm in ignoring it.
				 * This problem will be investigated further.
				 */
				/* panic("sbdrop"); */
				printf("sbdrop - count not zero\n");
				len = 0;
				/*
				 * zero the counts. if we have no mbufs,
				 * we have no data (PR-2986815)
				 */
				sb->sb_cc = 0;
				sb->sb_mbcnt = 0;
				break;
			}
			m = last = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	if (ml) {
		ml->m_next = (struct mbuf *)0;
		last->m_nextpkt = (struct mbuf *)0;
		m_freem_list(free_list);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;

	postevent(0, sb, EV_RWBYTES);

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
}
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
			m = mn;
		} while (m);
	}
	postevent(0, sb, EV_RWBYTES);
}
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(p, size, type, level)
	caddr_t p;
	register int size;
	int type, level;
{
	register struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN)
		return ((struct mbuf *) NULL);
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	/* XXX check size? */
	(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
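
/*
 * Editor's sketch (not part of the original file): typical use of
 * sbcreatecontrol() in BSD stacks, building an SCM_TIMESTAMP record
 * for a receive queue.
 */
#if 0
	struct timeval tv;
	struct mbuf *control;

	microtime(&tv);
	/* NULL on failure (no mbuf, or data would not fit in MLEN) */
	control = sbcreatecontrol((caddr_t)&tv, sizeof (tv),
	    SCM_TIMESTAMP, SOL_SOCKET);
#endif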
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_abort_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
    struct ifnet *ifp, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_detach_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{
	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *addr, struct mbuf *control,
    struct proc *p)
{
	return EOPNOTSUPP;
}
/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}
int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr,
    struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int pru_soreceive_notsupp(struct socket *so,
    struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0,
    struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_shutdown_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int pru_sosend(struct socket *so, struct sockaddr *addr,
    struct uio *uio, struct mbuf *top,
    struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int pru_soreceive(struct socket *so,
    struct sockaddr **paddr,
    struct uio *uio, struct mbuf **mp0,
    struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_sopoll_notsupp(__unused struct socket *so, __unused int events,
    __unused kauth_cred_t cred, __unused void *wql)
{
	return EOPNOTSUPP;
}
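
/*
 * Editor's sketch (not part of the original file): a minimal protocol
 * fills its pr_usrreqs table with these stubs for the entry points it
 * does not implement.  The positional initializers assume the field
 * order declared in sys/protosw.h for this era; verify against the
 * header before use.
 */
#if 0
static struct pr_usrreqs myproto_usrreqs = {
	pru_abort_notsupp, pru_accept_notsupp, pru_attach_notsupp,
	pru_bind_notsupp, pru_connect_notsupp, pru_connect2_notsupp,
	pru_control_notsupp, pru_detach_notsupp, pru_disconnect_notsupp,
	pru_listen_notsupp, pru_peeraddr_notsupp, pru_rcvd_notsupp,
	pru_rcvoob_notsupp, pru_send_notsupp, pru_sense_null,
	pru_shutdown_notsupp, pru_sockaddr_notsupp, pru_sosend_notsupp,
	pru_soreceive_notsupp, pru_sopoll_notsupp
};
#endif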
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */
int
sb_notify(struct sockbuf *sb)
{
	return ((sb->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)) != 0);
}
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematic if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.  Should use "lmin" but it doesn't exist now.
 */
long
sbspace(struct sockbuf *sb)
{
	return ((long) imin((int)(sb->sb_hiwat - sb->sb_cc),
	    (int)(sb->sb_mbmax - sb->sb_mbcnt)));
}
/* do we have to send all at once on a socket? */
int
sosendallatonce(struct socket *so)
{
	return (so->so_proto->pr_flags & PR_ATOMIC);
}
/* can we read something from so? */
int
soreadable(struct socket *so)
{
	return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) ||
	    so->so_comp.tqh_first || so->so_error);
}
/* can we write something to so? */
int
sowriteable(struct socket *so)
{
	return ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) ||
	     (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) ||
	    so->so_error);
}
/* adjust counters in sb reflecting allocation of m */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc += m->m_len;
	sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}

/* adjust counters in sb reflecting freeing of m */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc -= m->m_len;
	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	/*
	 * Rewritten from a single conditional/comma expression whose
	 * trailing ", 0" made the whole expression always evaluate to 0,
	 * silently discarding sb_lock()'s error and EWOULDBLOCK, contrary
	 * to the comment above.
	 */
	if (sb->sb_flags & SB_LOCK)
		return ((wf == M_WAIT) ? sb_lock(sb) : EWOULDBLOCK);
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/* release lock on sockbuf sb */
void
sbunlock(struct sockbuf *sb, int keeplocked)
{
	struct socket *so = sb->sb_so;
	int lr, lr_saved;
	lck_mtx_t *mutex_held;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif
	sb->sb_flags &= ~SB_LOCK;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	if (keeplocked == 0)
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		if (so->so_usecount < 0)
			panic("sbunlock: b4 wakeup so=%x ref=%d lr=%x sb_flags=%x\n",
			    sb->sb_so, so->so_usecount, lr_saved, sb->sb_flags);

		wakeup((caddr_t)&(sb)->sb_flags);
	}
	if (keeplocked == 0) {	/* unlock on exit */
		so->so_usecount--;
		if (so->so_usecount < 0)
			panic("sbunlock: unlock on exit so=%x ref=%d lr=%x sb_flags=%x\n",
			    so, so->so_usecount, lr_saved, sb->sb_flags);
		so->reserved4 = lr_saved;
		lck_mtx_unlock(mutex_held);
	}
}
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv);
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd);
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(sa, canwait)
	struct sockaddr *sa;
	int canwait;
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	if (so->so_proto) {
		xso->xso_protocol = so->so_proto->pr_protocol;
		xso->xso_family = so->so_proto->pr_domain->dom_family;
	} else
		xso->xso_protocol = xso->xso_family = 0;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_uid;
}
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = (u_long)
	    (sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick;
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0)
		xsb->sb_timeo = 1;
}
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
    &sb_max, 0, "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
    &maxsockets, 0, "Maximum number of sockets available");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD,
    &nmbclusters, 0, "");
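
/*
 * Editor's sketch (not part of the original file): reading one of the
 * OIDs above from user space.  This is plain userland C, shown only to
 * document the exported MIB names.
 */
#if 0
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int maxsockbuf;
	size_t len = sizeof(maxsockbuf);

	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsockbuf, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return 1;
	}
	printf("kern.ipc.maxsockbuf = %d\n", maxsockbuf);
	return 0;
}
#endif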