/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.9 2001/07/26 18:53:02 peter Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/ev.h>
#include <kern/locks.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <sys/kdebug.h>

#define DBG_FNC_SBDROP		NETDBG_CODE(DBG_NETSOCK, 4)
#define DBG_FNC_SBAPPEND	NETDBG_CODE(DBG_NETSOCK, 5)
/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* XXX should be static */

static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
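#if 0
/*
 * Illustrative sketch (not part of the original source): the active-side
 * sequence described above, as a hypothetical connection-oriented protocol
 * might drive it.  The my_proto_* names are invented; only soisconnecting()
 * and soisconnected() refer to routines defined below.
 */
static int
my_proto_connect(struct socket *so, struct sockaddr *nam)
{
	/* start the protocol handshake, then mark the socket connecting */
	soisconnecting(so);
	return (0);	/* connect() sleeps until soisconnected() runs */
}

static void
my_proto_handshake_complete_active(struct socket *so)
{
	/* handshake done: wakes the thread blocked in connect() */
	soisconnected(so);
}
#endif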
void
soisconnecting(so)
	register struct socket *so;
{

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
void
soisconnected(so)
	struct socket *so;
{
	struct socket *head = so->so_head;

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	sflt_notify(so, sock_evt_connected, NULL);

	if (head && (so->so_state & SS_INCOMP)) {
		so->so_state &= ~SS_INCOMP;
		so->so_state |= SS_COMP;
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(so, 0);
			socket_lock(head, 1);
		}
		postevent(head, 0, EV_RCONN);
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		wakeup_one((caddr_t)&head->so_timeo);
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(head, 1);
			socket_lock(so, 0);
		}
	} else {
		postevent(so, 0, EV_WCONN);
		wakeup((caddr_t)&so->so_timeo);
	}
}
void
soisdisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
}
void
soisdisconnected(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);
}
/*
 * Return a random connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no dropable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * This may be used in conjunction with protocol specific queue
 * congestion routines.
 */
struct socket *
sodropablereq(head)
	register struct socket *head;
{
	struct socket *so, *sonext = NULL;
	unsigned int i, j, qlen;
	static int rnd;
	static struct timeval old_runtime;
	static unsigned int cur_cnt, old_cnt;
	struct timeval tv;

	microtime(&tv);
	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
		old_runtime = tv;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	so = TAILQ_FIRST(&head->so_incomp);
	if (!so)
		return (NULL);

	qlen = head->so_incqlen;
	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;

		while (j-- && so) {
//			if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				socket_lock(so, 1);
				sonext = TAILQ_NEXT(so, so_list);
//				in_pcb_check_state(so->so_pcb, WNT_RELEASE, 0);
				socket_unlock(so, 1);
				so = sonext;
//			}
		}
	}

//	if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) == WNT_STOPUSING)
//		return (NULL);
//	else
		return (so);
}
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 */
static struct socket *
sonewconn_internal(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	int error = 0;
	register struct socket *so;
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock != NULL)
		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
	else
		mutex_held = head->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (head->so_qlen > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = soalloc(1, head->so_proto->pr_domain->dom_family, head->so_type);
	if (so == NULL)
		return ((struct socket *)0);
	/* check if head was closed during the soalloc */
	if (head->so_proto == NULL) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid  = head->so_pgid;
	so->so_uid = head->so_uid;

	so->so_rcv.sb_flags |= SB_RECV;	/* XXX */
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	TAILQ_INIT(&so->so_evlist);

	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		sodealloc(so);
		return ((struct socket *)0);
	}

	/*
	 * Attach the protocol to the new socket.
	 * Must be done with head unlocked to avoid deadlock for protocol with per socket mutexes.
	 */
	if (head->so_proto->pr_unlock)
		socket_unlock(head, 0);
	if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) {
		sodealloc(so);
		if (head->so_proto->pr_unlock)
			socket_lock(head, 0);
		return ((struct socket *)0);
	}
	if (head->so_proto->pr_unlock)
		socket_lock(head, 0);

	so->so_proto->pr_domain->dom_refs++;

	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_state |= SS_COMP;
	} else {
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_state |= SS_INCOMP;
	}
	so->so_head = head;

	/* Attach socket filters for this protocol */
	sflt_initsock(so);

	if (connstatus) {
		so->so_state |= connstatus;
		wakeup((caddr_t)&head->so_timeo);
	}

	return (so);
}
struct socket *
sonewconn(
	struct socket *head,
	int connstatus,
	const struct sockaddr *from)
{
	int error = 0;
	struct socket_filter_entry *filter;

	for (filter = head->so_filt; filter && (error == 0);
	     filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_connect_in) {
			socket_unlock(head, 0);
			error = filter->sfe_filter->sf_filter.sf_connect_in(
			    filter->sfe_cookie, head, from);
			socket_lock(head, 0);
		}
	}

	if (error)
		return (NULL);

	return sonewconn_internal(head, connstatus);
}
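#if 0
/*
 * Illustrative sketch (not part of the original source): the passive side
 * described above.  A hypothetical protocol's input routine creates an
 * embryonic socket on the listener's so_incomp queue with sonewconn() and
 * later promotes it with soisconnected(); names other than
 * sonewconn()/soisconnected() are invented.
 */
static void
my_proto_input_connreq(struct socket *head, const struct sockaddr *from)
{
	struct socket *so;

	/* connstatus == 0: queue on so_incomp until the handshake finishes */
	so = sonewconn(head, 0, from);
	if (so == NULL)
		return;		/* listen queue full or a filter rejected it */
	/* ... set up protocol-specific handshake state here ... */
}

static void
my_proto_handshake_complete_passive(struct socket *so)
{
	/* moves so from so_incomp to so_comp and wakes accept() */
	soisconnected(so);
}
#endif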
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
void
socantsendmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTSENDMORE;
	sflt_notify(so, sock_evt_cantsendmore, NULL);
	sowwakeup(so);
}
void
socantrcvmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTRCVMORE;
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
	sorwakeup(so);
}
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(sb)
	struct sockbuf *sb;
{
	int error = 0, lr, lr_saved;
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	struct timespec ts;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	sb->sb_flags |= SB_WAIT;

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);
	ts.tv_sec = sb->sb_timeo.tv_sec;
	ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
	error = msleep((caddr_t)&sb->sb_cc, mutex_held,
	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
	    &ts);

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);

	if ((so->so_state & SS_DRAINING)) {
		error = EBADF;
	}

	return (error);
}
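#if 0
/*
 * Illustrative sketch (not part of the original source): the usual way a
 * receive path blocks until the socket buffer has data, using sbwait()
 * above.  Error handling is abbreviated and the helper name is invented.
 */
static int
wait_for_rcv_data(struct socket *so)
{
	int error;

	while (so->so_rcv.sb_cc == 0) {
		if (so->so_state & SS_CANTRCVMORE)
			return (0);		/* EOF: peer will send no more */
		if (so->so_error)
			return (so->so_error);
		error = sbwait(&so->so_rcv);	/* sets SB_WAIT, sleeps on &sb_cc */
		if (error)
			return (error);		/* e.g. EINTR or EBADF */
	}
	return (0);
}
#endif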
/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(sb)
	register struct sockbuf *sb;
{
	struct socket *so = sb->sb_so;
	lck_mtx_t * mutex_held;
	int error = 0, lr, lr_saved;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so == NULL)
		panic("sb_lock: null so back pointer sb=%x\n", sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		if (so->so_proto->pr_getlock != NULL)
			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		else
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		if (so->so_usecount < 1)
			panic("sb_lock: so=%x refcount=%d\n", so, so->so_usecount);
		error = msleep((caddr_t)&sb->sb_flags, mutex_held,
		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sblock", 0);
		if (so->so_usecount < 1)
			panic("sb_lock: 2 so=%x refcount=%d\n", so, so->so_usecount);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(so, sb)
	register struct socket *so;
	register struct sockbuf *sb;
{
	struct proc *p = current_proc();

	sb->sb_flags &= ~SB_SEL;
	selwakeup(&sb->sb_sel);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC) {
		if (so->so_pgid < 0)
			gsignal(-so->so_pgid, SIGIO);
		else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
			psignal(p, SIGIO);
	}
	if (sb->sb_flags & SB_KNOTE) {
		KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
	}
	if (sb->sb_flags & SB_UPCALL) {
		socket_unlock(so, 0);
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
		socket_lock(so, 0);
	}
}
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field. The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
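/*
 * Illustrative note (not part of the original source): for a datagram
 * protocol that prepends the sender's name, the receive-buffer record
 * conventions above give a layout like this (sbappendaddr() below builds
 * exactly this shape):
 *
 *	sb_mb -> [MT_SONAME mbuf: sender address]		one record
 *	             m_next -> [MT_DATA mbuf(s): payload]
 *	         m_nextpkt -> [next record ...]
 *
 * soreceive() consumes the MT_SONAME mbuf first, then the data mbufs,
 * then moves to the next record through m_nextpkt.
 */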
int
soreserve(so, sndcc, rcvcc)
	register struct socket *so;
	u_long sndcc, rcvcc;
{
	if (sbreserve(&so->so_snd, sndcc) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
bad:
	return (ENOBUFS);
}
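#if 0
/*
 * Illustrative sketch (not part of the original source): a protocol's
 * attach routine typically sizes its socket buffers with soreserve()
 * above.  The space constants and function name are invented.
 */
#define MY_PROTO_SENDSPACE	(8 * 1024)
#define MY_PROTO_RECVSPACE	(8 * 1024)

static int
my_proto_attach(struct socket *so, int proto, struct proc *p)
{
	int error;

	error = soreserve(so, MY_PROTO_SENDSPACE, MY_PROTO_RECVSPACE);
	if (error)
		return (error);	/* ENOBUFS if either reservation is refused */
	/* ... allocate the protocol control block here ... */
	return (0);
}
#endif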
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(sb, cc)
	struct sockbuf *sb;
	u_long cc;
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/* WARNING needs to do selthreadclear() before calling this */
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
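#if 0
/*
 * Illustrative sketch (not part of the original source): the send-buffer
 * usage described above for a reliable protocol.  Data stays in so_snd
 * until acknowledged; output copies it with m_copy() and acknowledgement
 * releases it with sbdrop().  Names other than m_copy()/sbdrop()/
 * sowwakeup() are invented.
 */
static void
my_proto_output_segment(struct socket *so, int off, int len)
{
	struct mbuf *m;

	/* copy without removing: the data may need to be retransmitted */
	m = m_copy(so->so_snd.sb_mb, off, len);
	if (m == NULL)
		return;
	/* ... hand m to the network layer ... */
}

static void
my_proto_input_ack(struct socket *so, int acked)
{
	/* the peer acknowledged `acked' bytes: drop them from so_snd */
	sbdrop(&so->so_snd, acked);
	sowwakeup(so);	/* blocked senders may now find room via sbspace() */
}
#endif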
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
int
sbappend(sb, m)
	struct sockbuf *sb;
	struct mbuf *m;
{
	register struct mbuf *n, *sb_first;
	int result = 0;
	int error = 0;
	int filtered = 0;

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_START), sb, m->m_len, 0, 0, 0);

	if (m == 0)
		return 0;

again:
	sb_first = n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				result = sbappendrecord(sb, m); /* XXXXXX!!!! */
				KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);
				return result;
			}
		} while (n->m_next && (n = n->m_next));
	}

	if (!filtered && (sb->sb_flags & SB_RECV) != 0) {
		error = sflt_data_in(sb->sb_so, NULL, &m, NULL, 0, &filtered);
		if (error) {
			/* no data was appended, caller should not call sowakeup */
			return 0;
		}

		/*
		 * If we ran any filters, the socket lock was dropped.  n and
		 * sb_first cached data from the socket buffer.  This cache is
		 * not valid since we dropped the lock.  We must start over.
		 * Since filtered is set we won't run through the filters a
		 * second time.  We just set n and sb_first again.
		 */
		if (filtered)
			goto again;
	}

	result = sbcompress(sb, m, n);

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);

	return result;
}
void
sbcheck(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m;
	register struct mbuf *n = 0;
	register u_long len = 0, mbcnt = 0;
	lck_mtx_t *mutex_held;

	if (sb->sb_so->so_proto->pr_getlock != NULL)
		mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
	else
		mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
	}
}
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
int
sbappendrecord(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_record, NULL);
		if (error != 0) {
			if (error != EJUSTRETURN)
				m_freem(m0);
			return 0;
		}
	}

	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
int
sbinsertoob(sb, m0)
	struct sockbuf *sb;
	struct mbuf *m0;
{
	struct mbuf *m;
	struct mbuf **mp;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
		    sock_data_filt_flag_oob, NULL);

		if (error) {
			if (error != EJUSTRETURN) {
				m_freem(m0);
			}
			return 0;
		}
	}

	for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
		m = *mp;
again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			m = m->m_next;
			if (m)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
static int
sbappendaddr_internal(sb, asa, m0, control)
	register struct sockbuf *sb;
	struct sockaddr *asa;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");

	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);
	if ((n = sb->sb_mb) != NULL) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = m;
	} else
		sb->sb_mb = m;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendaddr(
	struct sockbuf *sb,
	struct sockaddr *asa,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrorfree");

	/* Call socket data in filters */
	if ((sb->sb_flags & SB_RECV) != 0) {
		int error;
		error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendaddr_internal(sb, asa, m0, control);
	if (!result) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
static int
sbappendcontrol_internal(sb, m0, control)
	struct sockbuf *sb;
	struct mbuf *control, *m0;
{
	register struct mbuf *m, *n;
	int space = 0;

	if (control == 0)
		panic("sbappendcontrol");

	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */
	for (m = control; m; m = m->m_next)
		sballoc(sb, m);
	if ((n = sb->sb_mb) != NULL) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = control;
	} else
		sb->sb_mb = control;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendcontrol(
	struct sockbuf *sb,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (sb->sb_flags & SB_RECV) {
		int error;
		error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendcontrol_internal(sb, m0, control);
	if (!result) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
static int
sbcompress(sb, m, n)
	register struct sockbuf *sb;
	register struct mbuf *m, *n;
{
	register int eor = 0;
	register struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	postevent(0, sb, EV_RWBYTES);
	return 1;
}
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(sb)
	register struct sockbuf *sb;
{
	if (sb->sb_so == NULL)
		panic("sbflush sb->sb_so already null sb=%x\n", sb);
	(void)sblock(sb, M_WAIT);
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever.  Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_so == NULL)
		panic("sbflush: cc %ld || mb %p || mbcnt %ld sb_so=%x", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt, sb->sb_so);

	postevent(0, sb, EV_RWBYTES);
	sbunlock(sb, 1);	/* keep socket locked */
}
/*
 * Drop data from (the front of) a sockbuf.
 * Use m_freem_list to free the mbuf structures
 * under a single lock... this is done by pruning
 * the top of the tree from the body by keeping track
 * of where we get to in the tree and then zeroing the
 * two pertinent pointers m_nextpkt and m_next.
 * The socket buffer is then updated to point at the new
 * top of the tree and the pruned area is released via
 * m_freem_list.
 */
void
sbdrop(sb, len)
	register struct sockbuf *sb;
	register int len;
{
	register struct mbuf *m, *free_list, *ml;
	struct mbuf *next, *last;

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	free_list = last = m;
	ml = (struct mbuf *)0;

	while (len > 0) {
		if (m == 0) {
			if (next == 0) {
				/* temporarily replacing this panic with printf because
				 * it occurs occasionally when closing a socket when there
				 * is no harm in ignoring it.  This problem will be
				 * investigated further.
				 */
				/* panic("sbdrop"); */
				printf("sbdrop - count not zero\n");
				len = 0;
				/* zero the counts. if we have no mbufs, we have no data (PR-2986815) */
				sb->sb_cc = 0;
				sb->sb_mbcnt = 0;
				break;
			}
			m = last = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);
		ml = m;
		m = m->m_next;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);
		ml = m;
		m = m->m_next;
	}
	if (ml) {
		ml->m_next = (struct mbuf *)0;
		last->m_nextpkt = (struct mbuf *)0;
		m_freem_list(free_list);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;

	postevent(0, sb, EV_RWBYTES);

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
}
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
			m = mn;
		} while (m);
	}
	postevent(0, sb, EV_RWBYTES);
}
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(p, size, type, level)
	caddr_t p;
	register int size;
	int type, level;
{
	register struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN)
		return ((struct mbuf *) NULL);
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	/* XXX check size? */
	(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
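#if 0
/*
 * Illustrative sketch (not part of the original source): building a
 * control mbuf with sbcreatecontrol() above and queueing it for
 * soreceive().  A timestamp is used as the example payload; the
 * surrounding function name is invented.
 */
static void
my_proto_deliver(struct socket *so, struct mbuf *data)
{
	struct timeval tv;
	struct mbuf *control;
	int error;

	microtime(&tv);
	control = sbcreatecontrol((caddr_t)&tv, sizeof (tv),
	    SCM_TIMESTAMP, SOL_SOCKET);		/* may be NULL */

	if (sbappendcontrol(&so->so_rcv, data, control, &error))
		sorwakeup(so);
	/* on failure sbappendcontrol() has already freed data and control */
}
#endif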
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int pru_abort_notsupp(struct socket *so)
{ return EOPNOTSUPP; }

int pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{ return EOPNOTSUPP; }

int pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
{ return EOPNOTSUPP; }

int pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{ return EOPNOTSUPP; }

int pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{ return EOPNOTSUPP; }

int pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{ return EOPNOTSUPP; }

int pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
	struct ifnet *ifp, struct proc *p)
{ return EOPNOTSUPP; }

int pru_detach_notsupp(struct socket *so)
{ return EOPNOTSUPP; }

int pru_disconnect_notsupp(struct socket *so)
{ return EOPNOTSUPP; }

int pru_listen_notsupp(struct socket *so, struct proc *p)
{ return EOPNOTSUPP; }

int pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{ return EOPNOTSUPP; }

int pru_rcvd_notsupp(struct socket *so, int flags)
{ return EOPNOTSUPP; }

int pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{ return EOPNOTSUPP; }

int pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
	struct sockaddr *addr, struct mbuf *control, struct proc *p)
{ return EOPNOTSUPP; }
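#if 0
/*
 * Illustrative sketch (not part of the original source): the pru_*_notsupp
 * stubs above are intended to fill the slots of a protocol's pr_usrreqs
 * table that the protocol does not implement.  The my_proto_* entries are
 * invented; the pru_* field names are those of struct pr_usrreqs.
 */
static struct pr_usrreqs my_proto_usrreqs;

static void
my_proto_init_usrreqs(void)
{
	/* entries the protocol actually implements */
	my_proto_usrreqs.pru_attach = my_proto_attach;
	my_proto_usrreqs.pru_detach = my_proto_detach;
	my_proto_usrreqs.pru_send = my_proto_send;
	my_proto_usrreqs.pru_sense = pru_sense_null;
	/* everything else points at a stub that returns EOPNOTSUPP */
	my_proto_usrreqs.pru_accept = pru_accept_notsupp;
	my_proto_usrreqs.pru_connect2 = pru_connect2_notsupp;
	my_proto_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	my_proto_usrreqs.pru_listen = pru_listen_notsupp;
	my_proto_usrreqs.pru_sopoll = pru_sopoll_notsupp;
}
#endif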
/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}
int pru_sosend_notsupp(struct socket *so, struct sockaddr *addr,
	struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags)
{ return EOPNOTSUPP; }

int pru_soreceive_notsupp(struct socket *so,
	struct sockaddr **paddr,
	struct uio *uio, struct mbuf **mp0,
	struct mbuf **controlp, int *flagsp)
{ return EOPNOTSUPP; }

int pru_shutdown_notsupp(struct socket *so)
{ return EOPNOTSUPP; }

int pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{ return EOPNOTSUPP; }
int pru_sosend(struct socket *so, struct sockaddr *addr,
	struct uio *uio, struct mbuf *top,
	struct mbuf *control, int flags)
{ return EOPNOTSUPP; }

int pru_soreceive(struct socket *so,
	struct sockaddr **paddr,
	struct uio *uio, struct mbuf **mp0,
	struct mbuf **controlp, int *flagsp)
{ return EOPNOTSUPP; }

int pru_sopoll_notsupp(__unused struct socket *so, __unused int events,
	__unused kauth_cred_t cred, __unused void *wql)
{ return EOPNOTSUPP; }
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */
int
sb_notify(struct sockbuf *sb)
{
	return ((sb->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)) != 0);
}
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematical if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.  Should use "lmin" but it doesn't exist now.
 */
long
sbspace(struct sockbuf *sb)
{
	return ((long) imin((int)(sb->sb_hiwat - sb->sb_cc),
	    (int)(sb->sb_mbmax - sb->sb_mbcnt)));
}
/* do we have to send all at once on a socket? */
int
sosendallatonce(struct socket *so)
{
	return (so->so_proto->pr_flags & PR_ATOMIC);
}
/* can we read something from so? */
int
soreadable(struct socket *so)
{
	return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) ||
	    so->so_comp.tqh_first || so->so_error);
}
/* can we write something to so? */
int
sowriteable(struct socket *so)
{
	return ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) ||
	     (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) ||
	    so->so_error);
}
/* adjust counters in sb reflecting allocation of m */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc += m->m_len;
	sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}
/* adjust counters in sb reflecting freeing of m */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc -= m->m_len;
	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	return (sb->sb_flags & SB_LOCK ?
	    ((wf == M_WAIT) ? sb_lock(sb) : EWOULDBLOCK) :
	    (sb->sb_flags |= SB_LOCK), 0);
}
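#if 0
/*
 * Illustrative sketch (not part of the original source): the usual pairing
 * of sblock()/sbunlock() around work that must see a consistent socket
 * buffer, in the same way sbflush() above uses them.  The helper name is
 * invented; sblock(sb, M_WAIT) may sleep in sb_lock().
 */
static int
with_locked_rcvbuf(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	int error;

	error = sblock(sb, M_WAIT);
	if (error)
		return (error);
	/* ... examine or drain sb->sb_mb here ... */
	sbunlock(sb, 1);	/* 1: keep the socket mutex held */
	return (0);
}
#endif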
/* release lock on sockbuf sb */
void
sbunlock(struct sockbuf *sb, int keeplocked)
{
	struct socket *so = sb->sb_so;
	int lr, lr_saved;
	lck_mtx_t *mutex_held;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif
	sb->sb_flags &= ~SB_LOCK;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	if (keeplocked == 0)
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		if (so->so_usecount < 0)
			panic("sbunlock: b4 wakeup so=%x ref=%d lr=%x sb_flags=%x\n", sb->sb_so, so->so_usecount, lr_saved, sb->sb_flags);

		wakeup((caddr_t)&(sb)->sb_flags);
	}
	if (keeplocked == 0) {	/* unlock on exit */
		if (so->so_usecount < 0)
			panic("sbunlock: unlock on exit so=%x ref=%d lr=%x sb_flags=%x\n", so, so->so_usecount, lr_saved, sb->sb_flags);
		so->reserved4 = lr_saved;
		lck_mtx_unlock(mutex_held);
	}
}
void
sorwakeup(struct socket * so)
{
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv);
}

void
sowwakeup(struct socket * so)
{
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd);
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(sa, canwait)
	struct sockaddr *sa;
	int canwait;
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	if (so->so_proto) {
		xso->xso_protocol = so->so_proto->pr_protocol;
		xso->xso_family = so->so_proto->pr_domain->dom_family;
	} else
		xso->xso_protocol = xso->xso_family = 0;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_uid;
}
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = (u_long)(sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick;
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0)
		xsb->sb_timeo = 1;
}
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
    &sb_max, 0, "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
    &maxsockets, 0, "Maximum number of sockets available");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "");