/*
 * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1998, 1999 Apple Computer, Inc. All Rights Reserved */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 * $FreeBSD: src/sys/kern/uipc_socket2.c,v 1.55.2.9 2001/07/26 18:53:02 peter Exp $
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <kern/locks.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <sys/kdebug.h>
#define DBG_FNC_SBDROP		NETDBG_CODE(DBG_NETSOCK, 4)
#define DBG_FNC_SBAPPEND	NETDBG_CODE(DBG_NETSOCK, 5)
/*
 * Primitive routines for operating on sockets and socket buffers
 */

u_long	sb_max = SB_MAX;		/* XXX should be static */

static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
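/*
 * Illustrative sketch (not part of the original source): the call sequence a
 * hypothetical connection-oriented protocol would make against the routines
 * below.  The example_* names are made up; only the soisconnect*() and
 * soisdisconnect*() calls are real entry points in this file.
 */
#if 0	/* example only, not compiled */
static void
example_pr_connect_done(struct socket *so)
{
	soisconnecting(so);	/* connect() is in progress */
	/* ... the protocol's handshake completes asynchronously ... */
	soisconnected(so);	/* moves so from so_incomp to so_comp on the listener */
}

static void
example_pr_disconnect_done(struct socket *so)
{
	soisdisconnecting(so);	/* disconnect() in progress, no further I/O */
	/* ... once the connection to the peer is fully torn down ... */
	soisdisconnected(so);	/* wakes anyone sleeping on so->so_timeo */
}
#endif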
void
soisconnecting(so)
	register struct socket *so;
{

	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
void
soisconnected(so)
	struct socket *so;
{
	struct socket *head = so->so_head;

	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
	so->so_state |= SS_ISCONNECTED;

	sflt_notify(so, sock_evt_connected, NULL);

	if (head && (so->so_state & SS_INCOMP)) {
		so->so_state &= ~SS_INCOMP;
		so->so_state |= SS_COMP;
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(so, 0);
			socket_lock(head, 1);
		}
		postevent(head, 0, EV_RCONN);
		TAILQ_REMOVE(&head->so_incomp, so, so_list);
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		wakeup_one((caddr_t)&head->so_timeo);
		if (head->so_proto->pr_getlock != NULL) {
			socket_unlock(head, 1);
			socket_lock(so, 0);
		}
	} else {
		postevent(so, 0, EV_WCONN);
		wakeup((caddr_t)&so->so_timeo);
	}
}
void
soisdisconnecting(so)
	register struct socket *so;
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
void
soisdisconnected(so)
	register struct socket *so;
{
	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);
	sowwakeup(so);
	sorwakeup(so);
}
/*
 * Return a random connection that hasn't been serviced yet and
 * is eligible for discard.  There is a one in qlen chance that
 * we will return a null, saying that there are no droppable
 * requests.  In this case, the protocol specific code should drop
 * the new request.  This ensures fairness.
 *
 * This may be used in conjunction with protocol specific queue
 * congestion routines.
 */
struct socket *
sodropablereq(head)
	register struct socket *head;
{
	struct socket *so, *sonext = NULL;
	unsigned int i, j, qlen;
	static int rnd;
	static struct timeval old_runtime;
	static unsigned int cur_cnt, old_cnt;
	struct timeval tv;

	microtime(&tv);
	if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) {
		old_runtime = tv;
		old_cnt = cur_cnt / i;
		cur_cnt = 0;
	}

	so = TAILQ_FIRST(&head->so_incomp);
	if (!so)
		return (so);

	qlen = head->so_incqlen;
	if (++cur_cnt > qlen || old_cnt > qlen) {
		rnd = (314159 * rnd + 66329) & 0xffff;
		j = ((qlen + 1) * rnd) >> 16;
		while (j-- && so) {
//			if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
				socket_lock(so, 1);
				sonext = TAILQ_NEXT(so, so_list);
//				in_pcb_check_state(so->so_pcb, WNT_RELEASE, 0);
				socket_unlock(so, 1);
				so = sonext;
//			}
		}
	}

//	if (in_pcb_checkstate(so->so_pcb, WNT_ACQUIRE, 0) == WNT_STOPUSING)
//		return (NULL);
//	else
		return (so);
}
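/*
 * Illustrative sketch (not part of the original source): the arithmetic
 * sodropablereq() uses above to pick a random victim.  rnd is stepped with a
 * 16-bit linear congruential generator and scaled into [0, qlen] with a
 * fixed point multiply; e.g. qlen = 9 and rnd = 0x8000 give
 * j = ((9 + 1) * 0x8000) >> 16 = 5, the sixth entry of so_incomp.
 */
#if 0	/* example only, not compiled */
static unsigned int
example_pick_index(unsigned int qlen, int *rndp)
{
	/* same arithmetic as sodropablereq() above */
	*rndp = (314159 * (*rndp) + 66329) & 0xffff;
	return (((qlen + 1) * (unsigned int)*rndp) >> 16);	/* 0 .. qlen */
}
#endif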
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 */
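/*
 * Illustrative sketch (not part of the original source): how a listening
 * protocol might call sonewconn() when a connection request arrives.  With
 * connstatus == 0 the new socket is queued on so_incomp; a later
 * soisconnected() call moves it to so_comp for accept().
 */
#if 0	/* example only, not compiled */
static void
example_input_connect_request(struct socket *head, const struct sockaddr *from)
{
	struct socket *so = sonewconn(head, 0, from);

	if (so == NULL)
		return;		/* queue limit reached or allocation failed: drop the request */
	/* ... protocol finishes its handshake, then calls soisconnected(so) ... */
}
#endif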
static struct socket *
sonewconn_internal(head, connstatus)
	register struct socket *head;
	int connstatus;
{
	int error = 0;
	register struct socket *so;
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock != NULL)
		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
	else
		mutex_held = head->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (head->so_qlen > 3 * head->so_qlimit / 2)
		return ((struct socket *)0);
	so = soalloc(1, head->so_proto->pr_domain->dom_family, head->so_type);
	if (so == NULL)
		return ((struct socket *)0);
	/* check if head was closed during the soalloc */
	if (head->so_proto == NULL) {
		return ((struct socket *)0);
	}

	so->so_type = head->so_type;
	so->so_options = head->so_options &~ SO_ACCEPTCONN;
	so->so_linger = head->so_linger;
	so->so_state = head->so_state | SS_NOFDREF;
	so->so_proto = head->so_proto;
	so->so_timeo = head->so_timeo;
	so->so_pgid  = head->so_pgid;
	so->so_uid = head->so_uid;

	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
		return ((struct socket *)0);
	}

	/*
	 * Must be done with head unlocked to avoid deadlock for protocol with per socket mutexes.
	 */
	if (head->so_proto->pr_unlock)
		socket_unlock(head, 0);
	if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) || error) {
		if (head->so_proto->pr_unlock)
			socket_lock(head, 0);
		return ((struct socket *)0);
	}
	if (head->so_proto->pr_unlock)
		socket_lock(head, 0);

	so->so_proto->pr_domain->dom_refs++;

	if (connstatus) {
		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
		so->so_state |= SS_COMP;
	} else {
		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
		so->so_state |= SS_INCOMP;
	}

	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);

	if (connstatus) {
		so->so_state |= connstatus;
		wakeup((caddr_t)&head->so_timeo);
	}
	return (so);
}
struct socket *
sonewconn(
	struct socket *head,
	int connstatus,
	const struct sockaddr *from)
{
	int error = 0;
	int filtered = 0;
	struct socket_filter_entry *filter;

	for (filter = head->so_filt; filter && (error == 0);
	     filter = filter->sfe_next_onsocket) {
		if (filter->sfe_filter->sf_filter.sf_connect_in) {
			if (filtered == 0) {
				filtered = 1;
				socket_unlock(head, 0);
			}
			error = filter->sfe_filter->sf_filter.sf_connect_in(
					filter->sfe_cookie, head, from);
		}
	}
	if (filtered != 0)
		socket_lock(head, 0);

	if (error)
		return NULL;

	return sonewconn_internal(head, connstatus);
}
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
void
socantsendmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTSENDMORE;
	sflt_notify(so, sock_evt_cantsendmore, NULL);
	sowwakeup(so);
}

void
socantrcvmore(so)
	struct socket *so;
{
	so->so_state |= SS_CANTRCVMORE;
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
	sorwakeup(so);
}
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
int
sbwait(sb)
	struct sockbuf *sb;
{
	int error = 0, lr, lr_saved;
	struct socket *so = sb->sb_so;
	lck_mtx_t *mutex_held;
	struct timespec ts;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	sb->sb_flags |= SB_WAIT;

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);
	ts.tv_sec = sb->sb_timeo.tv_sec;
	ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
	error = msleep((caddr_t)&sb->sb_cc, mutex_held,
		(sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
		&ts);

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (so->so_usecount < 1)
		panic("sbwait: so=%x refcount=%d\n", so, so->so_usecount);

	if ((so->so_state & SS_DRAINING)) {
		error = EBADF;
	}

	return (error);
}
/*
 * Lock a sockbuf already known to be locked;
 * return any error returned from sleep (EINTR).
 */
int
sb_lock(sb)
	register struct sockbuf *sb;
{
	struct socket *so = sb->sb_so;
	lck_mtx_t * mutex_held;
	int error = 0, lr, lr_saved;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif

	if (so == NULL)
		panic("sb_lock: null so back pointer sb=%x\n", sb);

	while (sb->sb_flags & SB_LOCK) {
		sb->sb_flags |= SB_WANT;
		if (so->so_proto->pr_getlock != NULL)
			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		else
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		if (so->so_usecount < 1)
			panic("sb_lock: so=%x refcount=%d\n", so, so->so_usecount);
		error = msleep((caddr_t)&sb->sb_flags, mutex_held,
			(sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sblock", 0);
		if (so->so_usecount < 1)
			panic("sb_lock: 2 so=%x refcount=%d\n", so, so->so_usecount);
		if (error)
			return (error);
	}
	sb->sb_flags |= SB_LOCK;
	return (0);
}
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
void
sowakeup(so, sb)
	register struct socket *so;
	register struct sockbuf *sb;
{
	struct proc *p = current_proc();

	sb->sb_flags &= ~SB_SEL;
	selwakeup(&sb->sb_sel);
	if (sb->sb_flags & SB_WAIT) {
		sb->sb_flags &= ~SB_WAIT;
		wakeup((caddr_t)&sb->sb_cc);
	}
	if (so->so_state & SS_ASYNC) {
		if (so->so_pgid < 0)
			gsignal(-so->so_pgid, SIGIO);
		else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
			psignal(p, SIGIO);
	}
	if (sb->sb_flags & SB_KNOTE) {
		KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
	}
	if (sb->sb_flags & SB_UPCALL) {
		socket_unlock(so, 0);
		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
		socket_lock(so, 0);
	}
}
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
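/*
 * Illustrative sketch (not part of the original source): the reserve step
 * described above, as a hypothetical protocol attach routine would perform
 * it.  EXAMPLE_SNDSPACE/EXAMPLE_RCVSPACE are made-up constants.
 */
#if 0	/* example only, not compiled */
#define EXAMPLE_SNDSPACE	(8 * 1024)
#define EXAMPLE_RCVSPACE	(8 * 1024)

static int
example_pr_attach(struct socket *so)
{
	int error;

	/* commit send and receive buffer space before the socket is used */
	error = soreserve(so, EXAMPLE_SNDSPACE, EXAMPLE_RCVSPACE);
	if (error)
		return (error);		/* ENOBUFS if the limits were exceeded */
	return (0);
}
#endif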
int
soreserve(so, sndcc, rcvcc)
	register struct socket *so;
	u_long sndcc, rcvcc;
{

	if (sbreserve(&so->so_snd, sndcc) == 0)
		goto bad;
	if (sbreserve(&so->so_rcv, rcvcc) == 0)
		goto bad2;
	if (so->so_rcv.sb_lowat == 0)
		so->so_rcv.sb_lowat = 1;
	if (so->so_snd.sb_lowat == 0)
		so->so_snd.sb_lowat = MCLBYTES;
	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
	return (0);
bad2:
	selthreadclear(&so->so_snd.sb_sel);
	sbrelease(&so->so_snd);
bad:
	return (ENOBUFS);
}
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
int
sbreserve(sb, cc)
	struct sockbuf *sb;
	u_long cc;
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
		return (0);
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat)
		sb->sb_lowat = sb->sb_hiwat;
	return (1);
}
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/*  WARNING needs to do selthreadclear() before calling this */
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
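/*
 * Illustrative sketch (not part of the original source): the receive-side
 * pattern described above for a datagram protocol.  "from" would be the
 * sender's address built by the protocol; error handling is elided.
 */
#if 0	/* example only, not compiled */
static void
example_deliver_datagram(struct socket *so, struct sockaddr *from, struct mbuf *m)
{
	if (sbappendaddr(&so->so_rcv, from, m, NULL, NULL))
		sorwakeup(so);	/* the new record is now visible to soreceive() */
}
#endif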
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
int
sbappend(sb, m)
	struct sockbuf *sb;
	struct mbuf *m;
{
	register struct mbuf *n, *sb_first;
	int result = 0;
	int error = 0;
	int filtered = 0;

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_START), sb, m->m_len, 0, 0, 0);

	if (m == 0)
		return 0;

again:
	sb_first = n = sb->sb_mb;
	if (n) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		do {
			if (n->m_flags & M_EOR) {
				result = sbappendrecord(sb, m); /* XXXXXX!!!! */
				KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);
				return result;
			}
		} while (n->m_next && (n = n->m_next));
	}

	if (!filtered && (sb->sb_flags & SB_RECV) != 0) {
		error = sflt_data_in(sb->sb_so, NULL, &m, NULL, 0, &filtered);
		if (error) {
			/* no data was appended, caller should not call sowakeup */
			return 0;
		}

		/*
		 * If we ran any filters, the socket lock was dropped.  n and sb_first
		 * cached data from the socket buffer.  This cache is not valid
		 * since we dropped the lock.  We must start over.  Since filtered
		 * is set we won't run through the filters a second time.  We just
		 * set n and sb_first again.
		 */
		if (filtered)
			goto again;
	}

	result = sbcompress(sb, m, n);

	KERNEL_DEBUG((DBG_FNC_SBAPPEND | DBG_FUNC_END), sb, sb->sb_cc, 0, 0, 0);

	return result;
}
void
sbcheck(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m;
	register struct mbuf *n = 0;
	register u_long len = 0, mbcnt = 0;
	lck_mtx_t *mutex_held;

	if (sb->sb_so->so_proto->pr_getlock != NULL)
		mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
	else
		mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	for (m = sb->sb_mb; m; m = n) {
		n = m->m_nextpkt;
		for (; m; m = m->m_next) {
			len += m->m_len;
			mbcnt += MSIZE;
			if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
				mbcnt += m->m_ext.ext_size;
		}
	}
	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
		panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
		    mbcnt, sb->sb_mbcnt);
	}
}
/*
 * As above, except the mbuf chain
 * begins a new record.
 */
int
sbappendrecord(sb, m0)
	register struct sockbuf *sb;
	register struct mbuf *m0;
{
	register struct mbuf *m;
	int result = 0;

	if (m0 == 0)
		return 0;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL, sock_data_filt_flag_record, NULL);
		if (error != 0) {
			if (error != EJUSTRETURN)
				m_freem(m0);
			return 0;
		}
	}

	m = sb->sb_mb;
	if (m)
		while (m->m_nextpkt)
			m = m->m_nextpkt;
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	if (m)
		m->m_nextpkt = m0;
	else
		sb->sb_mb = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * As above except that OOB data
 * is inserted at the beginning of the sockbuf,
 * but after any other OOB data.
 */
int
sbinsertoob(sb, m0)
	struct sockbuf *sb;
	struct mbuf *m0;
{
	struct mbuf *m;
	struct mbuf **mp;

	if ((sb->sb_flags & SB_RECV) != 0) {
		int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
			sock_data_filt_flag_oob, NULL);

		if (error) {
			if (error != EJUSTRETURN) {
				m_freem(m0);
			}
			return 0;
		}
	}

	for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
		m = *mp;
	again:
		switch (m->m_type) {

		case MT_OOBDATA:
			continue;		/* WANT next train */

		case MT_CONTROL:
			m = m->m_next;
			if (m)
				goto again;	/* inspect THIS train further */
		}
		break;
	}
	/*
	 * Put the first mbuf on the queue.
	 * Note this permits zero length records.
	 */
	sballoc(sb, m0);
	m0->m_nextpkt = *mp;
	*mp = m0;
	m = m0->m_next;
	m0->m_next = 0;
	if (m && (m0->m_flags & M_EOR)) {
		m0->m_flags &= ~M_EOR;
		m->m_flags |= M_EOR;
	}
	return sbcompress(sb, m, m0);
}
/*
 * Append address and data, and optionally, control (ancillary) data
 * to the receive queue of a socket.  If present,
 * m0 must include a packet header with total length.
 * Returns 0 if no space in sockbuf or insufficient mbufs.
 */
static int
sbappendaddr_internal(sb, asa, m0, control)
	register struct sockbuf *sb;
	struct sockaddr *asa;
	struct mbuf *m0, *control;
{
	register struct mbuf *m, *n;
	int space = asa->sa_len;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddr");

	if (m0)
		space += m0->m_pkthdr.len;
	for (n = control; n; n = n->m_next) {
		space += n->m_len;
		if (n->m_next == 0)	/* keep pointer to last control buf */
			break;
	}
	if (space > sbspace(sb))
		return (0);
	if (asa->sa_len > MLEN)
		return (0);
	MGET(m, M_DONTWAIT, MT_SONAME);
	if (m == 0)
		return (0);
	m->m_len = asa->sa_len;
	bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
	if (n)
		n->m_next = m0;		/* concatenate data to control */
	else
		control = m0;
	m->m_next = control;
	for (n = m; n; n = n->m_next)
		sballoc(sb, n);
	if ((n = sb->sb_mb) != NULL) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = m;
	} else
		sb->sb_mb = m;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendaddr(
	struct sockbuf *sb,
	struct sockaddr *asa,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
		panic("sbappendaddrorfree");

	/* Call socket data in filters */
	if ((sb->sb_flags & SB_RECV) != 0) {
		int error;
		error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendaddr_internal(sb, asa, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
static int
sbappendcontrol_internal(sb, m0, control)
	struct sockbuf *sb;
	struct mbuf *control, *m0;
{
	register struct mbuf *m, *n;
	int space = 0;

	if (control == 0)
		panic("sbappendcontrol");

	for (m = control; ; m = m->m_next) {
		space += m->m_len;
		if (m->m_next == 0)
			break;
	}
	n = m;			/* save pointer to last control buffer */
	for (m = m0; m; m = m->m_next)
		space += m->m_len;
	if (space > sbspace(sb))
		return (0);
	n->m_next = m0;			/* concatenate data to control */
	for (m = control; m; m = m->m_next)
		sballoc(sb, m);
	if ((n = sb->sb_mb) != NULL) {
		while (n->m_nextpkt)
			n = n->m_nextpkt;
		n->m_nextpkt = control;
	} else
		sb->sb_mb = control;
	postevent(0, sb, EV_RWBYTES);
	return (1);
}
int
sbappendcontrol(
	struct sockbuf *sb,
	struct mbuf *m0,
	struct mbuf *control,
	int *error_out)
{
	int result = 0;

	if (error_out) *error_out = 0;

	if (sb->sb_flags & SB_RECV) {
		int error;
		error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0, NULL);
		if (error) {
			if (error != EJUSTRETURN) {
				if (m0) m_freem(m0);
				if (control) m_freem(control);
				if (error_out) *error_out = error;
			}
			return 0;
		}
	}

	result = sbappendcontrol_internal(sb, m0, control);
	if (result == 0) {
		if (m0) m_freem(m0);
		if (control) m_freem(control);
		if (error_out) *error_out = ENOBUFS;
	}

	return result;
}
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
int
sbcompress(sb, m, n)
	register struct sockbuf *sb;
	register struct mbuf *m, *n;
{
	register int eor = 0;
	register struct mbuf *o;

	while (m) {
		eor |= m->m_flags & M_EOR;
		if (m->m_len == 0 &&
		    (eor == 0 ||
		     (((o = m->m_next) || (o = n)) &&
		      o->m_type == m->m_type))) {
			m = m_free(m);
			continue;
		}
		if (n && (n->m_flags & M_EOR) == 0 &&
		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
		    m->m_len <= M_TRAILINGSPACE(n) &&
		    n->m_type == m->m_type) {
			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
			    (unsigned)m->m_len);
			n->m_len += m->m_len;
			sb->sb_cc += m->m_len;
			m = m_free(m);
			continue;
		}
		if (n)
			n->m_next = m;
		else
			sb->sb_mb = m;
		sballoc(sb, m);
		n = m;
		m->m_flags &= ~M_EOR;
		m = m->m_next;
		n->m_next = 0;
	}
	if (eor) {
		if (n)
			n->m_flags |= eor;
		else
			printf("semi-panic: sbcompress\n");
	}
	postevent(0, sb, EV_RWBYTES);
	return 1;
}
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
void
sbflush(sb)
	register struct sockbuf *sb;
{
	if (sb->sb_so == NULL)
		panic ("sbflush sb->sb_so already null sb=%x\n", sb);
	(void)sblock(sb, M_WAIT);
	while (sb->sb_mbcnt) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever. Panic instead.
		 */
		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
			break;
		sbdrop(sb, (int)sb->sb_cc);
	}
	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt || sb->sb_so == NULL)
		panic("sbflush: cc %ld || mb %p || mbcnt %ld sb_so=%x", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt, sb->sb_so);

	postevent(0, sb, EV_RWBYTES);
	sbunlock(sb, 1);	/* keep socket locked */
}
/*
 * Drop data from (the front of) a sockbuf.
 * use m_freem_list to free the mbuf structures
 * under a single lock... this is done by pruning
 * the top of the tree from the body by keeping track
 * of where we get to in the tree and then zeroing the
 * two pertinent pointers m_nextpkt and m_next
 * the socket buffer is then updated to point at the new
 * top of the tree and the pruned area is released via
 * m_freem_list.
 */
void
sbdrop(sb, len)
	register struct sockbuf *sb;
	register int len;
{
	register struct mbuf *m, *free_list, *ml;
	struct mbuf *next, *last;

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);

	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
	free_list = last = m;
	ml = (struct mbuf *)0;

	while (len > 0) {
		if (m == 0) {
			if (next == 0) {
				/* temporarily replacing this panic with printf because
				 * it occurs occasionally when closing a socket when there
				 * is no harm in ignoring it.  This problem will be investigated
				 * further.
				 */
				/* panic("sbdrop"); */
				printf("sbdrop - count not zero\n");
				len = 0;
				/* zero the counts. if we have no mbufs, we have no data (PR-2986815) */
				sb->sb_cc = 0;
				sb->sb_mbcnt = 0;
				break;
			}
			m = last = next;
			next = m->m_nextpkt;
			continue;
		}
		if (m->m_len > len) {
			m->m_len -= len;
			m->m_data += len;
			sb->sb_cc -= len;
			break;
		}
		len -= m->m_len;
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	while (m && m->m_len == 0) {
		sbfree(sb, m);

		ml = m;
		m = m->m_next;
	}
	if (ml) {
		ml->m_next = (struct mbuf *)0;
		last->m_nextpkt = (struct mbuf *)0;
		m_freem_list(free_list);
	}
	if (m) {
		sb->sb_mb = m;
		m->m_nextpkt = next;
	} else
		sb->sb_mb = next;

	postevent(0, sb, EV_RWBYTES);

	KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
}
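/*
 * Illustrative sketch (not part of the original source): how a reliable
 * protocol would use sbdrop() above when an acknowledgement arrives, as
 * described in the comment block introducing these routines.  "acked" is a
 * hypothetical byte count taken from the protocol's own state.
 */
#if 0	/* example only, not compiled */
static void
example_input_ack(struct socket *so, int acked)
{
	sbdrop(&so->so_snd, acked);	/* release the acknowledged bytes */
	sowwakeup(so);			/* writers may now have room again */
}
#endif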
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
void
sbdroprecord(sb)
	register struct sockbuf *sb;
{
	register struct mbuf *m, *mn;

	m = sb->sb_mb;
	if (m) {
		sb->sb_mb = m->m_nextpkt;
		do {
			sbfree(sb, m);
			MFREE(m, mn);
			m = mn;
		} while (m);
	}
	postevent(0, sb, EV_RWBYTES);
}
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
struct mbuf *
sbcreatecontrol(p, size, type, level)
	caddr_t p;
	register int size;
	int type, level;
{
	register struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN)
		return ((struct mbuf *) NULL);
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
		return ((struct mbuf *) NULL);
	cp = mtod(m, struct cmsghdr *);
	/* XXX check size? */
	(void)memcpy(CMSG_DATA(cp), p, size);
	m->m_len = CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return (m);
}
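/*
 * Illustrative sketch (not part of the original source): building a control
 * mbuf carrying a timestamp with sbcreatecontrol() above and appending it
 * together with data, the way a protocol's receive path might.  Error
 * handling is elided.
 */
#if 0	/* example only, not compiled */
static void
example_deliver_with_timestamp(struct socket *so, struct mbuf *data)
{
	struct timeval tv;
	struct mbuf *control;

	microtime(&tv);
	control = sbcreatecontrol((caddr_t)&tv, sizeof (tv), SCM_TIMESTAMP, SOL_SOCKET);
	if (sbappendcontrol(&so->so_rcv, data, control, NULL))
		sorwakeup(so);
}
#endif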
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
int
pru_abort_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_connect2_notsupp(struct socket *so1, struct socket *so2)
{
	return EOPNOTSUPP;
}

int
pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
		    struct ifnet *ifp, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_detach_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_disconnect_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_listen_notsupp(struct socket *so, struct proc *p)
{
	return EOPNOTSUPP;
}

int
pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int
pru_rcvd_notsupp(struct socket *so, int flags)
{
	return EOPNOTSUPP;
}

int
pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
{
	return EOPNOTSUPP;
}

int
pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
		 struct sockaddr *addr, struct mbuf *control,
		 struct proc *p)
{
	return EOPNOTSUPP;
}

/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
int
pru_sense_null(struct socket *so, struct stat *sb)
{
	sb->st_blksize = so->so_snd.sb_hiwat;
	return 0;
}

int	pru_sosend_notsupp(struct socket *so, struct sockaddr *addr,
		   struct uio *uio, struct mbuf *top,
		   struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int	pru_soreceive_notsupp(struct socket *so,
		      struct sockaddr **paddr,
		      struct uio *uio, struct mbuf **mp0,
		      struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_shutdown_notsupp(struct socket *so)
{
	return EOPNOTSUPP;
}

int
pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
{
	return EOPNOTSUPP;
}

int	pru_sosend(struct socket *so, struct sockaddr *addr,
	   struct uio *uio, struct mbuf *top,
	   struct mbuf *control, int flags)
{
	return EOPNOTSUPP;
}

int	pru_soreceive(struct socket *so,
	      struct sockaddr **paddr,
	      struct uio *uio, struct mbuf **mp0,
	      struct mbuf **controlp, int *flagsp)
{
	return EOPNOTSUPP;
}

int
pru_sopoll_notsupp(__unused struct socket *so, __unused int events,
		   __unused kauth_cred_t cred, __unused void *wql)
{
	return EOPNOTSUPP;
}
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */

int
sb_notify(struct sockbuf *sb)
{
	return ((sb->sb_flags & (SB_WAIT|SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)) != 0);
}

/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematical if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.  Should use "lmin" but it doesn't exist now.
 */
long
sbspace(struct sockbuf *sb)
{
	return ((long) imin((int)(sb->sb_hiwat - sb->sb_cc),
	    (int)(sb->sb_mbmax - sb->sb_mbcnt)));
}

/* do we have to send all at once on a socket? */
int
sosendallatonce(struct socket *so)
{
	return (so->so_proto->pr_flags & PR_ATOMIC);
}

/* can we read something from so? */
int
soreadable(struct socket *so)
{
	return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
	    (so->so_state & SS_CANTRCVMORE) ||
	    so->so_comp.tqh_first || so->so_error);
}

/* can we write something to so? */
int
sowriteable(struct socket *so)
{
	return ((sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat &&
	    ((so->so_state & SS_ISCONNECTED) ||
	    (so->so_proto->pr_flags & PR_CONNREQUIRED) == 0)) ||
	    (so->so_state & SS_CANTSENDMORE) ||
	    so->so_error);
}
/* adjust counters in sb reflecting allocation of m */
void
sballoc(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc += m->m_len;
	sb->sb_mbcnt += MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt += m->m_ext.ext_size;
}

/* adjust counters in sb reflecting freeing of m */
void
sbfree(struct sockbuf *sb, struct mbuf *m)
{
	sb->sb_cc -= m->m_len;
	sb->sb_mbcnt -= MSIZE;
	if (m->m_flags & M_EXT)
		sb->sb_mbcnt -= m->m_ext.ext_size;
}
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
int
sblock(struct sockbuf *sb, int wf)
{
	return(sb->sb_flags & SB_LOCK ?
		((wf == M_WAIT) ? sb_lock(sb) : EWOULDBLOCK) :
		(sb->sb_flags |= SB_LOCK), 0);
}
/* release lock on sockbuf sb */
void
sbunlock(struct sockbuf *sb, int keeplocked)
{
	struct socket *so = sb->sb_so;
	int lr, lr_saved;
	lck_mtx_t *mutex_held;

#ifdef __ppc__
	__asm__ volatile("mflr %0" : "=r" (lr));
	lr_saved = lr;
#endif
	sb->sb_flags &= ~SB_LOCK;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	if (keeplocked == 0)
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (sb->sb_flags & SB_WANT) {
		sb->sb_flags &= ~SB_WANT;
		if (so->so_usecount < 0)
			panic("sbunlock: b4 wakeup so=%x ref=%d lr=%x sb_flags=%x\n", sb->sb_so, so->so_usecount, lr_saved, sb->sb_flags);

		wakeup((caddr_t)&(sb)->sb_flags);
	}
	if (keeplocked == 0) {	/* unlock on exit */
		so->so_usecount--;
		if (so->so_usecount < 0)
			panic("sbunlock: unlock on exit so=%x ref=%d lr=%x sb_flags=%x\n", so, so->so_usecount, lr_saved, sb->sb_flags);
		so->reserved4 = lr_saved;
		lck_mtx_unlock(mutex_held);
	}
}
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv))
		sowakeup(so, &so->so_rcv);
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd))
		sowakeup(so, &so->so_snd);
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(sa, canwait)
	struct sockaddr *sa;
	int canwait;
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2)
		bcopy(sa, sa2, sa->sa_len);
	return sa2;
}
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof *xso;
	xso->xso_so = so;
	xso->so_type = so->so_type;
	xso->so_options = so->so_options;
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = so->so_pcb;
	if (so->so_proto) {
		xso->xso_protocol = so->so_proto->pr_protocol;
		xso->xso_family = so->so_proto->pr_domain->dom_family;
	} else
		xso->xso_protocol = xso->xso_family = 0;
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = so->so_uid;
}
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = sb->sb_flags;
	xsb->sb_timeo = (u_long)(sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick;
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0)
		xsb->sb_timeo = 1;
}
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
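/*
 * Illustrative sketch (not part of the original source): reading one of the
 * kern.ipc values declared below from user space.
 */
#if 0	/* example only, not compiled into the kernel */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int maxsockbuf;
	size_t len = sizeof (maxsockbuf);

	if (sysctlbyname("kern.ipc.maxsockbuf", &maxsockbuf, &len, NULL, 0) == 0)
		printf("kern.ipc.maxsockbuf = %d\n", maxsockbuf);
	return (0);
}
#endif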
SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");

/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
static int dummy;
SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW,
    &sb_max, 0, "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD,
    &maxsockets, 0, "Maximum number of sockets available");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
    &sb_efficiency, 0, "");
SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, "");