/*
 * Copyright (c) 1998-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/malloc.h>
#include <sys/mcache.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/unpcb.h>
#include <kern/locks.h>
#include <net/route.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/tcp_var.h>
#include <sys/kdebug.h>
#include <libkern/OSAtomic.h>
#include <security/mac_framework.h>
#include <mach/vm_param.h>
#include <netinet/mptcp_var.h>

#define DBG_FNC_SBDROP          NETDBG_CODE(DBG_NETSOCK, 4)
#define DBG_FNC_SBAPPEND        NETDBG_CODE(DBG_NETSOCK, 5)

SYSCTL_DECL(_kern_ipc);

__private_extern__ u_int32_t net_io_policy_throttle_best_effort = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, throttle_best_effort,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_io_policy_throttle_best_effort, 0, "");

static inline void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);
static struct socket *sonewconn_internal(struct socket *, int);
static int sbappendcontrol_internal(struct sockbuf *, struct mbuf *,
    struct mbuf *);
static void soevent_ifdenied(struct socket *);

static int sbappendrecord_common(struct sockbuf *sb, struct mbuf *m0, boolean_t nodrop);
static int sbappend_common(struct sockbuf *sb, struct mbuf *m, boolean_t nodrop);
/*
 * Primitive routines for operating on sockets and socket buffers
 */
static int soqlimitcompat = 1;
static int soqlencomp = 0;

/*
 * Based on the number of mbuf clusters configured, high_sb_max and sb_max can
 * get scaled up or down to suit that memory configuration. high_sb_max is a
 * higher limit on sb_max that is checked when sb_max gets set through sysctl.
 */
u_int32_t       sb_max = SB_MAX;                /* XXX should be static */
u_int32_t       high_sb_max = SB_MAX;

static u_int32_t sb_efficiency = 8;             /* parameter for sbreserve() */
int32_t total_sbmb_cnt __attribute__((aligned(8))) = 0;
int32_t total_sbmb_cnt_floor __attribute__((aligned(8))) = 0;
int32_t total_sbmb_cnt_peak __attribute__((aligned(8))) = 0;
int64_t sbmb_limreached __attribute__((aligned(8))) = 0;

u_int32_t net_io_policy_log = 0;                /* log socket policy changes */
#if CONFIG_PROC_UUID_POLICY
u_int32_t net_io_policy_uuid = 1;               /* enable UUID socket policy */
#endif /* CONFIG_PROC_UUID_POLICY */
/*
 * Procedures to manipulate state flags of socket
 * and do appropriate wakeups.  Normal sequence from the
 * active (originating) side is that soisconnecting() is
 * called during processing of connect() call,
 * resulting in an eventual call to soisconnected() if/when the
 * connection is established.  When the connection is torn down
 * soisdisconnecting() is called during processing of disconnect() call,
 * and soisdisconnected() is called when the connection to the peer
 * is totally severed.  The semantics of these routines are such that
 * connectionless protocols can call soisconnected() and soisdisconnected()
 * only, bypassing the in-progress calls when setting up a ``connection''
 * takes no time.
 *
 * From the passive side, a socket is created with
 * two queues of sockets: so_incomp for connections in progress
 * and so_comp for connections already made and awaiting user acceptance.
 * As a protocol is preparing incoming connections, it creates a socket
 * structure queued on so_incomp by calling sonewconn().  When the connection
 * is established, soisconnected() is called, and transfers the
 * socket structure to so_comp, making it available to accept().
 *
 * If a socket is closed with sockets on either
 * so_incomp or so_comp, these sockets are dropped.
 *
 * If higher level protocols are implemented in
 * the kernel, the wakeups done here will sometimes
 * cause software-interrupt process scheduling.
 */
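/*
 * Illustrative sketch (not part of this file's build): a connection-oriented
 * protocol typically drives the state helpers below from its own state
 * machine.  The call sites named here are assumptions, and the socket is
 * assumed to be locked by the caller.
 *
 *	// active open: PRU_CONNECT handler
 *	soisconnecting(so);
 *	... emit the protocol's connection request ...
 *
 *	// later, when the handshake completes in the input path
 *	soisconnected(so);
 *
 *	// teardown
 *	soisdisconnecting(so);      // local close initiated
 *	soisdisconnected(so);       // connection to the peer fully severed
 */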
void
soisconnecting(struct socket *so)
{
	so->so_state &= ~(SS_ISCONNECTED | SS_ISDISCONNECTING);
	so->so_state |= SS_ISCONNECTING;

	sflt_notify(so, sock_evt_connecting, NULL);
}
190 soisconnected(struct socket
*so
)
193 * If socket is subject to filter and is pending initial verdict,
194 * delay marking socket as connected and do not present the connected
195 * socket to user just yet.
197 if (cfil_sock_connected_pending_verdict(so
)) {
201 so
->so_state
&= ~(SS_ISCONNECTING
| SS_ISDISCONNECTING
| SS_ISCONFIRMING
);
202 so
->so_state
|= SS_ISCONNECTED
;
204 soreserve_preconnect(so
, 0);
206 sflt_notify(so
, sock_evt_connected
, NULL
);
208 if (so
->so_head
!= NULL
&& (so
->so_state
& SS_INCOMP
)) {
209 struct socket
*head
= so
->so_head
;
213 * Enforce lock order when the protocol has per socket locks
215 if (head
->so_proto
->pr_getlock
!= NULL
) {
216 socket_lock(head
, 1);
217 so_acquire_accept_list(head
, so
);
220 if (so
->so_head
== head
&& (so
->so_state
& SS_INCOMP
)) {
221 so
->so_state
&= ~SS_INCOMP
;
222 so
->so_state
|= SS_COMP
;
223 TAILQ_REMOVE(&head
->so_incomp
, so
, so_list
);
224 TAILQ_INSERT_TAIL(&head
->so_comp
, so
, so_list
);
228 * We have to release the accept list in
229 * case a socket callback calls sock_accept()
232 so_release_accept_list(head
);
233 socket_unlock(so
, 0);
236 wakeup_one((caddr_t
)&head
->so_timeo
);
239 socket_unlock(head
, 1);
242 } else if (locked
!= 0) {
243 so_release_accept_list(head
);
244 socket_unlock(head
, 1);
247 wakeup((caddr_t
)&so
->so_timeo
);
250 soevent(so
, SO_FILT_HINT_LOCKED
| SO_FILT_HINT_CONNECTED
|
251 SO_FILT_HINT_CONNINFO_UPDATED
);
boolean_t
socanwrite(struct socket *so)
{
	return (so->so_state & SS_ISCONNECTED) ||
	       !(so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	       (so->so_flags1 & SOF1_PRECONNECT_DATA);
}
void
soisdisconnecting(struct socket *so)
{
	so->so_state &= ~SS_ISCONNECTING;
	so->so_state |= (SS_ISDISCONNECTING | SS_CANTRCVMORE | SS_CANTSENDMORE);
	soevent(so, SO_FILT_HINT_LOCKED);
	sflt_notify(so, sock_evt_disconnecting, NULL);
	wakeup((caddr_t)&so->so_timeo);
}
void
soisdisconnected(struct socket *so)
{
	so->so_state &= ~(SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE | SS_CANTSENDMORE | SS_ISDISCONNECTED);
	soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED |
	    SO_FILT_HINT_CONNINFO_UPDATED);
	sflt_notify(so, sock_evt_disconnected, NULL);
	wakeup((caddr_t)&so->so_timeo);

#if CONTENT_FILTER
	/* Notify content filters as soon as we cannot send/receive data */
	cfil_sock_notify_shutdown(so, SHUT_RDWR);
#endif /* CONTENT_FILTER */
}
/*
 * This function will issue a wakeup like soisdisconnected but it will not
 * notify the socket filters. This will avoid unlocking the socket
 * in the midst of closing it.
 */
void
sodisconnectwakeup(struct socket *so)
{
	so->so_state &= ~(SS_ISCONNECTING | SS_ISCONNECTED | SS_ISDISCONNECTING);
	so->so_state |= (SS_CANTRCVMORE | SS_CANTSENDMORE | SS_ISDISCONNECTED);
	soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED |
	    SO_FILT_HINT_CONNINFO_UPDATED);
	wakeup((caddr_t)&so->so_timeo);

#if CONTENT_FILTER
	/* Notify content filters as soon as we cannot send/receive data */
	cfil_sock_notify_shutdown(so, SHUT_RDWR);
#endif /* CONTENT_FILTER */
}
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.)
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
 */
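/*
 * Illustrative sketch (not part of this file's build): a protocol's input
 * path asks the listening socket for a child socket when a connection
 * request arrives.  `from' is the peer's address and the listener `head'
 * is assumed to be locked; the surrounding names are assumptions.
 *
 *	struct socket *so2 = sonewconn(head, 0, (struct sockaddr *)&from);
 *	if (so2 == NULL) {
 *		// queue limit reached or a socket filter rejected it; drop
 *		return;
 *	}
 *	// ... attach protocol state to so2; later, soisconnected(so2)
 *	// moves it from so_incomp to so_comp where accept() can find it.
 */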
323 static struct socket
*
324 sonewconn_internal(struct socket
*head
, int connstatus
)
326 int so_qlen
, error
= 0;
328 lck_mtx_t
*mutex_held
;
330 if (head
->so_proto
->pr_getlock
!= NULL
) {
331 mutex_held
= (*head
->so_proto
->pr_getlock
)(head
, 0);
333 mutex_held
= head
->so_proto
->pr_domain
->dom_mtx
;
335 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
339 * This is the default case; so_qlen represents the
340 * sum of both incomplete and completed queues.
342 so_qlen
= head
->so_qlen
;
345 * When kern.ipc.soqlencomp is set to 1, so_qlen
346 * represents only the completed queue. Since we
347 * cannot let the incomplete queue goes unbounded
348 * (in case of SYN flood), we cap the incomplete
349 * queue length to at most somaxconn, and use that
350 * as so_qlen so that we fail immediately below.
352 so_qlen
= head
->so_qlen
- head
->so_incqlen
;
353 if (head
->so_incqlen
> somaxconn
) {
359 (soqlimitcompat
? head
->so_qlimit
: (3 * head
->so_qlimit
/ 2))) {
360 return (struct socket
*)0;
362 so
= soalloc(1, SOCK_DOM(head
), head
->so_type
);
364 return (struct socket
*)0;
366 /* check if head was closed during the soalloc */
367 if (head
->so_proto
== NULL
) {
369 return (struct socket
*)0;
372 so
->so_type
= head
->so_type
;
373 so
->so_options
= head
->so_options
& ~SO_ACCEPTCONN
;
374 so
->so_linger
= head
->so_linger
;
375 so
->so_state
= head
->so_state
| SS_NOFDREF
;
376 so
->so_proto
= head
->so_proto
;
377 so
->so_timeo
= head
->so_timeo
;
378 so
->so_pgid
= head
->so_pgid
;
379 kauth_cred_ref(head
->so_cred
);
380 so
->so_cred
= head
->so_cred
;
381 so
->last_pid
= head
->last_pid
;
382 so
->last_upid
= head
->last_upid
;
383 memcpy(so
->last_uuid
, head
->last_uuid
, sizeof(so
->last_uuid
));
384 if (head
->so_flags
& SOF_DELEGATED
) {
385 so
->e_pid
= head
->e_pid
;
386 so
->e_upid
= head
->e_upid
;
387 memcpy(so
->e_uuid
, head
->e_uuid
, sizeof(so
->e_uuid
));
389 /* inherit socket options stored in so_flags */
390 so
->so_flags
= head
->so_flags
&
391 (SOF_NOSIGPIPE
| SOF_NOADDRAVAIL
| SOF_REUSESHAREUID
|
392 SOF_NOTIFYCONFLICT
| SOF_BINDRANDOMPORT
| SOF_NPX_SETOPTSHUT
|
393 SOF_NODEFUNCT
| SOF_PRIVILEGED_TRAFFIC_CLASS
| SOF_NOTSENT_LOWAT
|
395 so
->so_flags1
|= SOF1_INBOUND
;
397 so
->next_lock_lr
= 0;
398 so
->next_unlock_lr
= 0;
400 so
->so_rcv
.sb_flags
|= SB_RECV
; /* XXX */
401 so
->so_rcv
.sb_so
= so
->so_snd
.sb_so
= so
;
403 /* inherit traffic management properties of listener */
405 head
->so_flags1
& (SOF1_TRAFFIC_MGT_SO_BACKGROUND
| SOF1_TC_NET_SERV_TYPE
|
406 SOF1_QOSMARKING_ALLOWED
| SOF1_QOSMARKING_POLICY_OVERRIDE
);
407 so
->so_background_thread
= head
->so_background_thread
;
408 so
->so_traffic_class
= head
->so_traffic_class
;
409 so
->so_netsvctype
= head
->so_netsvctype
;
411 if (soreserve(so
, head
->so_snd
.sb_hiwat
, head
->so_rcv
.sb_hiwat
)) {
413 return (struct socket
*)0;
415 so
->so_rcv
.sb_flags
|= (head
->so_rcv
.sb_flags
& SB_USRSIZE
);
416 so
->so_snd
.sb_flags
|= (head
->so_snd
.sb_flags
& SB_USRSIZE
);
419 * Must be done with head unlocked to avoid deadlock
420 * for protocol with per socket mutexes.
422 if (head
->so_proto
->pr_unlock
) {
423 socket_unlock(head
, 0);
425 if (((*so
->so_proto
->pr_usrreqs
->pru_attach
)(so
, 0, NULL
) != 0) ||
428 if (head
->so_proto
->pr_unlock
) {
429 socket_lock(head
, 0);
431 return (struct socket
*)0;
433 if (head
->so_proto
->pr_unlock
) {
434 socket_lock(head
, 0);
436 * Radar 7385998 Recheck that the head is still accepting
437 * to avoid race condition when head is getting closed.
439 if ((head
->so_options
& SO_ACCEPTCONN
) == 0) {
440 so
->so_state
&= ~SS_NOFDREF
;
442 return (struct socket
*)0;
446 if (so
->so_proto
->pr_copy_last_owner
!= NULL
) {
447 (*so
->so_proto
->pr_copy_last_owner
)(so
, head
);
449 atomic_add_32(&so
->so_proto
->pr_domain
->dom_refs
, 1);
451 /* Insert in head appropriate lists */
452 so_acquire_accept_list(head
, NULL
);
457 * Since this socket is going to be inserted into the incomp
458 * queue, it can be picked up by another thread in
459 * tcp_dropdropablreq to get dropped before it is setup..
460 * To prevent this race, set in-progress flag which can be
463 so
->so_flags
|= SOF_INCOMP_INPROGRESS
;
466 TAILQ_INSERT_TAIL(&head
->so_comp
, so
, so_list
);
467 so
->so_state
|= SS_COMP
;
469 TAILQ_INSERT_TAIL(&head
->so_incomp
, so
, so_list
);
470 so
->so_state
|= SS_INCOMP
;
475 so_release_accept_list(head
);
477 /* Attach socket filters for this protocol */
481 so
->so_state
|= connstatus
;
483 wakeup((caddr_t
)&head
->so_timeo
);
490 sonewconn(struct socket
*head
, int connstatus
, const struct sockaddr
*from
)
492 int error
= sflt_connectin(head
, from
);
497 return sonewconn_internal(head
, connstatus
);
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it would normally be applied to a socket when the user
 * informs the system that no more data is to be sent, by the protocol
 * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
 * will be received, and will normally be applied to the socket by a
 * protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
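/*
 * Illustrative sketch (not part of this file's build): a protocol's shutdown
 * and input paths typically use these as follows (socket assumed locked):
 *
 *	// PRU_SHUTDOWN handler: the user will send no more data
 *	socantsendmore(so);
 *
 *	// input path: the peer signalled end-of-stream (e.g. a TCP FIN)
 *	socantrcvmore(so);
 *	sorwakeup(so);          // let readers drain what is already queued
 */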
void
socantsendmore(struct socket *so)
{
	so->so_state |= SS_CANTSENDMORE;
	soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTSENDMORE);
	sflt_notify(so, sock_evt_cantsendmore, NULL);
}

void
socantrcvmore(struct socket *so)
{
	so->so_state |= SS_CANTRCVMORE;
	soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE);
	sflt_notify(so, sock_evt_cantrecvmore, NULL);
}
/*
 * Wait for data to arrive at/drain from a socket buffer.
 */
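/*
 * Illustrative sketch (not part of this file's build): a blocking receive
 * path sleeps on the receive buffer until data shows up or the wait fails
 * (timeout, signal, defunct socket).  Socket assumed locked; the loop
 * condition is an assumption for illustration.
 *
 *	while (so->so_rcv.sb_cc < so->so_rcv.sb_lowat &&
 *	    !(so->so_state & SS_CANTRCVMORE)) {
 *		int error = sbwait(&so->so_rcv);
 *		if (error != 0)
 *			return error;
 *	}
 */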
532 sbwait(struct sockbuf
*sb
)
534 boolean_t nointr
= (sb
->sb_flags
& SB_NOINTR
);
535 void *lr_saved
= __builtin_return_address(0);
536 struct socket
*so
= sb
->sb_so
;
537 lck_mtx_t
*mutex_held
;
542 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
543 __func__
, sb
, sb
->sb_flags
, lr_saved
);
545 } else if (so
->so_usecount
< 1) {
546 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
547 "lrh= %s\n", __func__
, sb
, sb
->sb_flags
, so
,
548 so
->so_usecount
, lr_saved
, solockhistory_nr(so
));
552 if ((so
->so_state
& SS_DRAINING
) || (so
->so_flags
& SOF_DEFUNCT
)) {
554 if (so
->so_flags
& SOF_DEFUNCT
) {
555 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
556 "(%d)\n", __func__
, proc_selfpid(),
557 proc_best_name(current_proc()),
558 (uint64_t)VM_KERNEL_ADDRPERM(so
),
559 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
564 if (so
->so_proto
->pr_getlock
!= NULL
) {
565 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, PR_F_WILLUNLOCK
);
567 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
570 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
572 ts
.tv_sec
= sb
->sb_timeo
.tv_sec
;
573 ts
.tv_nsec
= sb
->sb_timeo
.tv_usec
* 1000;
576 VERIFY(sb
->sb_waiters
!= 0);
578 error
= msleep((caddr_t
)&sb
->sb_cc
, mutex_held
,
579 nointr
? PSOCK
: PSOCK
| PCATCH
,
580 nointr
? "sbwait_nointr" : "sbwait", &ts
);
582 VERIFY(sb
->sb_waiters
!= 0);
585 if (so
->so_usecount
< 1) {
586 panic("%s: 2 sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
587 "lrh= %s\n", __func__
, sb
, sb
->sb_flags
, so
,
588 so
->so_usecount
, lr_saved
, solockhistory_nr(so
));
592 if ((so
->so_state
& SS_DRAINING
) || (so
->so_flags
& SOF_DEFUNCT
)) {
594 if (so
->so_flags
& SOF_DEFUNCT
) {
595 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
596 "(%d)\n", __func__
, proc_selfpid(),
597 proc_best_name(current_proc()),
598 (uint64_t)VM_KERNEL_ADDRPERM(so
),
599 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
607 sbwakeup(struct sockbuf
*sb
)
609 if (sb
->sb_waiters
> 0) {
610 wakeup((caddr_t
)&sb
->sb_cc
);
/*
 * Wakeup processes waiting on a socket buffer.
 * Do asynchronous notification via SIGIO
 * if the socket has the SS_ASYNC flag set.
 */
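/*
 * Illustrative sketch (not part of this file's build): SS_ASYNC is normally
 * set on behalf of a user process that requested asynchronous I/O
 * notification, e.g. from user space:
 *
 *	fcntl(s, F_SETOWN, getpid());   // who receives SIGIO
 *	fcntl(s, F_SETFL, O_ASYNC);     // marks the socket for async signals
 *
 * After that, sowakeup() delivers SIGIO to the owning process or process
 * group whenever the buffer becomes readable or writable.
 */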
620 sowakeup(struct socket
*so
, struct sockbuf
*sb
, struct socket
*so2
)
622 if (so
->so_flags
& SOF_DEFUNCT
) {
623 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] si 0x%x, "
624 "fl 0x%x [%s]\n", __func__
, proc_selfpid(),
625 proc_best_name(current_proc()),
626 (uint64_t)VM_KERNEL_ADDRPERM(so
), SOCK_DOM(so
),
627 SOCK_TYPE(so
), (uint32_t)sb
->sb_sel
.si_flags
, sb
->sb_flags
,
628 (sb
->sb_flags
& SB_RECV
) ? "rcv" : "snd");
631 sb
->sb_flags
&= ~SB_SEL
;
632 selwakeup(&sb
->sb_sel
);
634 if (so
->so_state
& SS_ASYNC
) {
635 if (so
->so_pgid
< 0) {
636 gsignal(-so
->so_pgid
, SIGIO
);
637 } else if (so
->so_pgid
> 0) {
638 proc_signal(so
->so_pgid
, SIGIO
);
641 if (sb
->sb_flags
& SB_KNOTE
) {
642 KNOTE(&sb
->sb_sel
.si_note
, SO_FILT_HINT_LOCKED
);
644 if (sb
->sb_flags
& SB_UPCALL
) {
645 void (*sb_upcall
)(struct socket
*, void *, int);
646 caddr_t sb_upcallarg
;
647 int lock
= !(sb
->sb_flags
& SB_UPCALL_LOCK
);
649 sb_upcall
= sb
->sb_upcall
;
650 sb_upcallarg
= sb
->sb_upcallarg
;
651 /* Let close know that we're about to do an upcall */
652 so
->so_upcallusecount
++;
656 struct unpcb
*unp
= sotounpcb(so2
);
657 unp
->unp_flags
|= UNP_DONTDISCONNECT
;
660 socket_unlock(so2
, 0);
662 socket_unlock(so
, 0);
664 (*sb_upcall
)(so
, sb_upcallarg
, M_DONTWAIT
);
666 if (so2
&& so
> so2
) {
670 unp
= sotounpcb(so2
);
672 if (unp
->rw_thrcount
== 0) {
673 unp
->unp_flags
&= ~UNP_DONTDISCONNECT
;
680 if (so2
&& so
< so2
) {
684 unp
= sotounpcb(so2
);
686 if (unp
->rw_thrcount
== 0) {
687 unp
->unp_flags
&= ~UNP_DONTDISCONNECT
;
693 so
->so_upcallusecount
--;
694 /* Tell close that it's safe to proceed */
695 if ((so
->so_flags
& SOF_CLOSEWAIT
) &&
696 so
->so_upcallusecount
== 0) {
697 wakeup((caddr_t
)&so
->so_upcallusecount
);
702 * Trap disconnection events for content filters
704 if ((so
->so_flags
& SOF_CONTENT_FILTER
) != 0) {
705 if ((sb
->sb_flags
& SB_RECV
)) {
706 if (so
->so_state
& (SS_CANTRCVMORE
)) {
707 cfil_sock_notify_shutdown(so
, SHUT_RD
);
710 if (so
->so_state
& (SS_CANTSENDMORE
)) {
711 cfil_sock_notify_shutdown(so
, SHUT_WR
);
715 #endif /* CONTENT_FILTER */
/*
 * Socket buffer (struct sockbuf) utility routines.
 *
 * Each socket contains two socket buffers: one for sending data and
 * one for receiving data.  Each buffer contains a queue of mbufs,
 * information about the number of mbufs and amount of data in the
 * queue, and other fields allowing select() statements and notification
 * on data availability to be implemented.
 *
 * Data stored in a socket buffer is maintained as a list of records.
 * Each record is a list of mbufs chained together with the m_next
 * field.  Records are chained together with the m_nextpkt field.  The upper
 * level routine soreceive() expects the following conventions to be
 * observed when placing information in the receive buffer:
 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbuf's must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbuf's must be of type MT_RIGHTS).
 * 3. If a name or rights record exists, then it must be followed by
 *    a data record, perhaps of zero length.
 *
 * Before using a new socket structure it is first necessary to reserve
 * buffer space to the socket, by calling sbreserve().  This should commit
 * some of the available buffer space in the system buffer pool for the
 * socket (currently, it does nothing but enforce limits).  The space
 * should be released by calling sbrelease() when the socket is destroyed.
 */
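/*
 * Illustrative sketch (not part of this file's build): for a datagram
 * socket, one received packet typically becomes one record laid out as
 * described above.  With sender address `from', optional control data `ctl'
 * and payload chain `m' (names assumed, socket buffer locked), the protocol
 * would do:
 *
 *	int err = 0;
 *	if (sbappendaddr(&so->so_rcv, (struct sockaddr *)&from, m, ctl, &err)) {
 *		sorwakeup(so);          // record: MT_SONAME -> control -> data
 *	} else {
 *		// no space or no mbufs; sbappendaddr disposed of the chains
 *		// and set err (typically ENOBUFS)
 *	}
 */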
755 soreserve(struct socket
*so
, uint32_t sndcc
, uint32_t rcvcc
)
758 * We do not want to fail the creation of a socket
759 * when kern.ipc.maxsockbuf is less than the
760 * default socket buffer socket size of the protocol
761 * so force the buffer sizes to be at most the
762 * limit enforced by sbreserve()
764 uint64_t maxcc
= (uint64_t)sb_max
* MCLBYTES
/ (MSIZE
+ MCLBYTES
);
766 sndcc
= (uint32_t)maxcc
;
769 rcvcc
= (uint32_t)maxcc
;
771 if (sbreserve(&so
->so_snd
, sndcc
) == 0) {
774 so
->so_snd
.sb_idealsize
= sndcc
;
777 if (sbreserve(&so
->so_rcv
, rcvcc
) == 0) {
780 so
->so_rcv
.sb_idealsize
= rcvcc
;
783 if (so
->so_rcv
.sb_lowat
== 0) {
784 so
->so_rcv
.sb_lowat
= 1;
786 if (so
->so_snd
.sb_lowat
== 0) {
787 so
->so_snd
.sb_lowat
= MCLBYTES
;
789 if (so
->so_snd
.sb_lowat
> so
->so_snd
.sb_hiwat
) {
790 so
->so_snd
.sb_lowat
= so
->so_snd
.sb_hiwat
;
794 so
->so_snd
.sb_flags
&= ~SB_SEL
;
795 selthreadclear(&so
->so_snd
.sb_sel
);
796 sbrelease(&so
->so_snd
);
802 soreserve_preconnect(struct socket
*so
, unsigned int pre_cc
)
804 /* As of now, same bytes for both preconnect read and write */
805 so
->so_snd
.sb_preconn_hiwat
= pre_cc
;
806 so
->so_rcv
.sb_preconn_hiwat
= pre_cc
;
/*
 * Allot mbufs to a sockbuf.
 * Attempt to scale mbmax so that mbcnt doesn't become limiting
 * if buffering efficiency is near the normal case.
 */
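/*
 * Illustrative arithmetic (not part of this file's build): a request of
 * cc bytes sets the byte high-water mark to cc and the mbuf-accounting
 * limit to min(cc * sb_efficiency, sb_max).  For example, with
 * sb_efficiency = 8 and cc = 64 KB, sb_mbmax becomes 512 KB (unless capped
 * by sb_max), leaving room for mbuf overhead on poorly packed chains.
 * The hard ceiling checked first is sb_max * MCLBYTES / (MSIZE + MCLBYTES),
 * i.e. sb_max discounted by per-cluster bookkeeping overhead.
 */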
int
sbreserve(struct sockbuf *sb, u_int32_t cc)
{
	if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES) ||
	    (cc > sb->sb_hiwat && (sb->sb_flags & SB_LIMITED))) {
		return 0;
	}
	sb->sb_hiwat = cc;
	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
	if (sb->sb_lowat > sb->sb_hiwat) {
		sb->sb_lowat = sb->sb_hiwat;
	}
	return 1;
}
/*
 * Free mbufs held by a socket, and reserved mbuf space.
 */
/* WARNING needs to do selthreadclear() before calling this */
void
sbrelease(struct sockbuf *sb)
{
	sbflush(sb);
	sb->sb_hiwat = sb->sb_mbmax = 0;
}
/*
 * Routines to add and remove
 * data from an mbuf queue.
 *
 * The routines sbappend() or sbappendrecord() are normally called to
 * append new mbufs to a socket buffer, after checking that adequate
 * space is available, comparing the function sbspace() with the amount
 * of data to be added.  sbappendrecord() differs from sbappend() in
 * that data supplied is treated as the beginning of a new record.
 * To place a sender's address, optional access rights, and data in a
 * socket receive buffer, sbappendaddr() should be used.  To place
 * access rights and data in a socket receive buffer, sbappendrights()
 * should be used.  In either case, the new data begins a new record.
 * Note that unlike sbappend() and sbappendrecord(), these routines check
 * for the caller that there will be enough space to store the data.
 * Each fails if there is not enough space, or if it cannot find mbufs
 * to store additional information in.
 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer,
 * and then removing the data from the socket buffer with sbdrop()
 * or sbdroprecord() when the data is acknowledged by the peer.
 */
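/*
 * Illustrative sketch (not part of this file's build): a reliable stream
 * protocol keeps unacknowledged data in so_snd and trims it as ACKs arrive.
 * Socket assumed locked; `off', `len' and `acked' are hypothetical values.
 *
 *	sbappendstream(&so->so_snd, m);                 // queue new data
 *	t = m_copym(so->so_snd.sb_mb, off, len, M_DONTWAIT);  // copy for the wire
 *	... transmit t ...
 *	sbdrop(&so->so_snd, acked);                     // free acknowledged bytes
 *	sowwakeup(so);                                  // writers may continue
 */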
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
873 sbappend_common(struct sockbuf
*sb
, struct mbuf
*m
, boolean_t nodrop
)
875 struct socket
*so
= sb
->sb_so
;
877 if (m
== NULL
|| (sb
->sb_flags
& SB_DROP
)) {
878 if (m
!= NULL
&& !nodrop
) {
884 SBLASTRECORDCHK(sb
, "sbappend 1");
886 if (sb
->sb_lastrecord
!= NULL
&& (sb
->sb_mbtail
->m_flags
& M_EOR
)) {
887 return sbappendrecord_common(sb
, m
, nodrop
);
890 if (SOCK_DOM(sb
->sb_so
) == PF_INET
|| SOCK_DOM(sb
->sb_so
) == PF_INET6
) {
891 ASSERT(nodrop
== FALSE
);
892 if (sb
->sb_flags
& SB_RECV
&& !(m
&& m
->m_flags
& M_SKIPCFIL
)) {
893 int error
= sflt_data_in(so
, NULL
, &m
, NULL
, 0);
894 SBLASTRECORDCHK(sb
, "sbappend 2");
898 error
= cfil_sock_data_in(so
, NULL
, m
, NULL
, 0);
900 #endif /* CONTENT_FILTER */
903 if (error
!= EJUSTRETURN
) {
909 m
->m_flags
&= ~M_SKIPCFIL
;
913 /* If this is the first record, it's also the last record */
914 if (sb
->sb_lastrecord
== NULL
) {
915 sb
->sb_lastrecord
= m
;
918 sbcompress(sb
, m
, sb
->sb_mbtail
);
919 SBLASTRECORDCHK(sb
, "sbappend 3");
924 sbappend(struct sockbuf
*sb
, struct mbuf
*m
)
926 return sbappend_common(sb
, m
, FALSE
);
930 sbappend_nodrop(struct sockbuf
*sb
, struct mbuf
*m
)
932 return sbappend_common(sb
, m
, TRUE
);
/*
 * Similar to sbappend, except that this is optimized for stream sockets.
 */
939 sbappendstream(struct sockbuf
*sb
, struct mbuf
*m
)
941 struct socket
*so
= sb
->sb_so
;
943 if (m
== NULL
|| (sb
->sb_flags
& SB_DROP
)) {
950 if (m
->m_nextpkt
!= NULL
|| (sb
->sb_mb
!= sb
->sb_lastrecord
)) {
951 panic("sbappendstream: nexpkt %p || mb %p != lastrecord %p\n",
952 m
->m_nextpkt
, sb
->sb_mb
, sb
->sb_lastrecord
);
956 SBLASTMBUFCHK(sb
, __func__
);
958 if (SOCK_DOM(sb
->sb_so
) == PF_INET
|| SOCK_DOM(sb
->sb_so
) == PF_INET6
) {
959 if (sb
->sb_flags
& SB_RECV
&& !(m
&& m
->m_flags
& M_SKIPCFIL
)) {
960 int error
= sflt_data_in(so
, NULL
, &m
, NULL
, 0);
961 SBLASTRECORDCHK(sb
, "sbappendstream 1");
965 error
= cfil_sock_data_in(so
, NULL
, m
, NULL
, 0);
967 #endif /* CONTENT_FILTER */
970 if (error
!= EJUSTRETURN
) {
976 m
->m_flags
&= ~M_SKIPCFIL
;
980 sbcompress(sb
, m
, sb
->sb_mbtail
);
981 sb
->sb_lastrecord
= sb
->sb_mb
;
982 SBLASTRECORDCHK(sb
, "sbappendstream 2");
988 sbcheck(struct sockbuf
*sb
)
992 u_int32_t len
= 0, mbcnt
= 0;
993 lck_mtx_t
*mutex_held
;
995 if (sb
->sb_so
->so_proto
->pr_getlock
!= NULL
) {
996 mutex_held
= (*sb
->sb_so
->so_proto
->pr_getlock
)(sb
->sb_so
, 0);
998 mutex_held
= sb
->sb_so
->so_proto
->pr_domain
->dom_mtx
;
1001 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
1003 if (sbchecking
== 0) {
1007 for (m
= sb
->sb_mb
; m
; m
= n
) {
1009 for (; m
; m
= m
->m_next
) {
1012 /* XXX pretty sure this is bogus */
1013 if (m
->m_flags
& M_EXT
) {
1014 mbcnt
+= m
->m_ext
.ext_size
;
1018 if (len
!= sb
->sb_cc
|| mbcnt
!= sb
->sb_mbcnt
) {
1019 panic("cc %ld != %ld || mbcnt %ld != %ld\n", len
, sb
->sb_cc
,
1020 mbcnt
, sb
->sb_mbcnt
);
1026 sblastrecordchk(struct sockbuf
*sb
, const char *where
)
1028 struct mbuf
*m
= sb
->sb_mb
;
1030 while (m
&& m
->m_nextpkt
) {
1034 if (m
!= sb
->sb_lastrecord
) {
1035 printf("sblastrecordchk: mb 0x%llx lastrecord 0x%llx "
1037 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_mb
),
1038 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_lastrecord
),
1039 (uint64_t)VM_KERNEL_ADDRPERM(m
));
1040 printf("packet chain:\n");
1041 for (m
= sb
->sb_mb
; m
!= NULL
; m
= m
->m_nextpkt
) {
1042 printf("\t0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(m
));
1044 panic("sblastrecordchk from %s", where
);
1049 sblastmbufchk(struct sockbuf
*sb
, const char *where
)
1051 struct mbuf
*m
= sb
->sb_mb
;
1054 while (m
&& m
->m_nextpkt
) {
1058 while (m
&& m
->m_next
) {
1062 if (m
!= sb
->sb_mbtail
) {
1063 printf("sblastmbufchk: mb 0x%llx mbtail 0x%llx last 0x%llx\n",
1064 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_mb
),
1065 (uint64_t)VM_KERNEL_ADDRPERM(sb
->sb_mbtail
),
1066 (uint64_t)VM_KERNEL_ADDRPERM(m
));
1067 printf("packet tree:\n");
1068 for (m
= sb
->sb_mb
; m
!= NULL
; m
= m
->m_nextpkt
) {
1070 for (n
= m
; n
!= NULL
; n
= n
->m_next
) {
1072 (uint64_t)VM_KERNEL_ADDRPERM(n
));
1076 panic("sblastmbufchk from %s", where
);
/*
 * Similar to sbappend, except the mbuf chain begins a new record.
 */
1084 sbappendrecord_common(struct sockbuf
*sb
, struct mbuf
*m0
, boolean_t nodrop
)
1089 if (m0
== NULL
|| (sb
->sb_flags
& SB_DROP
)) {
1090 if (m0
!= NULL
&& nodrop
== FALSE
) {
1096 for (m
= m0
; m
!= NULL
; m
= m
->m_next
) {
1100 if (space
> sbspace(sb
) && !(sb
->sb_flags
& SB_UNIX
)) {
1101 if (nodrop
== FALSE
) {
1107 if (SOCK_DOM(sb
->sb_so
) == PF_INET
|| SOCK_DOM(sb
->sb_so
) == PF_INET6
) {
1108 ASSERT(nodrop
== FALSE
);
1109 if (sb
->sb_flags
& SB_RECV
&& !(m0
&& m0
->m_flags
& M_SKIPCFIL
)) {
1110 int error
= sflt_data_in(sb
->sb_so
, NULL
, &m0
, NULL
,
1111 sock_data_filt_flag_record
);
1115 error
= cfil_sock_data_in(sb
->sb_so
, NULL
, m0
, NULL
, 0);
1117 #endif /* CONTENT_FILTER */
1120 SBLASTRECORDCHK(sb
, "sbappendrecord 1");
1121 if (error
!= EJUSTRETURN
) {
1127 m0
->m_flags
&= ~M_SKIPCFIL
;
1132 * Note this permits zero length records.
1135 SBLASTRECORDCHK(sb
, "sbappendrecord 2");
1136 if (sb
->sb_lastrecord
!= NULL
) {
1137 sb
->sb_lastrecord
->m_nextpkt
= m0
;
1141 sb
->sb_lastrecord
= m0
;
1146 if (m
&& (m0
->m_flags
& M_EOR
)) {
1147 m0
->m_flags
&= ~M_EOR
;
1148 m
->m_flags
|= M_EOR
;
1150 sbcompress(sb
, m
, m0
);
1151 SBLASTRECORDCHK(sb
, "sbappendrecord 3");
1156 sbappendrecord(struct sockbuf
*sb
, struct mbuf
*m0
)
1158 return sbappendrecord_common(sb
, m0
, FALSE
);
1162 sbappendrecord_nodrop(struct sockbuf
*sb
, struct mbuf
*m0
)
1164 return sbappendrecord_common(sb
, m0
, TRUE
);
/*
 * Concatenate address (optional), control (optional) and data into one
 * single mbuf chain.  If sockbuf *sb is passed in, space check will be
 * performed.
 *
 * Returns:	mbuf chain pointer if succeeded, NULL if failed
 */
1175 sbconcat_mbufs(struct sockbuf
*sb
, struct sockaddr
*asa
, struct mbuf
*m0
, struct mbuf
*control
)
1177 struct mbuf
*m
= NULL
, *n
= NULL
;
1180 if (m0
&& (m0
->m_flags
& M_PKTHDR
) == 0) {
1181 panic("sbconcat_mbufs");
1185 space
+= m0
->m_pkthdr
.len
;
1187 for (n
= control
; n
; n
= n
->m_next
) {
1189 if (n
->m_next
== 0) { /* keep pointer to last control buf */
1195 if (asa
->sa_len
> MLEN
) {
1198 space
+= asa
->sa_len
;
1201 if (sb
!= NULL
&& space
> sbspace(sb
)) {
1206 n
->m_next
= m0
; /* concatenate data to control */
1212 MGET(m
, M_DONTWAIT
, MT_SONAME
);
1215 /* unchain control and data if necessary */
1220 m
->m_len
= asa
->sa_len
;
1221 bcopy((caddr_t
)asa
, mtod(m
, caddr_t
), asa
->sa_len
);
1223 m
->m_next
= control
;
1232 * Queue mbuf chain to the receive queue of a socket.
1233 * Parameter space is the total len of the mbuf chain.
1234 * If passed in, sockbuf space will be checked.
1236 * Returns: 0 Invalid mbuf chain
1240 sbappendchain(struct sockbuf
*sb
, struct mbuf
*m
, int space
)
1242 struct mbuf
*n
, *nlast
;
1248 if (space
!= 0 && space
> sbspace(sb
)) {
1252 for (n
= m
; n
->m_next
!= NULL
; n
= n
->m_next
) {
1258 if (sb
->sb_lastrecord
!= NULL
) {
1259 sb
->sb_lastrecord
->m_nextpkt
= m
;
1263 sb
->sb_lastrecord
= m
;
1264 sb
->sb_mbtail
= nlast
;
1266 SBLASTMBUFCHK(sb
, __func__
);
1267 SBLASTRECORDCHK(sb
, "sbappendadddr 2");
1272 * Returns: 0 Error: No space/out of mbufs/etc.
1275 * Imputed: (*error_out) errno for error
1277 * sflt_data_in:??? [whatever a filter author chooses]
1280 sbappendaddr(struct sockbuf
*sb
, struct sockaddr
*asa
, struct mbuf
*m0
,
1281 struct mbuf
*control
, int *error_out
)
1284 boolean_t sb_unix
= (sb
->sb_flags
& SB_UNIX
);
1285 struct mbuf
*mbuf_chain
= NULL
;
1291 if (m0
&& (m0
->m_flags
& M_PKTHDR
) == 0) {
1292 panic("sbappendaddrorfree");
1295 if (sb
->sb_flags
& SB_DROP
) {
1299 if (control
!= NULL
&& !sb_unix
) {
1302 if (error_out
!= NULL
) {
1303 *error_out
= EINVAL
;
1308 if (SOCK_DOM(sb
->sb_so
) == PF_INET
|| SOCK_DOM(sb
->sb_so
) == PF_INET6
) {
1309 /* Call socket data in filters */
1310 if (sb
->sb_flags
& SB_RECV
&& !(m0
&& m0
->m_flags
& M_SKIPCFIL
)) {
1312 error
= sflt_data_in(sb
->sb_so
, asa
, &m0
, &control
, 0);
1313 SBLASTRECORDCHK(sb
, __func__
);
1317 error
= cfil_sock_data_in(sb
->sb_so
, asa
, m0
, control
,
1320 #endif /* CONTENT_FILTER */
1323 if (error
!= EJUSTRETURN
) {
1327 if (control
!= NULL
&& !sb_unix
) {
1337 m0
->m_flags
&= ~M_SKIPCFIL
;
1341 mbuf_chain
= sbconcat_mbufs(sb
, asa
, m0
, control
);
1342 SBLASTRECORDCHK(sb
, "sbappendadddr 1");
1343 result
= sbappendchain(sb
, mbuf_chain
, 0);
1348 if (control
!= NULL
&& !sb_unix
) {
1352 *error_out
= ENOBUFS
;
1360 is_cmsg_valid(struct mbuf
*control
, struct cmsghdr
*cmsg
)
1366 if (cmsg
->cmsg_len
< sizeof(struct cmsghdr
)) {
1370 if ((uint8_t *)control
->m_data
>= (uint8_t *)cmsg
+ cmsg
->cmsg_len
) {
1374 if ((uint8_t *)control
->m_data
+ control
->m_len
<
1375 (uint8_t *)cmsg
+ cmsg
->cmsg_len
) {
1383 sbappendcontrol_internal(struct sockbuf
*sb
, struct mbuf
*m0
,
1384 struct mbuf
*control
)
1386 struct mbuf
*m
, *mlast
, *n
;
1390 panic("sbappendcontrol");
1393 for (m
= control
;; m
= m
->m_next
) {
1395 if (m
->m_next
== 0) {
1399 n
= m
; /* save pointer to last control buffer */
1400 for (m
= m0
; m
; m
= m
->m_next
) {
1403 if (space
> sbspace(sb
) && !(sb
->sb_flags
& SB_UNIX
)) {
1406 n
->m_next
= m0
; /* concatenate data to control */
1407 SBLASTRECORDCHK(sb
, "sbappendcontrol 1");
1409 for (m
= control
; m
->m_next
!= NULL
; m
= m
->m_next
) {
1415 if (sb
->sb_lastrecord
!= NULL
) {
1416 sb
->sb_lastrecord
->m_nextpkt
= control
;
1418 sb
->sb_mb
= control
;
1420 sb
->sb_lastrecord
= control
;
1421 sb
->sb_mbtail
= mlast
;
1423 SBLASTMBUFCHK(sb
, __func__
);
1424 SBLASTRECORDCHK(sb
, "sbappendcontrol 2");
1429 sbappendcontrol(struct sockbuf
*sb
, struct mbuf
*m0
, struct mbuf
*control
,
1433 boolean_t sb_unix
= (sb
->sb_flags
& SB_UNIX
);
1439 if (sb
->sb_flags
& SB_DROP
) {
1443 if (control
!= NULL
&& !sb_unix
) {
1446 if (error_out
!= NULL
) {
1447 *error_out
= EINVAL
;
1452 if (SOCK_DOM(sb
->sb_so
) == PF_INET
|| SOCK_DOM(sb
->sb_so
) == PF_INET6
) {
1453 if (sb
->sb_flags
& SB_RECV
&& !(m0
&& m0
->m_flags
& M_SKIPCFIL
)) {
1456 error
= sflt_data_in(sb
->sb_so
, NULL
, &m0
, &control
, 0);
1457 SBLASTRECORDCHK(sb
, __func__
);
1461 error
= cfil_sock_data_in(sb
->sb_so
, NULL
, m0
, control
,
1464 #endif /* CONTENT_FILTER */
1467 if (error
!= EJUSTRETURN
) {
1471 if (control
!= NULL
&& !sb_unix
) {
1481 m0
->m_flags
&= ~M_SKIPCFIL
;
1485 result
= sbappendcontrol_internal(sb
, m0
, control
);
1490 if (control
!= NULL
&& !sb_unix
) {
1494 *error_out
= ENOBUFS
;
1502 * TCP streams have Multipath TCP support or are regular TCP sockets.
1505 sbappendstream_rcvdemux(struct socket
*so
, struct mbuf
*m
)
1511 !((so
->so_flags
& SOF_MP_SUBFLOW
) &&
1512 (m
->m_flags
& M_PKTHDR
) &&
1513 (m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP_DFIN
))) {
1519 if (so
->so_flags
& SOF_MP_SUBFLOW
) {
1520 return sbappendmptcpstream_rcv(&so
->so_rcv
, m
);
1524 return sbappendstream(&so
->so_rcv
, m
);
1530 sbappendmptcpstream_rcv(struct sockbuf
*sb
, struct mbuf
*m
)
1532 struct socket
*so
= sb
->sb_so
;
1534 VERIFY(m
== NULL
|| (m
->m_flags
& M_PKTHDR
));
1535 /* SB_NOCOMPRESS must be set prevent loss of M_PKTHDR data */
1536 VERIFY((sb
->sb_flags
& (SB_RECV
| SB_NOCOMPRESS
)) ==
1537 (SB_RECV
| SB_NOCOMPRESS
));
1539 if (m
== NULL
|| m_pktlen(m
) == 0 || (sb
->sb_flags
& SB_DROP
) ||
1540 (so
->so_state
& SS_CANTRCVMORE
)) {
1541 if (m
&& (m
->m_flags
& M_PKTHDR
) &&
1543 (m
->m_pkthdr
.pkt_flags
& PKTF_MPTCP_DFIN
)) {
1544 mptcp_input(tptomptp(sototcpcb(so
))->mpt_mpte
, m
);
1546 } else if (m
!= NULL
) {
1551 /* the socket is not closed, so SOF_MP_SUBFLOW must be set */
1552 VERIFY(so
->so_flags
& SOF_MP_SUBFLOW
);
1554 if (m
->m_nextpkt
!= NULL
|| (sb
->sb_mb
!= sb
->sb_lastrecord
)) {
1555 panic("%s: nexpkt %p || mb %p != lastrecord %p\n", __func__
,
1556 m
->m_nextpkt
, sb
->sb_mb
, sb
->sb_lastrecord
);
1560 SBLASTMBUFCHK(sb
, __func__
);
1562 /* No filter support (SB_RECV) on mptcp subflow sockets */
1564 sbcompress(sb
, m
, sb
->sb_mbtail
);
1565 sb
->sb_lastrecord
= sb
->sb_mb
;
1566 SBLASTRECORDCHK(sb
, __func__
);
/*
 * Compress mbuf chain m into the socket
 * buffer sb following mbuf n.  If n
 * is null, the buffer is presumed empty.
 */
1577 sbcompress(struct sockbuf
*sb
, struct mbuf
*m
, struct mbuf
*n
)
1579 int eor
= 0, compress
= (!(sb
->sb_flags
& SB_NOCOMPRESS
));
1583 /* There is nothing to compress; just update the tail */
1584 for (; n
->m_next
!= NULL
; n
= n
->m_next
) {
1592 eor
|= m
->m_flags
& M_EOR
;
1593 if (compress
&& m
->m_len
== 0 && (eor
== 0 ||
1594 (((o
= m
->m_next
) || (o
= n
)) && o
->m_type
== m
->m_type
))) {
1595 if (sb
->sb_lastrecord
== m
) {
1596 sb
->sb_lastrecord
= m
->m_next
;
1601 if (compress
&& n
!= NULL
&& (n
->m_flags
& M_EOR
) == 0 &&
1605 m
->m_len
<= MCLBYTES
/ 4 && /* XXX: Don't copy too much */
1606 m
->m_len
<= M_TRAILINGSPACE(n
) &&
1607 n
->m_type
== m
->m_type
) {
1608 bcopy(mtod(m
, caddr_t
), mtod(n
, caddr_t
) + n
->m_len
,
1609 (unsigned)m
->m_len
);
1610 n
->m_len
+= m
->m_len
;
1611 sb
->sb_cc
+= m
->m_len
;
1612 if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
&&
1613 m
->m_type
!= MT_OOBDATA
) {
1614 /* XXX: Probably don't need */
1615 sb
->sb_ctl
+= m
->m_len
;
1618 /* update send byte count */
1619 if (sb
->sb_flags
& SB_SNDBYTE_CNT
) {
1620 inp_incr_sndbytes_total(sb
->sb_so
,
1622 inp_incr_sndbytes_unsent(sb
->sb_so
,
1636 m
->m_flags
&= ~M_EOR
;
1644 printf("semi-panic: sbcompress\n");
1648 SBLASTMBUFCHK(sb
, __func__
);
1652 sb_empty_assert(struct sockbuf
*sb
, const char *where
)
1654 if (!(sb
->sb_cc
== 0 && sb
->sb_mb
== NULL
&& sb
->sb_mbcnt
== 0 &&
1655 sb
->sb_mbtail
== NULL
&& sb
->sb_lastrecord
== NULL
)) {
1656 panic("%s: sb %p so %p cc %d mbcnt %d mb %p mbtail %p "
1657 "lastrecord %p\n", where
, sb
, sb
->sb_so
, sb
->sb_cc
,
1658 sb
->sb_mbcnt
, sb
->sb_mb
, sb
->sb_mbtail
,
/*
 * Free all mbufs in a sockbuf.
 * Check that all resources are reclaimed.
 */
1669 sbflush(struct sockbuf
*sb
)
1671 void *lr_saved
= __builtin_return_address(0);
1672 struct socket
*so
= sb
->sb_so
;
1674 /* so_usecount may be 0 if we get here from sofreelastref() */
1676 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
1677 __func__
, sb
, sb
->sb_flags
, lr_saved
);
1679 } else if (so
->so_usecount
< 0) {
1680 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
1681 "lrh= %s\n", __func__
, sb
, sb
->sb_flags
, so
,
1682 so
->so_usecount
, lr_saved
, solockhistory_nr(so
));
1687 * Obtain lock on the socket buffer (SB_LOCK). This is required
1688 * to prevent the socket buffer from being unexpectedly altered
1689 * while it is used by another thread in socket send/receive.
1691 * sblock() must not fail here, hence the assertion.
1693 (void) sblock(sb
, SBL_WAIT
| SBL_NOINTR
| SBL_IGNDEFUNCT
);
1694 VERIFY(sb
->sb_flags
& SB_LOCK
);
1696 while (sb
->sb_mbcnt
> 0) {
1698 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
1699 * we would loop forever. Panic instead.
1701 if (!sb
->sb_cc
&& (sb
->sb_mb
== NULL
|| sb
->sb_mb
->m_len
)) {
1704 sbdrop(sb
, (int)sb
->sb_cc
);
1707 sb_empty_assert(sb
, __func__
);
1708 sbunlock(sb
, TRUE
); /* keep socket locked */
/*
 * Drop data from (the front of) a sockbuf.
 * use m_freem_list to free the mbuf structures
 * under a single lock... this is done by pruning
 * the top of the tree from the body by keeping track
 * of where we get to in the tree and then zeroing the
 * two pertinent pointers m_nextpkt and m_next
 * the socket buffer is then updated to point at the new
 * top of the tree and the pruned area is released via
 * m_freem_list.
 */
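/*
 * Illustrative sketch (not part of this file's build): after a datagram
 * record has been copied out to the user, the receive path discards it as a
 * unit rather than byte-by-byte (socket buffer assumed locked):
 *
 *	sbdroprecord(&so->so_rcv);      // unlink and free the whole record
 *
 * Stream protocols instead call sbdrop(sb, len) with the exact number of
 * bytes consumed or acknowledged.
 */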
1723 sbdrop(struct sockbuf
*sb
, int len
)
1725 struct mbuf
*m
, *free_list
, *ml
;
1726 struct mbuf
*next
, *last
;
1728 next
= (m
= sb
->sb_mb
) ? m
->m_nextpkt
: 0;
1730 if (m
!= NULL
&& len
> 0 && !(sb
->sb_flags
& SB_RECV
) &&
1731 ((sb
->sb_so
->so_flags
& SOF_MP_SUBFLOW
) ||
1732 (SOCK_CHECK_DOM(sb
->sb_so
, PF_MULTIPATH
) &&
1733 SOCK_CHECK_PROTO(sb
->sb_so
, IPPROTO_TCP
))) &&
1734 !(sb
->sb_so
->so_flags1
& SOF1_POST_FALLBACK_SYNC
)) {
1735 mptcp_preproc_sbdrop(sb
->sb_so
, m
, (unsigned int)len
);
1737 if (m
!= NULL
&& len
> 0 && !(sb
->sb_flags
& SB_RECV
) &&
1738 (sb
->sb_so
->so_flags
& SOF_MP_SUBFLOW
) &&
1739 (sb
->sb_so
->so_flags1
& SOF1_POST_FALLBACK_SYNC
)) {
1740 mptcp_fallback_sbdrop(sb
->sb_so
, m
, len
);
1743 KERNEL_DEBUG((DBG_FNC_SBDROP
| DBG_FUNC_START
), sb
, len
, 0, 0, 0);
1745 free_list
= last
= m
;
1746 ml
= (struct mbuf
*)0;
1752 * temporarily replacing this panic with printf
1753 * because it occurs occasionally when closing
1754 * a socket when there is no harm in ignoring
1755 * it. This problem will be investigated
1758 /* panic("sbdrop"); */
1759 printf("sbdrop - count not zero\n");
1762 * zero the counts. if we have no mbufs,
1763 * we have no data (PR-2986815)
1770 next
= m
->m_nextpkt
;
1773 if (m
->m_len
> len
) {
1777 /* update the send byte count */
1778 if (sb
->sb_flags
& SB_SNDBYTE_CNT
) {
1779 inp_decr_sndbytes_total(sb
->sb_so
, len
);
1781 if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
&&
1782 m
->m_type
!= MT_OOBDATA
) {
1793 while (m
&& m
->m_len
== 0) {
1800 ml
->m_next
= (struct mbuf
*)0;
1801 last
->m_nextpkt
= (struct mbuf
*)0;
1802 m_freem_list(free_list
);
1806 m
->m_nextpkt
= next
;
1812 * First part is an inline SB_EMPTY_FIXUP(). Second part
1813 * makes sure sb_lastrecord is up-to-date if we dropped
1814 * part of the last record.
1818 sb
->sb_mbtail
= NULL
;
1819 sb
->sb_lastrecord
= NULL
;
1820 } else if (m
->m_nextpkt
== NULL
) {
1821 sb
->sb_lastrecord
= m
;
1825 cfil_sock_buf_update(sb
);
1826 #endif /* CONTENT_FILTER */
1828 KERNEL_DEBUG((DBG_FNC_SBDROP
| DBG_FUNC_END
), sb
, 0, 0, 0, 0);
/*
 * Drop a record off the front of a sockbuf
 * and move the next record to the front.
 */
1836 sbdroprecord(struct sockbuf
*sb
)
1838 struct mbuf
*m
, *mn
;
1842 sb
->sb_mb
= m
->m_nextpkt
;
/*
 * Create a "control" mbuf containing the specified data
 * with the specified type for presentation on a socket buffer.
 */
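/*
 * Illustrative sketch (not part of this file's build): a protocol delivering
 * ancillary data builds the control record before appending it to the
 * receive buffer.  Here `ttl', `from' and `data' stand in for the protocol's
 * own values (socket buffer assumed locked):
 *
 *	u_char ttl = ip->ip_ttl;
 *	struct mbuf *ctl = sbcreatecontrol((caddr_t)&ttl, sizeof(ttl),
 *	    IP_RECVTTL, IPPROTO_IP);
 *	if (ctl != NULL)
 *		sbappendaddr(&so->so_rcv, from, data, ctl, NULL);
 */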
struct mbuf *
sbcreatecontrol(caddr_t p, int size, int type, int level)
{
	struct cmsghdr *cp;
	struct mbuf *m;

	if (CMSG_SPACE((u_int)size) > MLEN) {
		return (struct mbuf *)NULL;
	}
	if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) {
		return (struct mbuf *)NULL;
	}
	cp = mtod(m, struct cmsghdr *);
	VERIFY(IS_P2ALIGNED(cp, sizeof(u_int32_t)));
	/* XXX check size? */
	(void) memcpy(CMSG_DATA(cp), p, size);
	m->m_len = (int32_t)CMSG_SPACE(size);
	cp->cmsg_len = CMSG_LEN(size);
	cp->cmsg_level = level;
	cp->cmsg_type = type;
	return m;
}
1880 sbcreatecontrol_mbuf(caddr_t p
, int size
, int type
, int level
, struct mbuf
**mp
)
1886 *mp
= sbcreatecontrol(p
, size
, type
, level
);
1890 if (CMSG_SPACE((u_int
)size
) + (*mp
)->m_len
> MLEN
) {
1891 mp
= &(*mp
)->m_next
;
1892 *mp
= sbcreatecontrol(p
, size
, type
, level
);
1898 cp
= (struct cmsghdr
*)(void *)(mtod(m
, char *) + m
->m_len
);
1899 /* CMSG_SPACE ensures 32-bit alignment */
1900 VERIFY(IS_P2ALIGNED(cp
, sizeof(u_int32_t
)));
1901 m
->m_len
+= (int32_t)CMSG_SPACE(size
);
1903 /* XXX check size? */
1904 (void) memcpy(CMSG_DATA(cp
), p
, size
);
1905 cp
->cmsg_len
= CMSG_LEN(size
);
1906 cp
->cmsg_level
= level
;
1907 cp
->cmsg_type
= type
;
/*
 * Some routines that return EOPNOTSUPP for entry points that are not
 * supported by a protocol.  Fill in as needed.
 */
1918 pru_abort_notsupp(struct socket
*so
)
1925 pru_accept_notsupp(struct socket
*so
, struct sockaddr
**nam
)
1927 #pragma unused(so, nam)
1932 pru_attach_notsupp(struct socket
*so
, int proto
, struct proc
*p
)
1934 #pragma unused(so, proto, p)
1939 pru_bind_notsupp(struct socket
*so
, struct sockaddr
*nam
, struct proc
*p
)
1941 #pragma unused(so, nam, p)
1946 pru_connect_notsupp(struct socket
*so
, struct sockaddr
*nam
, struct proc
*p
)
1948 #pragma unused(so, nam, p)
1953 pru_connect2_notsupp(struct socket
*so1
, struct socket
*so2
)
1955 #pragma unused(so1, so2)
1960 pru_connectx_notsupp(struct socket
*so
, struct sockaddr
*src
,
1961 struct sockaddr
*dst
, struct proc
*p
, uint32_t ifscope
,
1962 sae_associd_t aid
, sae_connid_t
*pcid
, uint32_t flags
, void *arg
,
1963 uint32_t arglen
, struct uio
*uio
, user_ssize_t
*bytes_written
)
1965 #pragma unused(so, src, dst, p, ifscope, aid, pcid, flags, arg, arglen, uio, bytes_written)
1970 pru_control_notsupp(struct socket
*so
, u_long cmd
, caddr_t data
,
1971 struct ifnet
*ifp
, struct proc
*p
)
1973 #pragma unused(so, cmd, data, ifp, p)
1978 pru_detach_notsupp(struct socket
*so
)
1985 pru_disconnect_notsupp(struct socket
*so
)
1992 pru_disconnectx_notsupp(struct socket
*so
, sae_associd_t aid
, sae_connid_t cid
)
1994 #pragma unused(so, aid, cid)
1999 pru_listen_notsupp(struct socket
*so
, struct proc
*p
)
2001 #pragma unused(so, p)
2006 pru_peeraddr_notsupp(struct socket
*so
, struct sockaddr
**nam
)
2008 #pragma unused(so, nam)
2013 pru_rcvd_notsupp(struct socket
*so
, int flags
)
2015 #pragma unused(so, flags)
2020 pru_rcvoob_notsupp(struct socket
*so
, struct mbuf
*m
, int flags
)
2022 #pragma unused(so, m, flags)
2027 pru_send_notsupp(struct socket
*so
, int flags
, struct mbuf
*m
,
2028 struct sockaddr
*addr
, struct mbuf
*control
, struct proc
*p
)
2030 #pragma unused(so, flags, m, addr, control, p)
2035 pru_send_list_notsupp(struct socket
*so
, int flags
, struct mbuf
*m
,
2036 struct sockaddr
*addr
, struct mbuf
*control
, struct proc
*p
)
2038 #pragma unused(so, flags, m, addr, control, p)
/*
 * This isn't really a ``null'' operation, but it's the default one
 * and doesn't do anything destructive.
 */
2047 pru_sense_null(struct socket
*so
, void *ub
, int isstat64
)
2049 if (isstat64
!= 0) {
2050 struct stat64
*sb64
;
2052 sb64
= (struct stat64
*)ub
;
2053 sb64
->st_blksize
= so
->so_snd
.sb_hiwat
;
2057 sb
= (struct stat
*)ub
;
2058 sb
->st_blksize
= so
->so_snd
.sb_hiwat
;
2066 pru_sosend_notsupp(struct socket
*so
, struct sockaddr
*addr
, struct uio
*uio
,
2067 struct mbuf
*top
, struct mbuf
*control
, int flags
)
2069 #pragma unused(so, addr, uio, top, control, flags)
2074 pru_sosend_list_notsupp(struct socket
*so
, struct uio
**uio
,
2075 u_int uiocnt
, int flags
)
2077 #pragma unused(so, uio, uiocnt, flags)
2082 pru_soreceive_notsupp(struct socket
*so
, struct sockaddr
**paddr
,
2083 struct uio
*uio
, struct mbuf
**mp0
, struct mbuf
**controlp
, int *flagsp
)
2085 #pragma unused(so, paddr, uio, mp0, controlp, flagsp)
2090 pru_soreceive_list_notsupp(struct socket
*so
,
2091 struct recv_msg_elem
*recv_msg_array
, u_int uiocnt
, int *flagsp
)
2093 #pragma unused(so, recv_msg_array, uiocnt, flagsp)
2098 pru_shutdown_notsupp(struct socket
*so
)
2105 pru_sockaddr_notsupp(struct socket
*so
, struct sockaddr
**nam
)
2107 #pragma unused(so, nam)
2112 pru_sopoll_notsupp(struct socket
*so
, int events
, kauth_cred_t cred
, void *wql
)
2114 #pragma unused(so, events, cred, wql)
2119 pru_socheckopt_null(struct socket
*so
, struct sockopt
*sopt
)
2121 #pragma unused(so, sopt)
2123 * Allow all options for set/get by default.
2129 pru_preconnect_null(struct socket
*so
)
2136 pru_sanitize(struct pr_usrreqs
*pru
)
2138 #define DEFAULT(foo, bar) if ((foo) == NULL) (foo) = (bar)
2139 DEFAULT(pru
->pru_abort
, pru_abort_notsupp
);
2140 DEFAULT(pru
->pru_accept
, pru_accept_notsupp
);
2141 DEFAULT(pru
->pru_attach
, pru_attach_notsupp
);
2142 DEFAULT(pru
->pru_bind
, pru_bind_notsupp
);
2143 DEFAULT(pru
->pru_connect
, pru_connect_notsupp
);
2144 DEFAULT(pru
->pru_connect2
, pru_connect2_notsupp
);
2145 DEFAULT(pru
->pru_connectx
, pru_connectx_notsupp
);
2146 DEFAULT(pru
->pru_control
, pru_control_notsupp
);
2147 DEFAULT(pru
->pru_detach
, pru_detach_notsupp
);
2148 DEFAULT(pru
->pru_disconnect
, pru_disconnect_notsupp
);
2149 DEFAULT(pru
->pru_disconnectx
, pru_disconnectx_notsupp
);
2150 DEFAULT(pru
->pru_listen
, pru_listen_notsupp
);
2151 DEFAULT(pru
->pru_peeraddr
, pru_peeraddr_notsupp
);
2152 DEFAULT(pru
->pru_rcvd
, pru_rcvd_notsupp
);
2153 DEFAULT(pru
->pru_rcvoob
, pru_rcvoob_notsupp
);
2154 DEFAULT(pru
->pru_send
, pru_send_notsupp
);
2155 DEFAULT(pru
->pru_send_list
, pru_send_list_notsupp
);
2156 DEFAULT(pru
->pru_sense
, pru_sense_null
);
2157 DEFAULT(pru
->pru_shutdown
, pru_shutdown_notsupp
);
2158 DEFAULT(pru
->pru_sockaddr
, pru_sockaddr_notsupp
);
2159 DEFAULT(pru
->pru_sopoll
, pru_sopoll_notsupp
);
2160 DEFAULT(pru
->pru_soreceive
, pru_soreceive_notsupp
);
2161 DEFAULT(pru
->pru_soreceive_list
, pru_soreceive_list_notsupp
);
2162 DEFAULT(pru
->pru_sosend
, pru_sosend_notsupp
);
2163 DEFAULT(pru
->pru_sosend_list
, pru_sosend_list_notsupp
);
2164 DEFAULT(pru
->pru_socheckopt
, pru_socheckopt_null
);
2165 DEFAULT(pru
->pru_preconnect
, pru_preconnect_null
);
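/*
 * Illustrative sketch (not part of this file's build): a protocol fills in
 * only the entry points it implements and lets pru_sanitize() plug the rest
 * with the *_notsupp/_null defaults above.  The names below are hypothetical.
 *
 *	static struct pr_usrreqs myproto_usrreqs = {
 *		.pru_attach    = myproto_attach,
 *		.pru_detach    = myproto_detach,
 *		.pru_send      = myproto_send,
 *		.pru_soreceive = soreceive,
 *	};
 *	...
 *	pru_sanitize(&myproto_usrreqs);   // done once at protocol init
 */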
/*
 * The following are macros on BSD and functions on Darwin
 */

/*
 * Do we need to notify the other side when I/O is possible?
 */
int
sb_notify(struct sockbuf *sb)
{
	return sb->sb_waiters > 0 ||
	       (sb->sb_flags & (SB_SEL | SB_ASYNC | SB_UPCALL | SB_KNOTE));
}
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematical if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax).  Should detect
 * overflow and return 0.
 */
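/*
 * Illustrative arithmetic (not part of this file's build): the usable space
 * is the smaller of the byte headroom and the mbuf-accounting headroom,
 *
 *	space = min(sb_hiwat - sb_cc, sb_mbmax - sb_mbcnt)
 *
 * e.g. sb_hiwat of 64 KB with 48 KB queued gives 16 KB of byte headroom, but
 * if sb_mbmax - sb_mbcnt is only 8 KB the caller may append at most 8 KB.
 * A preconnect high-water mark and data parked in content filters further
 * reduce the result, and it is clamped to be non-negative.
 */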
2191 sbspace(struct sockbuf
*sb
)
2194 int space
= imin((int)(sb
->sb_hiwat
- sb
->sb_cc
),
2195 (int)(sb
->sb_mbmax
- sb
->sb_mbcnt
));
2197 if (sb
->sb_preconn_hiwat
!= 0) {
2198 space
= imin((int)(sb
->sb_preconn_hiwat
- sb
->sb_cc
), space
);
2205 /* Compensate for data being processed by content filters */
2207 pending
= cfil_sock_data_space(sb
);
2208 #endif /* CONTENT_FILTER */
2209 if (pending
> space
) {
2218 /* do we have to send all at once on a socket? */
2220 sosendallatonce(struct socket
*so
)
2222 return so
->so_proto
->pr_flags
& PR_ATOMIC
;
2225 /* can we read something from so? */
2227 soreadable(struct socket
*so
)
2229 return so
->so_rcv
.sb_cc
>= so
->so_rcv
.sb_lowat
||
2230 ((so
->so_state
& SS_CANTRCVMORE
)
2232 && cfil_sock_data_pending(&so
->so_rcv
) == 0
2233 #endif /* CONTENT_FILTER */
2235 so
->so_comp
.tqh_first
|| so
->so_error
;
2238 /* can we write something to so? */
2241 sowriteable(struct socket
*so
)
2243 if ((so
->so_state
& SS_CANTSENDMORE
) ||
2247 if (so_wait_for_if_feedback(so
) || !socanwrite(so
)) {
2250 if (so
->so_flags1
& SOF1_PRECONNECT_DATA
) {
2254 if (sbspace(&(so
)->so_snd
) >= (so
)->so_snd
.sb_lowat
) {
2255 if (so
->so_flags
& SOF_NOTSENT_LOWAT
) {
2256 if ((SOCK_DOM(so
) == PF_INET6
||
2257 SOCK_DOM(so
) == PF_INET
) &&
2258 so
->so_type
== SOCK_STREAM
) {
2259 return tcp_notsent_lowat_check(so
);
2262 else if ((SOCK_DOM(so
) == PF_MULTIPATH
) &&
2263 (SOCK_PROTO(so
) == IPPROTO_TCP
)) {
2264 return mptcp_notsent_lowat_check(so
);
2277 /* adjust counters in sb reflecting allocation of m */
2280 sballoc(struct sockbuf
*sb
, struct mbuf
*m
)
2283 sb
->sb_cc
+= m
->m_len
;
2284 if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
&&
2285 m
->m_type
!= MT_OOBDATA
) {
2286 sb
->sb_ctl
+= m
->m_len
;
2288 sb
->sb_mbcnt
+= MSIZE
;
2290 if (m
->m_flags
& M_EXT
) {
2291 sb
->sb_mbcnt
+= m
->m_ext
.ext_size
;
2292 cnt
+= (m
->m_ext
.ext_size
>> MSIZESHIFT
);
2294 OSAddAtomic(cnt
, &total_sbmb_cnt
);
2295 VERIFY(total_sbmb_cnt
> 0);
2296 if (total_sbmb_cnt
> total_sbmb_cnt_peak
) {
2297 total_sbmb_cnt_peak
= total_sbmb_cnt
;
2301 * If data is being added to the send socket buffer,
2302 * update the send byte count
2304 if (sb
->sb_flags
& SB_SNDBYTE_CNT
) {
2305 inp_incr_sndbytes_total(sb
->sb_so
, m
->m_len
);
2306 inp_incr_sndbytes_unsent(sb
->sb_so
, m
->m_len
);
2310 /* adjust counters in sb reflecting freeing of m */
2312 sbfree(struct sockbuf
*sb
, struct mbuf
*m
)
2316 sb
->sb_cc
-= m
->m_len
;
2317 if (m
->m_type
!= MT_DATA
&& m
->m_type
!= MT_HEADER
&&
2318 m
->m_type
!= MT_OOBDATA
) {
2319 sb
->sb_ctl
-= m
->m_len
;
2321 sb
->sb_mbcnt
-= MSIZE
;
2322 if (m
->m_flags
& M_EXT
) {
2323 sb
->sb_mbcnt
-= m
->m_ext
.ext_size
;
2324 cnt
-= (m
->m_ext
.ext_size
>> MSIZESHIFT
);
2326 OSAddAtomic(cnt
, &total_sbmb_cnt
);
2327 VERIFY(total_sbmb_cnt
>= 0);
2328 if (total_sbmb_cnt
< total_sbmb_cnt_floor
) {
2329 total_sbmb_cnt_floor
= total_sbmb_cnt
;
2333 * If data is being removed from the send socket buffer,
2334 * update the send byte count
2336 if (sb
->sb_flags
& SB_SNDBYTE_CNT
) {
2337 inp_decr_sndbytes_total(sb
->sb_so
, m
->m_len
);
/*
 * Set lock on sockbuf sb; sleep if lock is already held.
 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
 * Returns error without lock if sleep is interrupted.
 */
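/*
 * Illustrative sketch (not part of this file's build): send/receive paths
 * hold the sockbuf lock across the whole operation so the buffer cannot
 * change underneath them (socket lock assumed held on entry):
 *
 *	int error = sblock(&so->so_snd, SBL_WAIT);
 *	if (error != 0)
 *		return error;               // interrupted or defunct
 *	... append data, possibly sleeping in sbwait() ...
 *	sbunlock(&so->so_snd, FALSE);       // FALSE: also drop the socket lock
 */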
2347 sblock(struct sockbuf
*sb
, uint32_t flags
)
2349 boolean_t nointr
= ((sb
->sb_flags
& SB_NOINTR
) || (flags
& SBL_NOINTR
));
2350 void *lr_saved
= __builtin_return_address(0);
2351 struct socket
*so
= sb
->sb_so
;
2354 thread_t tp
= current_thread();
2356 VERIFY((flags
& SBL_VALID
) == flags
);
2358 /* so_usecount may be 0 if we get here from sofreelastref() */
2360 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
2361 __func__
, sb
, sb
->sb_flags
, lr_saved
);
2363 } else if (so
->so_usecount
< 0) {
2364 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
2365 "lrh= %s\n", __func__
, sb
, sb
->sb_flags
, so
,
2366 so
->so_usecount
, lr_saved
, solockhistory_nr(so
));
2371 * The content filter thread must hold the sockbuf lock
2373 if ((so
->so_flags
& SOF_CONTENT_FILTER
) && sb
->sb_cfil_thread
== tp
) {
2375 * Don't panic if we are defunct because SB_LOCK has
2376 * been cleared by sodefunct()
2378 if (!(so
->so_flags
& SOF_DEFUNCT
) && !(sb
->sb_flags
& SB_LOCK
)) {
2379 panic("%s: SB_LOCK not held for %p\n",
2383 /* Keep the sockbuf locked */
2387 if ((sb
->sb_flags
& SB_LOCK
) && !(flags
& SBL_WAIT
)) {
2391 * We may get here from sorflush(), in which case "sb" may not
2392 * point to the real socket buffer. Use the actual socket buffer
2393 * address from the socket instead.
2395 wchan
= (sb
->sb_flags
& SB_RECV
) ?
2396 &so
->so_rcv
.sb_flags
: &so
->so_snd
.sb_flags
;
2399 * A content filter thread has exclusive access to the sockbuf
2400 * until it clears the
2402 while ((sb
->sb_flags
& SB_LOCK
) ||
2403 ((so
->so_flags
& SOF_CONTENT_FILTER
) &&
2404 sb
->sb_cfil_thread
!= NULL
)) {
2405 lck_mtx_t
*mutex_held
;
2408 * XXX: This code should be moved up above outside of this loop;
2409 * however, we may get here as part of sofreelastref(), and
2410 * at that time pr_getlock() may no longer be able to return
2411 * us the lock. This will be fixed in future.
2413 if (so
->so_proto
->pr_getlock
!= NULL
) {
2414 mutex_held
= (*so
->so_proto
->pr_getlock
)(so
, PR_F_WILLUNLOCK
);
2416 mutex_held
= so
->so_proto
->pr_domain
->dom_mtx
;
2419 LCK_MTX_ASSERT(mutex_held
, LCK_MTX_ASSERT_OWNED
);
2422 VERIFY(sb
->sb_wantlock
!= 0);
2424 error
= msleep(wchan
, mutex_held
,
2425 nointr
? PSOCK
: PSOCK
| PCATCH
,
2426 nointr
? "sb_lock_nointr" : "sb_lock", NULL
);
2428 VERIFY(sb
->sb_wantlock
!= 0);
2431 if (error
== 0 && (so
->so_flags
& SOF_DEFUNCT
) &&
2432 !(flags
& SBL_IGNDEFUNCT
)) {
2434 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
2435 "(%d)\n", __func__
, proc_selfpid(),
2436 proc_best_name(current_proc()),
2437 (uint64_t)VM_KERNEL_ADDRPERM(so
),
2438 SOCK_DOM(so
), SOCK_TYPE(so
), error
);
2445 sb
->sb_flags
|= SB_LOCK
;
/*
 * Release lock on sockbuf sb
 */
void
sbunlock(struct sockbuf *sb, boolean_t keeplocked)
{
	void *lr_saved = __builtin_return_address(0);
	struct socket *so = sb->sb_so;
	thread_t tp = current_thread();

	/* so_usecount may be 0 if we get here from sofreelastref() */
	if (so == NULL) {
		panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
		    __func__, sb, sb->sb_flags, lr_saved);
	} else if (so->so_usecount < 0) {
		panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
		    "lrh= %s\n", __func__, sb, sb->sb_flags, so,
		    so->so_usecount, lr_saved, solockhistory_nr(so));
	}

	/*
	 * The content filter thread must hold the sockbuf lock
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) && sb->sb_cfil_thread == tp) {
		/*
		 * Don't panic if we are defunct because SB_LOCK has
		 * been cleared by sodefunct()
		 */
		if (!(so->so_flags & SOF_DEFUNCT) &&
		    !(sb->sb_flags & SB_LOCK) &&
		    !(so->so_state & SS_DEFUNCT) &&
		    !(so->so_flags1 & SOF1_DEFUNCTINPROG)) {
			panic("%s: SB_LOCK not held for %p\n",
			    __func__, sb);
		}
		/* Keep the sockbuf locked and proceed */
	} else {
		VERIFY((sb->sb_flags & SB_LOCK) ||
		    (so->so_state & SS_DEFUNCT) ||
		    (so->so_flags1 & SOF1_DEFUNCTINPROG));

		sb->sb_flags &= ~SB_LOCK;

		if (sb->sb_wantlock > 0) {
			/*
			 * We may get here from sorflush(), in which case "sb"
			 * may not point to the real socket buffer.  Use the
			 * actual socket buffer address from the socket instead.
			 */
			wakeup((sb->sb_flags & SB_RECV) ? &so->so_rcv.sb_flags :
			    &so->so_snd.sb_flags);
		}
	}

	if (!keeplocked) {      /* unlock on exit */
		if (so->so_flags & SOF_MP_SUBFLOW || SOCK_DOM(so) == PF_MULTIPATH) {
			(*so->so_proto->pr_unlock)(so, 1, lr_saved);
		} else {
			lck_mtx_t *mutex_held;

			if (so->so_proto->pr_getlock != NULL) {
				mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
			} else {
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			}

			LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

			VERIFY(so->so_usecount > 0);
			so->so_usecount--;
			so->unlock_lr[so->next_unlock_lr] = lr_saved;
			so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
			lck_mtx_unlock(mutex_held);
		}
	}
}
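/*
 * Illustrative sketch (not part of the original file): a typical caller
 * takes the socket lock first, serializes against other sockbuf users with
 * sblock(), and releases with sbunlock() passing TRUE so the socket lock
 * itself stays held.  The helper name and the use of sbappendstream() here
 * are assumptions for illustration only.
 */
#if 0
static int
example_append_locked(struct socket *so, struct mbuf *m)
{
	int error;

	socket_lock(so, 1);                     /* take the socket lock */
	error = sblock(&so->so_snd, SBL_WAIT);  /* may sleep; can be interrupted */
	if (error == 0) {
		sbappendstream(&so->so_snd, m); /* append while holding SB_LOCK */
		sbunlock(&so->so_snd, TRUE);    /* drop SB_LOCK, keep socket lock */
	}
	socket_unlock(so, 1);
	return error;
}
#endif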
void
sorwakeup(struct socket *so)
{
	if (sb_notify(&so->so_rcv)) {
		sowakeup(so, &so->so_rcv, NULL);
	}
}

void
sowwakeup(struct socket *so)
{
	if (sb_notify(&so->so_snd)) {
		sowakeup(so, &so->so_snd, NULL);
	}
}
void
soevent(struct socket *so, long hint)
{
	if (so->so_flags & SOF_KNOTE) {
		KNOTE(&so->so_klist, hint);
	}

	soevupcall(so, hint);

	/*
	 * Don't post an event if this is a subflow socket or
	 * the app has opted out of using the cellular interface
	 */
	if ((hint & SO_FILT_HINT_IFDENIED) &&
	    !(so->so_flags & SOF_MP_SUBFLOW) &&
	    !(so->so_restrictions & SO_RESTRICT_DENY_CELLULAR) &&
	    !(so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE) &&
	    !(so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
		soevent_ifdenied(so);
	}
}
void
soevupcall(struct socket *so, long hint)
{
	if (so->so_event != NULL) {
		caddr_t so_eventarg = so->so_eventarg;

		hint &= so->so_eventmask;
		if (hint != 0) {
			so->so_event(so, so_eventarg, hint);
		}
	}
}
static void
soevent_ifdenied(struct socket *so)
{
	struct kev_netpolicy_ifdenied ev_ifdenied;

	bzero(&ev_ifdenied, sizeof(ev_ifdenied));
	/*
	 * The event consumer is interested in the effective {upid,pid,uuid}
	 * info, which can be different from that of the process that most
	 * recently performed a system call on the socket, i.e. when the
	 * socket is delegated.
	 */
	if (so->so_flags & SOF_DELEGATED) {
		ev_ifdenied.ev_data.eupid = so->e_upid;
		ev_ifdenied.ev_data.epid = so->e_pid;
		uuid_copy(ev_ifdenied.ev_data.euuid, so->e_uuid);
	} else {
		ev_ifdenied.ev_data.eupid = so->last_upid;
		ev_ifdenied.ev_data.epid = so->last_pid;
		uuid_copy(ev_ifdenied.ev_data.euuid, so->last_uuid);
	}

	if (++so->so_ifdenied_notifies > 1) {
		/*
		 * Allow for at most one kernel event to be generated per
		 * socket; so_ifdenied_notifies is reset upon changes in
		 * the UUID policy.  See comments in inp_update_policy.
		 */
		if (net_io_policy_log) {
			uuid_string_t buf;

			uuid_unparse(ev_ifdenied.ev_data.euuid, buf);
			log(LOG_DEBUG, "%s[%d]: so 0x%llx [%d,%d] epid %llu "
			    "euuid %s%s has %d redundant events suppressed\n",
			    __func__, so->last_pid,
			    (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
			    SOCK_TYPE(so), ev_ifdenied.ev_data.epid, buf,
			    ((so->so_flags & SOF_DELEGATED) ?
			    " [delegated]" : ""), so->so_ifdenied_notifies);
		}
	} else {
		if (net_io_policy_log) {
			uuid_string_t buf;

			uuid_unparse(ev_ifdenied.ev_data.euuid, buf);
			log(LOG_DEBUG, "%s[%d]: so 0x%llx [%d,%d] epid %llu "
			    "euuid %s%s event posted\n", __func__,
			    so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so),
			    ev_ifdenied.ev_data.epid, buf,
			    ((so->so_flags & SOF_DELEGATED) ?
			    " [delegated]" : ""));
		}
		netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
		    sizeof(ev_ifdenied));
	}
}
/*
 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
 */
struct sockaddr *
dup_sockaddr(struct sockaddr *sa, int canwait)
{
	struct sockaddr *sa2;

	MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
	    canwait ? M_WAITOK : M_NOWAIT);
	if (sa2) {
		bcopy(sa, sa2, sa->sa_len);
	}
	return sa2;
}
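/*
 * Illustrative sketch (assumption, not in the original file): a caller that
 * duplicates a peer address and later releases it would pair dup_sockaddr()
 * with FREE(..., M_SONAME), e.g.
 *
 *	struct sockaddr *copy = dup_sockaddr(sa, 1);	// canwait: may sleep
 *	if (copy != NULL) {
 *		// ... use copy ...
 *		FREE(copy, M_SONAME);
 *	}
 */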
/*
 * Create an external-format (``xsocket'') structure using the information
 * in the kernel-format socket structure pointed to by so.  This is done
 * to reduce the spew of irrelevant information over this interface,
 * to isolate user code from changes in the kernel structure, and
 * potentially to provide information-hiding if we decide that
 * some of this information should be hidden from users.
 */
void
sotoxsocket(struct socket *so, struct xsocket *xso)
{
	xso->xso_len = sizeof(*xso);
	xso->xso_so = (_XSOCKET_PTR(struct socket *))VM_KERNEL_ADDRPERM(so);
	xso->so_type = so->so_type;
	xso->so_options = (short)(so->so_options & 0xffff);
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = (_XSOCKET_PTR(caddr_t))VM_KERNEL_ADDRPERM(so->so_pcb);
	if (so->so_proto) {
		xso->xso_protocol = SOCK_PROTO(so);
		xso->xso_family = SOCK_DOM(so);
	} else {
		xso->xso_protocol = xso->xso_family = 0;
	}
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = kauth_cred_getuid(so->so_cred);
}
#if XNU_TARGET_OS_OSX

void
sotoxsocket64(struct socket *so, struct xsocket64 *xso)
{
	xso->xso_len = sizeof(*xso);
	xso->xso_so = (u_int64_t)VM_KERNEL_ADDRPERM(so);
	xso->so_type = so->so_type;
	xso->so_options = (short)(so->so_options & 0xffff);
	xso->so_linger = so->so_linger;
	xso->so_state = so->so_state;
	xso->so_pcb = (u_int64_t)VM_KERNEL_ADDRPERM(so->so_pcb);
	if (so->so_proto) {
		xso->xso_protocol = SOCK_PROTO(so);
		xso->xso_family = SOCK_DOM(so);
	} else {
		xso->xso_protocol = xso->xso_family = 0;
	}
	xso->so_qlen = so->so_qlen;
	xso->so_incqlen = so->so_incqlen;
	xso->so_qlimit = so->so_qlimit;
	xso->so_timeo = so->so_timeo;
	xso->so_error = so->so_error;
	xso->so_pgid = so->so_pgid;
	xso->so_oobmark = so->so_oobmark;
	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
	xso->so_uid = kauth_cred_getuid(so->so_cred);
}

#endif /* XNU_TARGET_OS_OSX */
/*
 * This does the same for sockbufs.  Note that the xsockbuf structure,
 * since it is always embedded in a socket, does not include a self
 * pointer nor a length.  We make this entry point public in case
 * some other mechanism needs it.
 */
void
sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
{
	xsb->sb_cc = sb->sb_cc;
	xsb->sb_hiwat = sb->sb_hiwat;
	xsb->sb_mbcnt = sb->sb_mbcnt;
	xsb->sb_mbmax = sb->sb_mbmax;
	xsb->sb_lowat = sb->sb_lowat;
	xsb->sb_flags = (short)sb->sb_flags;
	xsb->sb_timeo = (short)
	    ((sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick);
	if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0) {
		xsb->sb_timeo = 1;
	}
}
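/*
 * Worked example of the conversion above (illustrative; the concrete values
 * are assumptions): with hz = 100 and tick = 10000 microseconds, a sb_timeo
 * of 0.25 s (tv_sec = 0, tv_usec = 250000) becomes 250000 / 10000 = 25
 * clock ticks, and any nonzero timeout that would otherwise round down to 0
 * is clamped to 1 tick.
 */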
/*
 * Based on the policy set by an all-knowing decision maker, throttle sockets
 * that have been marked as belonging to a "background" process.
 */
int
soisthrottled(struct socket *so)
{
	return so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND;
}

int
soisprivilegedtraffic(struct socket *so)
{
	return (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS) ? 1 : 0;
}

int
soissrcbackground(struct socket *so)
{
	return (so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND) ||
	       IS_SO_TC_BACKGROUND(so->so_traffic_class);
}

int
soissrcrealtime(struct socket *so)
{
	return so->so_traffic_class >= SO_TC_AV &&
	       so->so_traffic_class <= SO_TC_VO;
}

int
soissrcbesteffort(struct socket *so)
{
	return so->so_traffic_class == SO_TC_BE ||
	       so->so_traffic_class == SO_TC_RD ||
	       so->so_traffic_class == SO_TC_OAM;
}

void
soclearfastopen(struct socket *so)
{
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
	}

	if (so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
		so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
	}
}

void
sonullevent(struct socket *so, void *arg, long hint)
{
#pragma unused(so, arg, hint)
}
/*
 * Here is the definition of some of the basic objects in the kern.ipc
 * branch of the MIB.
 */
SYSCTL_NODE(_kern, KERN_IPC, ipc,
    CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, "IPC");

/* Check that the maximum socket buffer size is within a range */
static int
sysctl_sb_max SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	u_int32_t new_value;
	int changed = 0;
	int error = sysctl_io_number(req, sb_max, sizeof(u_int32_t),
	    &new_value, &changed);
	if (!error && changed) {
		if (new_value > LOW_SB_MAX && new_value <= high_sb_max) {
			sb_max = new_value;
		} else {
			error = ERANGE;
		}
	}
	return error;
}

SYSCTL_PROC(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &sb_max, 0, &sysctl_sb_max, "IU", "Maximum socket buffer size");
SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sb_efficiency, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters,
    CTLFLAG_RD | CTLFLAG_LOCKED, &nmbclusters, 0, "");

SYSCTL_INT(_kern_ipc, OID_AUTO, njcl,
    CTLFLAG_RD | CTLFLAG_LOCKED, &njcl, 0, "");

SYSCTL_INT(_kern_ipc, OID_AUTO, njclbytes,
    CTLFLAG_RD | CTLFLAG_LOCKED, &njclbytes, 0, "");

SYSCTL_INT(_kern_ipc, KIPC_SOQLIMITCOMPAT, soqlimitcompat,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soqlimitcompat, 1,
    "Enable socket queue limit compatibility");

/*
 * Hack alert -- rdar://33572856
 * A loopback test we cannot change was failing because it sets
 * SO_SENDTIMEO to 5 seconds and that's also the value
 * of the minimum persist timer. Because of the persist timer,
 * the connection was not idle for 5 seconds and SO_SNDTIMEO
 * was not triggering at 5 seconds, causing the test failure.
 * As a workaround we check the sysctl soqlencomp that the test is
 * already setting in order to disable auto tuning of the receive buffer.
 */
extern u_int32_t tcp_do_autorcvbuf;

static int
sysctl_soqlencomp SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	u_int32_t new_value;
	int changed = 0;
	int error = sysctl_io_number(req, soqlencomp, sizeof(u_int32_t),
	    &new_value, &changed);
	if (!error && changed) {
		soqlencomp = new_value;
		if (new_value != 0) {
			tcp_do_autorcvbuf = 0;
			tcptv_persmin_val = 6 * TCP_RETRANSHZ;
		}
	}
	return error;
}
SYSCTL_PROC(_kern_ipc, OID_AUTO, soqlencomp,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &soqlencomp, 0, &sysctl_soqlencomp, "IU", "");

SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt, CTLFLAG_RD | CTLFLAG_LOCKED,
    &total_sbmb_cnt, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_peak, CTLFLAG_RD | CTLFLAG_LOCKED,
    &total_sbmb_cnt_peak, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_floor, CTLFLAG_RD | CTLFLAG_LOCKED,
    &total_sbmb_cnt_floor, 0, "");
SYSCTL_QUAD(_kern_ipc, OID_AUTO, sbmb_limreached, CTLFLAG_RD | CTLFLAG_LOCKED,
    &sbmb_limreached, "");


SYSCTL_NODE(_kern_ipc, OID_AUTO, io_policy, CTLFLAG_RW, 0, "network IO policy");

SYSCTL_INT(_kern_ipc_io_policy, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED,
    &net_io_policy_log, 0, "");

#if CONFIG_PROC_UUID_POLICY
SYSCTL_INT(_kern_ipc_io_policy, OID_AUTO, uuid, CTLFLAG_RW | CTLFLAG_LOCKED,
    &net_io_policy_uuid, 0, "");
#endif /* CONFIG_PROC_UUID_POLICY */