/*
 * Copyright (c) 1998-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *      The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)uipc_socket.c       8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* MAC */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))

#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t so_cache_hw;           /* High water mark for socache */
static u_int32_t so_cache_timeouts;     /* number of timeouts */
static u_int32_t so_cache_max_freed;    /* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;

static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);

#include <machine/limits.h>

static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
    .f_isfd = 1,
    .f_attach = filt_sorattach,
    .f_detach = filt_sordetach,
    .f_event = filt_soread,
    .f_touch = filt_sortouch,
    .f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
    .f_isfd = 1,
    .f_attach = filt_sowattach,
    .f_detach = filt_sowdetach,
    .f_event = filt_sowrite,
    .f_touch = filt_sowtouch,
    .f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
    .f_isfd = 1,
    .f_attach = filt_sockattach,
    .f_detach = filt_sockdetach,
    .f_event = filt_sockev,
    .f_touch = filt_socktouch,
    .f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
    .f_isfd = 1,
    .f_attach = filt_sorattach,
    .f_detach = filt_sordetach,
    .f_event = filt_soread,
    .f_touch = filt_sortouch,
    .f_process = filt_sorprocess,
};

SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

ZONE_DECLARE(socket_zone, "socket", sizeof(struct socket), ZC_ZFREE_CLEARMEM);
so_gen_t so_gencnt;     /* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above. Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable. Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets
 * with clusters larger than 2 KB might lead to system panics or data
 * corruption. When set to 0, the system will respect SOF1_IF_2KCL,
 * which is set on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in a header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, zalloc_flags_t);
static void cached_sock_free(struct socket *);

/*
 * Maximum number of extended background idle sockets per process;
 * set to zero to disable further setting of the option.
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);


/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");

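/*
 * One-time initialization of the socket layer: validates layout
 * invariants, creates the cached-socket zone (socket plus saved inpcb
 * and tcpcb in one block), sets the extended background idle defaults,
 * and initializes the PCB modules and traffic-class support.
 */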
void
socketinit(void)
{
    _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
    VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
    _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
    _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
    _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
    _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
    _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
    _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
    _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

    if (socketinit_done) {
        printf("socketinit: already called...\n");
        return;
    }
    socketinit_done = 1;

    PE_parse_boot_argn("socket_debug", &socket_debug,
        sizeof(socket_debug));

    STAILQ_INIT(&so_cache_head);

    so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
        + get_inpcb_str_size() + 4 + get_tcp_str_size());

    so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
        ZC_ZFREE_CLEARMEM | ZC_NOENCRYPT);

    bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
    soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
    soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
    soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

    in_pcbinit();
    socket_tclass_init();
#if MULTIPATH
    mp_pcbinit();
#endif /* MULTIPATH */
}

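/*
 * Allocate a socket for the PF_INET/SOCK_STREAM fast path: reuse an
 * entry from the socket cache when one is available, otherwise carve
 * a fresh block out of so_cache_zone and lay out the saved inpcb and
 * tcpcb storage behind the socket, aligned on longword boundaries.
 */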
static void
cached_sock_alloc(struct socket **so, zalloc_flags_t how)
{
    caddr_t temp;
    uintptr_t offset;

    lck_mtx_lock(&so_cache_mtx);

    if (!STAILQ_EMPTY(&so_cache_head)) {
        VERIFY(cached_sock_count > 0);

        *so = STAILQ_FIRST(&so_cache_head);
        STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
        STAILQ_NEXT((*so), so_cache_ent) = NULL;

        cached_sock_count--;
        lck_mtx_unlock(&so_cache_mtx);

        temp = (*so)->so_saved_pcb;
        bzero((caddr_t)*so, sizeof(struct socket));

        (*so)->so_saved_pcb = temp;
    } else {
        lck_mtx_unlock(&so_cache_mtx);

        *so = zalloc_flags(so_cache_zone, how | Z_ZERO);

        /*
         * Define offsets for extra structures into our
         * single block of memory. Align extra structures
         * on longword boundaries.
         */

        offset = (uintptr_t)*so;
        offset += sizeof(struct socket);

        offset = ALIGN(offset);

        (*so)->so_saved_pcb = (caddr_t)offset;
        offset += get_inpcb_str_size();

        offset = ALIGN(offset);

        ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
            (caddr_t)offset;
    }

    OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}

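/*
 * Return a cached socket to the free list, or hand it back to the
 * zone when the cache is already at max_cached_sock_count. Entries
 * left on the list are timestamped so that so_cache_timer() can
 * reap the ones that sit unused past SO_CACHE_TIME_LIMIT.
 */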
static void
cached_sock_free(struct socket *so)
{
    lck_mtx_lock(&so_cache_mtx);

    so_cache_time = net_uptime();
    if (++cached_sock_count > max_cached_sock_count) {
        --cached_sock_count;
        lck_mtx_unlock(&so_cache_mtx);
        zfree(so_cache_zone, so);
    } else {
        if (so_cache_hw < cached_sock_count) {
            so_cache_hw = cached_sock_count;
        }

        STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

        so->cache_timestamp = so_cache_time;
        lck_mtx_unlock(&so_cache_mtx);
    }
}

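/*
 * Record the most recent process to use this socket (pid, unique pid,
 * executable UUID) so activity can be attributed to it later. Sockets
 * created with sock_socket keep last_pid/last_upid at zero and are
 * deliberately left untouched here.
 */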
void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
    if (so->last_pid != 0) {
        /*
         * last_pid and last_upid should remain zero for sockets
         * created using sock_socket; the check above achieves that.
         */
        if (self == PROC_NULL) {
            self = current_proc();
        }

        if (so->last_upid != proc_uniqueid(self) ||
            so->last_pid != proc_pid(self)) {
            so->last_upid = proc_uniqueid(self);
            so->last_pid = proc_pid(self);
            proc_getexecutableuuid(self, so->last_uuid,
                sizeof(so->last_uuid));
            if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
                (*so->so_proto->pr_update_last_owner)(so, self, NULL);
            }
        }
        proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
    }
}

void
so_update_policy(struct socket *so)
{
    if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
        (void) inp_update_policy(sotoinpcb(so));
    }
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
    if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
        inp_update_necp_policy(sotoinpcb(so), override_local_addr,
            override_remote_addr, 0);
    }
}
#endif /* NECP */

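/*
 * Periodic reaper for the socket cache: frees entries that have been
 * cached longer than SO_CACHE_TIME_LIMIT, at most
 * SO_CACHE_MAX_FREE_BATCH per invocation. Returns TRUE when entries
 * remain so the caller can reschedule the timer.
 */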
boolean_t
so_cache_timer(void)
{
    struct socket *p;
    int n_freed = 0;
    boolean_t rc = FALSE;

    lck_mtx_lock(&so_cache_mtx);
    so_cache_timeouts++;
    so_cache_time = net_uptime();

    while (!STAILQ_EMPTY(&so_cache_head)) {
        VERIFY(cached_sock_count > 0);
        p = STAILQ_FIRST(&so_cache_head);
        if ((so_cache_time - p->cache_timestamp) <
            SO_CACHE_TIME_LIMIT) {
            break;
        }

        STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
        --cached_sock_count;

        zfree(so_cache_zone, p);

        if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
            so_cache_max_freed++;
            break;
        }
    }

    /* Schedule again if there is more to clean up */
    if (!STAILQ_EMPTY(&so_cache_head)) {
        rc = TRUE;
    }

    lck_mtx_unlock(&so_cache_mtx);
    return rc;
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
    zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
    struct socket *so;

    if ((dom == PF_INET) && (type == SOCK_STREAM)) {
        cached_sock_alloc(&so, how);
    } else {
        so = zalloc_flags(socket_zone, how | Z_ZERO);
    }
    if (so != NULL) {
        so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

        /*
         * Increment the socket allocation statistics
         */
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
    }

    return so;
}

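/*
 * Common guts of socket creation: look up the protocol switch entry,
 * allocate and initialize the socket, record the creating (and any
 * delegated) process, attach the protocol and socket filters, and
 * apply the default traffic-class and defunct policies.
 */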
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
    struct protosw *prp;
    struct socket *so;
    int error = 0;
#if defined(XNU_TARGET_OS_OSX)
    pid_t rpid = -1;
#endif

#if TCPDEBUG
    extern int tcpconsdebug;
#endif

    VERIFY(aso != NULL);
    *aso = NULL;

    if (proto != 0) {
        prp = pffindproto(dom, proto, type);
    } else {
        prp = pffindtype(dom, type);
    }

    if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
        if (pffinddomain(dom) == NULL) {
            return EAFNOSUPPORT;
        }
        if (proto != 0) {
            if (pffindprotonotype(dom, proto) != NULL) {
                return EPROTOTYPE;
            }
        }
        return EPROTONOSUPPORT;
    }
    if (prp->pr_type != type) {
        return EPROTOTYPE;
    }
    so = soalloc(1, dom, type);
    if (so == NULL) {
        return ENOBUFS;
    }

    switch (dom) {
    case PF_LOCAL:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
        break;
    case PF_INET:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
        if (type == SOCK_STREAM) {
            INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
        } else {
            INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
        }
        break;
    case PF_ROUTE:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
        break;
    case PF_NDRV:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
        break;
    case PF_KEY:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
        break;
    case PF_INET6:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
        if (type == SOCK_STREAM) {
            INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
        } else {
            INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
        }
        break;
    case PF_SYSTEM:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
        break;
    case PF_MULTIPATH:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
        break;
    default:
        INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
        break;
    }

    if (flags & SOCF_MPTCP) {
        so->so_state |= SS_NBIO;
    }

    TAILQ_INIT(&so->so_incomp);
    TAILQ_INIT(&so->so_comp);
    so->so_type = type;
    so->last_upid = proc_uniqueid(p);
    so->last_pid = proc_pid(p);
    proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
    proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

    if (ep != PROC_NULL && ep != p) {
        so->e_upid = proc_uniqueid(ep);
        so->e_pid = proc_pid(ep);
        proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
        so->so_flags |= SOF_DELEGATED;
#if defined(XNU_TARGET_OS_OSX)
        if (ep->p_responsible_pid != so->e_pid) {
            rpid = ep->p_responsible_pid;
        }
#endif
    }

#if defined(XNU_TARGET_OS_OSX)
    if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
        rpid = p->p_responsible_pid;
    }

    so->so_rpid = -1;
    uuid_clear(so->so_ruuid);
    if (rpid >= 0) {
        proc_t rp = proc_find(rpid);
        if (rp != PROC_NULL) {
            proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
            so->so_rpid = rpid;
            proc_rele(rp);
        }
    }
#endif

    so->so_cred = kauth_cred_proc_ref(p);
    if (!suser(kauth_cred_get(), NULL)) {
        so->so_state |= SS_PRIV;
    }

    so->so_proto = prp;
    so->so_rcv.sb_flags |= SB_RECV;
    so->so_rcv.sb_so = so->so_snd.sb_so = so;
    so->next_lock_lr = 0;
    so->next_unlock_lr = 0;

    /*
     * Attachment will create the per pcb lock if necessary and
     * increase refcount for creation; make sure it's done before
     * the socket is inserted in lists.
     */
    so->so_usecount++;

    error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
    if (error != 0) {
        /*
         * Warning:
         * If so_pcb is not zero, the socket will be leaked,
         * so the protocol attachment handler must be coded carefully.
         */
        so->so_state |= SS_NOFDREF;
        VERIFY(so->so_usecount > 0);
        so->so_usecount--;
        sofreelastref(so, 1);   /* will deallocate the socket */
        return error;
    }

    /*
     * Note: needs so_pcb to be set after pru_attach
     */
    if (prp->pr_update_last_owner != NULL) {
        (*prp->pr_update_last_owner)(so, p, ep);
    }

    atomic_add_32(&prp->pr_domain->dom_refs, 1);

    /* Attach socket filters for this protocol */
    sflt_initsock(so);
#if TCPDEBUG
    if (tcpconsdebug == 2) {
        so->so_options |= SO_DEBUG;
    }
#endif
    so_set_default_traffic_class(so);

    /*
     * If this thread or task is marked to create backgrounded sockets,
     * mark the socket as background.
     */
    if (!(flags & SOCF_MPTCP) &&
        proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
        socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
        so->so_background_thread = current_thread();
    }

    switch (dom) {
    /*
     * Don't mark Unix domain or system
     * eligible for defunct by default.
     */
    case PF_LOCAL:
    case PF_SYSTEM:
        so->so_flags |= SOF_NODEFUNCT;
        break;
    default:
        break;
    }

    /*
     * Entitlements can't be checked at socket creation time except if the
     * application requested a feature guarded by a privilege (c.f., socket
     * delegation).
     * The priv(9) and the Sandboxing APIs are designed with the idea that
     * a privilege check should only be triggered by a userland request.
     * A privilege check at socket creation time is time consuming and
     * could trigger many authorisation error messages from the security
     * APIs.
     */

    *aso = so;

    return 0;
}

/*
 * Returns:     0                       Success
 *              EAFNOSUPPORT
 *              EPROTOTYPE
 *              EPROTONOSUPPORT
 *              ENOBUFS
 *      <pru_attach>:ENOBUFS[AF_UNIX]
 *      <pru_attach>:ENOBUFS[TCP]
 *      <pru_attach>:ENOMEM[TCP]
 *      <pru_attach>:???                [other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
    return socreate_internal(dom, aso, type, proto, current_proc(), 0,
               PROC_NULL);
}

int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
    int error = 0;
    struct proc *ep = PROC_NULL;

    if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
        error = ESRCH;
        goto done;
    }

    error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

    /*
     * It might not be wise to hold the proc reference when calling
     * socreate_internal since it calls soalloc with M_WAITOK.
     */
done:
    if (ep != PROC_NULL) {
        proc_rele(ep);
    }

    return error;
}

/*
 * Returns:     0                       Success
 *      <pru_bind>:EINVAL               Invalid argument [COMMON_START]
 *      <pru_bind>:EAFNOSUPPORT         Address family not supported
 *      <pru_bind>:EADDRNOTAVAIL        Address not available.
 *      <pru_bind>:EINVAL               Invalid argument
 *      <pru_bind>:EAFNOSUPPORT         Address family not supported [notdef]
 *      <pru_bind>:EACCES               Permission denied
 *      <pru_bind>:EADDRINUSE           Address in use
 *      <pru_bind>:EAGAIN               Resource unavailable, try again
 *      <pru_bind>:EPERM                Operation not permitted
 *      <pru_bind>:???
 *      <sf_bind>:???
 *
 * Notes:       It's not possible to fully enumerate the return codes above,
 *              since socket filter authors and protocol family authors may
 *              not choose to limit their error returns to those listed, even
 *              though this may result in some software operating incorrectly.
 *
 *              The error codes which are enumerated above are those known to
 *              be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
    struct proc *p = current_proc();
    int error = 0;

    if (dolock) {
        socket_lock(so, 1);
    }

    so_update_last_owner_locked(so, p);
    so_update_policy(so);

#if NECP
    so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

    /*
     * If this is a bind request on a socket that has been marked
     * as inactive, reject it now before we go any further.
     */
    if (so->so_flags & SOF_DEFUNCT) {
        error = EINVAL;
        SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
            __func__, proc_pid(p), proc_best_name(p),
            (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
            SOCK_DOM(so), SOCK_TYPE(so), error);
        goto out;
    }

    /* Socket filter */
    error = sflt_bind(so, nam);

    if (error == 0) {
        error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
    }
out:
    if (dolock) {
        socket_unlock(so, 1);
    }

    if (error == EJUSTRETURN) {
        error = 0;
    }

    return error;
}

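/*
 * Final teardown of a socket structure: drops the credential
 * reference, detaches socket filters and content filters, and
 * returns the memory either to the socket cache or to the zone
 * it was allocated from.
 */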
void
sodealloc(struct socket *so)
{
    kauth_cred_unref(&so->so_cred);

    /* Remove any filters */
    sflt_termsock(so);

#if CONTENT_FILTER
    cfil_sock_detach(so);
#endif /* CONTENT_FILTER */

    so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

    if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
        cached_sock_free(so);
    } else {
        zfree(socket_zone, so);
    }
}

/*
 * Returns:     0                       Success
 *              EINVAL
 *              EOPNOTSUPP
 *      <pru_listen>:EINVAL[AF_UNIX]
 *      <pru_listen>:EINVAL[TCP]
 *      <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
 *      <pru_listen>:EINVAL[TCP]        Invalid argument
 *      <pru_listen>:EAFNOSUPPORT[TCP]  Address family not supported [notdef]
 *      <pru_listen>:EACCES[TCP]        Permission denied
 *      <pru_listen>:EADDRINUSE[TCP]    Address in use
 *      <pru_listen>:EAGAIN[TCP]        Resource unavailable, try again
 *      <pru_listen>:EPERM[TCP]         Operation not permitted
 *      <sf_listen>:???
 *
 * Notes:       Other <pru_listen> returns depend on the protocol family; all
 *              <sf_listen> returns depend on what the filter author causes
 *              their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
    struct proc *p = current_proc();
    int error = 0;

    socket_lock(so, 1);

    so_update_last_owner_locked(so, p);
    so_update_policy(so);

#if NECP
    so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

    if (so->so_proto == NULL) {
        error = EINVAL;
        goto out;
    }
    if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
        error = EOPNOTSUPP;
        goto out;
    }

    /*
     * If the listen request is made on a socket that is not fully
     * disconnected, or on a socket that has been marked as inactive,
     * reject the request now.
     */
    if ((so->so_state &
        (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
        (so->so_flags & SOF_DEFUNCT)) {
        error = EINVAL;
        if (so->so_flags & SOF_DEFUNCT) {
            SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
                "(%d)\n", __func__, proc_pid(p),
                proc_best_name(p),
                (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
                SOCK_DOM(so), SOCK_TYPE(so), error);
        }
        goto out;
    }

    if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
        error = EPERM;
        goto out;
    }

    error = sflt_listen(so);
    if (error == 0) {
        error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
    }

    if (error) {
        if (error == EJUSTRETURN) {
            error = 0;
        }
        goto out;
    }

    if (TAILQ_EMPTY(&so->so_comp)) {
        so->so_options |= SO_ACCEPTCONN;
    }
    /*
     * POSIX: The implementation may have an upper limit on the length of
     * the listen queue-either global or per accepting socket. If backlog
     * exceeds this limit, the length of the listen queue is set to the
     * limit.
     *
     * If listen() is called with a backlog argument value that is less
     * than 0, the function behaves as if it had been called with a backlog
     * argument value of 0.
     *
     * A backlog argument of 0 may allow the socket to accept connections,
     * in which case the length of the listen queue may be set to an
     * implementation-defined minimum value.
     */
    if (backlog <= 0 || backlog > somaxconn) {
        backlog = somaxconn;
    }

    so->so_qlimit = backlog;
out:
    socket_unlock(so, 1);
    return error;
}

/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets. The lock ordering is to
 * acquire the client socket first, then the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_incqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight, provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
    lck_mtx_t *mutex_held;

    if (head->so_proto->pr_getlock == NULL) {
        return;
    }
    mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
    LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

    if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
        head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
        return;
    }
    if (so != NULL) {
        socket_unlock(so, 0);
    }
    while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
        so_accept_list_waits += 1;
        msleep((caddr_t)&head->so_incomp, mutex_held,
            PSOCK | PCATCH, __func__, NULL);
    }
    head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
    if (so != NULL) {
        socket_unlock(head, 0);
        socket_lock(so, 0);
        socket_lock(head, 0);
    }
}

void
so_release_accept_list(struct socket *head)
{
    if (head->so_proto->pr_getlock != NULL) {
        lck_mtx_t *mutex_held;

        mutex_held = (*head->so_proto->pr_getlock)(head, 0);
        LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

        head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
        wakeup((caddr_t)&head->so_incomp);
    }
}

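/*
 * Drop the last reference on a socket. Unlinks the socket from its
 * listener's incomplete queue if it is still there; a socket still on
 * the completed queue must not be decommissioned, so it is left for
 * accept(2). Flushes both socket buffers and, when `dealloc' is set,
 * frees the socket itself.
 */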
void
sofreelastref(struct socket *so, int dealloc)
{
    struct socket *head = so->so_head;

    /* Assume socket is locked */

    if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
        selthreadclear(&so->so_snd.sb_sel);
        selthreadclear(&so->so_rcv.sb_sel);
        so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
        so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
        so->so_event = sonullevent;
        return;
    }
    if (head != NULL) {
        /*
         * Need to lock the listener when the protocol has
         * per socket locks
         */
        if (head->so_proto->pr_getlock != NULL) {
            socket_lock(head, 1);
            so_acquire_accept_list(head, so);
        }
        if (so->so_state & SS_INCOMP) {
            so->so_state &= ~SS_INCOMP;
            TAILQ_REMOVE(&head->so_incomp, so, so_list);
            head->so_incqlen--;
            head->so_qlen--;
            so->so_head = NULL;

            if (head->so_proto->pr_getlock != NULL) {
                so_release_accept_list(head);
                socket_unlock(head, 1);
            }
        } else if (so->so_state & SS_COMP) {
            if (head->so_proto->pr_getlock != NULL) {
                so_release_accept_list(head);
                socket_unlock(head, 1);
            }
            /*
             * We must not decommission a socket that's
             * on the accept(2) queue. If we do, then
             * accept(2) may hang after select(2) indicated
             * that the listening socket was ready.
             */
            selthreadclear(&so->so_snd.sb_sel);
            selthreadclear(&so->so_rcv.sb_sel);
            so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
            so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
            so->so_event = sonullevent;
            return;
        } else {
            if (head->so_proto->pr_getlock != NULL) {
                so_release_accept_list(head);
                socket_unlock(head, 1);
            }
            printf("sofree: not queued\n");
        }
    }
    sowflush(so);
    sorflush(so);

#if FLOW_DIVERT
    if (so->so_flags & SOF_FLOW_DIVERT) {
        flow_divert_detach(so);
    }
#endif /* FLOW_DIVERT */

    /* 3932268: disable upcall */
    so->so_rcv.sb_flags &= ~SB_UPCALL;
    so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
    so->so_event = sonullevent;

    if (dealloc) {
        sodealloc(so);
    }
}

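/*
 * For protocols that set SOF_UPCALLCLOSEWAIT, block until any
 * outstanding socket upcall has drained so the upcall cannot run
 * against a socket that is being closed.
 */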
void
soclose_wait_locked(struct socket *so)
{
    lck_mtx_t *mutex_held;

    if (so->so_proto->pr_getlock != NULL) {
        mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
    } else {
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    }
    LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

    /*
     * Double check here and return if there's no outstanding upcall;
     * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
     */
    if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
        return;
    }
    so->so_rcv.sb_flags &= ~SB_UPCALL;
    so->so_snd.sb_flags &= ~SB_UPCALL;
    so->so_flags |= SOF_CLOSEWAIT;

    (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
        "soclose_wait_locked", NULL);
    LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
    so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
    int error = 0;
    struct timespec ts;

    if (so->so_usecount == 0) {
        panic("soclose: so=%p refcount=0\n", so);
        /* NOTREACHED */
    }

    sflt_notify(so, sock_evt_closing, NULL);

    if (so->so_upcallusecount) {
        soclose_wait_locked(so);
    }

#if CONTENT_FILTER
    /*
     * We have to wait until the content filters are done
     */
    if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
        cfil_sock_close_wait(so);
        cfil_sock_is_closed(so);
        cfil_sock_detach(so);
    }
#endif /* CONTENT_FILTER */

    if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
        soresume(current_proc(), so, 1);
        so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
    }

    if ((so->so_options & SO_ACCEPTCONN)) {
        struct socket *sp, *sonext;
        int persocklock = 0;
        int incomp_overflow_only;

        /*
         * We do not want new connections to be added
         * to the connection queues.
         */
        so->so_options &= ~SO_ACCEPTCONN;

        /*
         * We can drop the lock on the listener once
         * we've acquired the incoming list
         */
        if (so->so_proto->pr_getlock != NULL) {
            persocklock = 1;
            so_acquire_accept_list(so, NULL);
            socket_unlock(so, 0);
        }
again:
        incomp_overflow_only = 1;

        TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
            /*
             * Radar 5350314
             * Skip sockets thrown away by tcp_dropdropablreq();
             * they will get cleaned up by garbage collection.
             * Otherwise, remove the incomp socket from the queue
             * and let soabort trigger the appropriate cleanup.
             */
            if (sp->so_flags & SOF_OVERFLOW) {
                continue;
            }

            if (persocklock != 0) {
                socket_lock(sp, 1);
            }

            /*
             * Radar 27945981
             * The extra reference for the list ensures the
             * validity of the socket pointer when we perform
             * the unlock of the head above.
             */
            if (sp->so_state & SS_INCOMP) {
                sp->so_state &= ~SS_INCOMP;
                sp->so_head = NULL;
                TAILQ_REMOVE(&so->so_incomp, sp, so_list);
                so->so_incqlen--;
                so->so_qlen--;

                (void) soabort(sp);
            } else {
                panic("%s sp %p in so_incomp but !SS_INCOMP",
                    __func__, sp);
            }

            if (persocklock != 0) {
                socket_unlock(sp, 1);
            }
        }

        TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
            /* Dequeue from so_comp since sofree() won't do it */
            if (persocklock != 0) {
                socket_lock(sp, 1);
            }

            if (sp->so_state & SS_COMP) {
                sp->so_state &= ~SS_COMP;
                sp->so_head = NULL;
                TAILQ_REMOVE(&so->so_comp, sp, so_list);
                so->so_qlen--;

                (void) soabort(sp);
            } else {
                panic("%s sp %p in so_comp but !SS_COMP",
                    __func__, sp);
            }

            if (persocklock) {
                socket_unlock(sp, 1);
            }
        }

        if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG | DEVELOPMENT)
            panic("%s head %p so_incomp not empty\n", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

            goto again;
        }

        if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG | DEVELOPMENT)
            panic("%s head %p so_comp not empty\n", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

            goto again;
        }

        if (persocklock) {
            socket_lock(so, 0);
            so_release_accept_list(so);
        }
    }
    if (so->so_pcb == NULL) {
        /* 3915887: mark the socket as ready for dealloc */
        so->so_flags |= SOF_PCBCLEARING;
        goto discard;
    }
    if (so->so_state & SS_ISCONNECTED) {
        if ((so->so_state & SS_ISDISCONNECTING) == 0) {
            error = sodisconnectlocked(so);
            if (error) {
                goto drop;
            }
        }
        if (so->so_options & SO_LINGER) {
            lck_mtx_t *mutex_held;

            if ((so->so_state & SS_ISDISCONNECTING) &&
                (so->so_state & SS_NBIO)) {
                goto drop;
            }
            if (so->so_proto->pr_getlock != NULL) {
                mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
            } else {
                mutex_held = so->so_proto->pr_domain->dom_mtx;
            }
            while (so->so_state & SS_ISCONNECTED) {
                ts.tv_sec = (so->so_linger / 100);
                ts.tv_nsec = (so->so_linger % 100) *
                    NSEC_PER_USEC * 1000 * 10;
                error = msleep((caddr_t)&so->so_timeo,
                    mutex_held, PSOCK | PCATCH, "soclose", &ts);
                if (error) {
                    /*
                     * It's OK when the timer fires;
                     * don't report an error.
                     */
                    if (error == EWOULDBLOCK) {
                        error = 0;
                    }
                    break;
                }
            }
        }
    }
drop:
    if (so->so_usecount == 0) {
        panic("soclose: usecount is zero so=%p\n", so);
        /* NOTREACHED */
    }
    if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
        int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
        if (error == 0) {
            error = error2;
        }
    }
    if (so->so_usecount <= 0) {
        panic("soclose: usecount is zero so=%p\n", so);
        /* NOTREACHED */
    }
discard:
    if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
        (so->so_state & SS_NOFDREF)) {
        panic("soclose: NOFDREF");
        /* NOTREACHED */
    }
    so->so_state |= SS_NOFDREF;

    if ((so->so_flags & SOF_KNOTE) != 0) {
        KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
    }

    atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);

    VERIFY(so->so_usecount > 0);
    so->so_usecount--;
    sofree(so);
    return error;
}

int
soclose(struct socket *so)
{
    int error = 0;
    socket_lock(so, 1);

    if (so->so_retaincnt == 0) {
        error = soclose_locked(so);
    } else {
        /*
         * If the FD is going away, but the socket is
         * retained in the kernel, remove its reference.
         */
        so->so_usecount--;
        if (so->so_usecount < 2) {
            panic("soclose: retaincnt non null and so=%p "
                "usecount=%d\n", so, so->so_usecount);
        }
    }
    socket_unlock(so, 1);
    return error;
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
    int error;

#ifdef MORE_LOCKING_DEBUG
    lck_mtx_t *mutex_held;

    if (so->so_proto->pr_getlock != NULL) {
        mutex_held = (*so->so_proto->pr_getlock)(so, 0);
    } else {
        mutex_held = so->so_proto->pr_domain->dom_mtx;
    }
    LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

    if ((so->so_flags & SOF_ABORTED) == 0) {
        so->so_flags |= SOF_ABORTED;
        error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
        if (error) {
            sofree(so);
            return error;
        }
    }
    return 0;
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
    int error;

    if (dolock) {
        socket_lock(so, 1);
    }

    so_update_last_owner_locked(so, PROC_NULL);
    so_update_policy(so);
#if NECP
    so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

    if ((so->so_state & SS_NOFDREF) == 0) {
        panic("soaccept: !NOFDREF");
    }
    so->so_state &= ~SS_NOFDREF;
    error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

    if (dolock) {
        socket_unlock(so, 1);
    }
    return error;
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
    return soacceptlock(so, nam, 1);
}

A
1557
1558int
d190cdc3 1559soacceptfilter(struct socket *so, struct socket *head)
2d21ac55
A
1560{
1561 struct sockaddr *local = NULL, *remote = NULL;
6d2010ae 1562 int error = 0;
2d21ac55
A
1563
1564 /*
39236c6e
A
1565 * Hold the lock even if this socket has not been made visible
1566 * to the filter(s). For sockets with global locks, this protects
1567 * against the head or peer going away
2d21ac55 1568 */
b0d623f7
A
1569 socket_lock(so, 1);
1570 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1571 sogetaddr_locked(so, &local, 0) != 0) {
d190cdc3 1572 so->so_state &= ~SS_NOFDREF;
b0d623f7 1573 socket_unlock(so, 1);
2d21ac55
A
1574 soclose(so);
1575 /* Out of resources; try it again next time */
1576 error = ECONNABORTED;
1577 goto done;
1578 }
1579
6d2010ae 1580 error = sflt_accept(head, so, local, remote);
2d21ac55
A
1581
1582 /*
1583 * If we get EJUSTRETURN from one of the filters, mark this socket
1584 * as inactive and return it anyway. This newly accepted socket
1585 * will be disconnected later before we hand it off to the caller.
1586 */
1587 if (error == EJUSTRETURN) {
1588 error = 0;
6d2010ae
A
1589 (void) sosetdefunct(current_proc(), so,
1590 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
2d21ac55
A
1591 }
1592
1593 if (error != 0) {
1594 /*
1595 * This may seem like a duplication to the above error
1596 * handling part when we return ECONNABORTED, except
1597 * the following is done while holding the lock since
1598 * the socket has been exposed to the filter(s) earlier.
1599 */
5ba3f43e 1600 so->so_state &= ~SS_NOFDREF;
1601 socket_unlock(so, 1);
1602 soclose(so);
1603 /* Propagate socket filter's error code to the caller */
1604 } else {
1605 socket_unlock(so, 1);
1606 }
1607done:
1608 /* Callee checks for NULL pointer */
1609 sock_freeaddr(remote);
1610 sock_freeaddr(local);
0a7de745 1611 return error;
2d21ac55 1612}
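/*
 * Illustrative sketch (editorial, not part of the original file): the
 * filter side of the EJUSTRETURN convention documented above.  This is
 * a hypothetical sf_accept callback in the kpi_socketfilter.h style;
 * treat the registration details (sflt_register() et al.) and the
 * exact callback signature as assumptions to be verified against the
 * KPI headers.
 */
#if 0
#include <sys/kpi_socketfilter.h>

static errno_t
example_sf_accept(void *cookie, socket_t so_listen, socket_t so,
    const struct sockaddr *local, const struct sockaddr *remote)
{
#pragma unused(cookie, so_listen, so, local, remote)
	/*
	 * Return 0 to let the accept proceed.  Returning EJUSTRETURN
	 * still hands the socket to the caller, but soacceptfilter()
	 * marks it defunct (SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL).
	 * Any other error makes soacceptfilter() close the new socket
	 * and propagate that error to the caller.
	 */
	return 0;
}
#endif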
1c79356b 1613
1614/*
1615 * Returns: 0 Success
1616 * EOPNOTSUPP Operation not supported on socket
1617 * EISCONN Socket is connected
1618 * <pru_connect>:EADDRNOTAVAIL Address not available.
1619 * <pru_connect>:EINVAL Invalid argument
1620 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1621 * <pru_connect>:EACCES Permission denied
1622 * <pru_connect>:EADDRINUSE Address in use
1623 * <pru_connect>:EAGAIN Resource unavailable, try again
1624 * <pru_connect>:EPERM Operation not permitted
1625 * <sf_connect_out>:??? [anything a filter writer might set]
1626 */
1627int
1628soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1c79356b 1629{
1630 int error;
1631 struct proc *p = current_proc();
1c79356b 1632
0a7de745 1633 if (dolock) {
2d21ac55 1634 socket_lock(so, 1);
0a7de745 1635 }
1636
1637 so_update_last_owner_locked(so, p);
1638 so_update_policy(so);
1639
1640#if NECP
1641 so_update_necp_policy(so, NULL, nam);
1642#endif /* NECP */
1643
1644 /*
1645 * If this is a listening socket or if this is a previously-accepted
1646 * socket that has been marked as inactive, reject the connect request.
1647 */
1648 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1649 error = EOPNOTSUPP;
1650 if (so->so_flags & SOF_DEFUNCT) {
39037602 1651 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
39236c6e 1652 "(%d)\n", __func__, proc_pid(p),
39037602 1653 proc_best_name(p),
3e170ce0 1654 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1655 SOCK_DOM(so), SOCK_TYPE(so), error);
6d2010ae 1656 }
0a7de745 1657 if (dolock) {
2d21ac55 1658 socket_unlock(so, 1);
1659 }
1660 return error;
91447636 1661 }
2d21ac55 1662
39236c6e 1663 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
0a7de745 1664 if (dolock) {
2d21ac55 1665 socket_unlock(so, 1);
1666 }
1667 return EPERM;
1668 }
1669
1670 /*
1671 * If protocol is connection-based, can only connect once.
1672 * Otherwise, if connected, try to disconnect first.
1673 * This allows user to disconnect by connecting to, e.g.,
1674 * a null address.
1675 */
0a7de745 1676 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1c79356b 1677 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
2d21ac55 1678 (error = sodisconnectlocked(so)))) {
1c79356b 1679 error = EISCONN;
2d21ac55 1680 } else {
1681 /*
1682 * Run connect filter before calling protocol:
1683 * - non-blocking connect returns before completion;
1684 */
6d2010ae 1685 error = sflt_connectout(so, nam);
39236c6e 1686 if (error != 0) {
0a7de745 1687 if (error == EJUSTRETURN) {
91447636 1688 error = 0;
0a7de745 1689 }
6d2010ae 1690 } else {
1691 error = (*so->so_proto->pr_usrreqs->pru_connect)
1692 (so, nam, p);
1693 if (error != 0) {
1694 so->so_state &= ~SS_ISCONNECTING;
1695 }
91447636 1696 }
1c79356b 1697 }
0a7de745 1698 if (dolock) {
2d21ac55 1699 socket_unlock(so, 1);
1700 }
1701 return error;
1702}
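/*
 * Illustrative userspace sketch (editorial, not part of the original
 * file): the "disconnect by connecting to a null address" behavior the
 * comment above describes.  On a connected datagram socket, connect(2)
 * with an AF_UNSPEC address reaches sodisconnectlocked() instead of
 * failing with EISCONN, dissolving the association.
 */
#if 0
#include <sys/socket.h>
#include <string.h>

static int
udp_dissolve(int fd)
{
	struct sockaddr sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_len = sizeof(sa);		/* BSD sockaddrs carry a length */
	sa.sa_family = AF_UNSPEC;	/* the "null address" */
	return connect(fd, &sa, sizeof(sa));
}
#endif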
1703
91447636 1704int
2d21ac55 1705soconnect(struct socket *so, struct sockaddr *nam)
91447636 1706{
0a7de745 1707 return soconnectlock(so, nam, 1);
1708}
1709
1710/*
1711 * Returns: 0 Success
1712 * <pru_connect2>:EINVAL[AF_UNIX]
1713 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1714 * <pru_connect2>:??? [other protocol families]
1715 *
1716 * Notes: <pru_connect2> is not supported by [TCP].
1717 */
1c79356b 1718int
2d21ac55 1719soconnect2(struct socket *so1, struct socket *so2)
1c79356b 1720{
1c79356b 1721 int error;
91447636 1722
0c530ab8 1723 socket_lock(so1, 1);
0a7de745 1724 if (so2->so_proto->pr_lock) {
0c530ab8 1725 socket_lock(so2, 1);
0a7de745 1726 }
1727
1728 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
2d21ac55 1729
0c530ab8 1730 socket_unlock(so1, 1);
0a7de745 1731 if (so2->so_proto->pr_lock) {
0c530ab8 1732 socket_unlock(so2, 1);
1733 }
1734 return error;
1735}
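/*
 * Illustrative userspace sketch (editorial): soconnect2() is the
 * kernel leg of socketpair(2), which cross-connects two fresh sockets
 * via pru_connect2.  Per the note above, this works for AF_UNIX but
 * not for TCP.
 */
#if 0
#include <sys/socket.h>

static int
make_connected_pair(int sv[2])
{
	/* Each end is connected to the other; no bind/listen needed. */
	return socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
}
#endif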
1736
39236c6e 1737int
1738soconnectxlocked(struct socket *so, struct sockaddr *src,
1739 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1740 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1741 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1742{
1743 int error;
1744
1745 so_update_last_owner_locked(so, p);
1746 so_update_policy(so);
3e170ce0 1747
1748 /*
1749 * If this is a listening socket or if this is a previously-accepted
1750 * socket that has been marked as inactive, reject the connect request.
1751 */
1752 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1753 error = EOPNOTSUPP;
1754 if (so->so_flags & SOF_DEFUNCT) {
39037602 1755 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
39236c6e 1756 "(%d)\n", __func__, proc_pid(p),
39037602 1757 proc_best_name(p),
3e170ce0 1758 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1759 SOCK_DOM(so), SOCK_TYPE(so), error);
39236c6e 1760 }
0a7de745 1761 return error;
1762 }
1763
1764 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1765 return EPERM;
1766 }
1767
1768 /*
1769 * If protocol is connection-based, can only connect once
1770 * unless PR_MULTICONN is set. Otherwise, if connected,
1771 * try to disconnect first. This allows user to disconnect
1772 * by connecting to, e.g., a null address.
1773 */
0a7de745 1774 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1775 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1776 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1777 (error = sodisconnectlocked(so)) != 0)) {
1778 error = EISCONN;
1779 } else {
1780 if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1781 (flags & CONNECT_DATA_IDEMPOTENT)) {
1782 so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1783
1784 if (flags & CONNECT_DATA_AUTHENTICATED) {
1785 so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1786 }
1787 }
1788
1789 /*
1790 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1791 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1792 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
 1793	 * Case 3 allows the user to combine write with connect even if they
 1794	 * have no use for TFO (such as regular TCP and UDP).
1795 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1796 */
1797 if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1798 ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1799 so->so_flags1 |= SOF1_PRECONNECT_DATA;
1800 }
1801
1802 /*
1803 * If a user sets data idempotent and does not pass an uio, or
 1804	 * sets CONNECT_RESUME_ON_READ_WRITE, this is an error; reset
1805 * SOF1_DATA_IDEMPOTENT.
1806 */
1807 if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1808 (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1809 /* We should return EINVAL instead perhaps. */
1810 so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1811 }
1812
1813 /*
1814 * Run connect filter before calling protocol:
1815 * - non-blocking connect returns before completion;
1816 */
813fb2f6 1817 error = sflt_connectout(so, dst);
39236c6e 1818 if (error != 0) {
1819 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1820 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
0a7de745 1821 if (error == EJUSTRETURN) {
39236c6e 1822 error = 0;
0a7de745 1823 }
1824 } else {
1825 error = (*so->so_proto->pr_usrreqs->pru_connectx)
813fb2f6 1826 (so, src, dst, p, ifscope, aid, pcid,
3e170ce0 1827 flags, arg, arglen, auio, bytes_written);
1828 if (error != 0) {
1829 so->so_state &= ~SS_ISCONNECTING;
1830 if (error != EINPROGRESS) {
1831 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1832 }
4ba76501 1833 }
1834 }
1835 }
1836
0a7de745 1837 return error;
1838}
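/*
 * Illustrative userspace sketch (editorial): the connectx(2) entry
 * point that lands in soconnectxlocked().  Passing an iovec plus
 * CONNECT_DATA_IDEMPOTENT exercises the SOF1_DATA_IDEMPOTENT /
 * SOF1_PRECONNECT_DATA paths above ("Case 3": data without
 * CONNECT_RESUME_ON_READ_WRITE), i.e. TCP Fast Open style connect
 * with data.  Structure layout and flag names are as documented in
 * connectx(2) on Darwin; verify against your SDK.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>

static ssize_t
tfo_connect_send(int fd, const struct sockaddr *dst, socklen_t dstlen,
    void *buf, size_t buflen)
{
	sa_endpoints_t sae = {
		.sae_dstaddr = dst,
		.sae_dstaddrlen = dstlen,
	};
	struct iovec iov = { .iov_base = buf, .iov_len = buflen };
	size_t sent = 0;

	if (connectx(fd, &sae, SAE_ASSOCID_ANY, CONNECT_DATA_IDEMPOTENT,
	    &iov, 1, &sent, NULL) == -1) {
		return -1;	/* EINPROGRESS is expected when non-blocking */
	}
	return (ssize_t)sent;
}
#endif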
1839
1c79356b 1840int
2d21ac55 1841sodisconnectlocked(struct socket *so)
1c79356b 1842{
1c79356b 1843 int error;
1844
1845 if ((so->so_state & SS_ISCONNECTED) == 0) {
1846 error = ENOTCONN;
1847 goto bad;
1848 }
1849 if (so->so_state & SS_ISDISCONNECTING) {
1850 error = EALREADY;
1851 goto bad;
1852 }
2d21ac55 1853
1c79356b 1854 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
0a7de745 1855 if (error == 0) {
91447636 1856 sflt_notify(so, sock_evt_disconnected, NULL);
0a7de745 1857 }
39236c6e 1858
1c79356b 1859bad:
0a7de745 1860 return error;
1c79356b 1861}
1862
1863/* Locking version */
91447636 1864int
2d21ac55 1865sodisconnect(struct socket *so)
91447636 1866{
2d21ac55 1867 int error;
1868
1869 socket_lock(so, 1);
1870 error = sodisconnectlocked(so);
1871 socket_unlock(so, 1);
0a7de745 1872 return error;
91447636 1873}
1c79356b 1874
39236c6e 1875int
3e170ce0 1876sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1877{
1878 int error;
1879
1880 /*
1881 * Call the protocol disconnectx handler; let it handle all
1882 * matters related to the connection state of this session.
1883 */
1884 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1885 if (error == 0) {
1886 /*
1887 * The event applies only for the session, not for
1888 * the disconnection of individual subflows.
1889 */
0a7de745 1890 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
39236c6e 1891 sflt_notify(so, sock_evt_disconnected, NULL);
0a7de745 1892 }
39236c6e 1893 }
0a7de745 1894 return error;
1895}
1896
1897int
3e170ce0 1898sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1899{
1900 int error;
1901
1902 socket_lock(so, 1);
1903 error = sodisconnectxlocked(so, aid, cid);
1904 socket_unlock(so, 1);
0a7de745 1905 return error;
1906}
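/*
 * Illustrative userspace sketch (editorial): disconnectx(2) is the
 * caller of sodisconnectx() above.  With the wildcard association and
 * connection IDs it tears down the whole session; per the comment in
 * sodisconnectxlocked(), the disconnect event is generated for the
 * session, not for individual subflows.
 */
#if 0
#include <sys/socket.h>

static int
drop_session(int fd)
{
	return disconnectx(fd, SAE_ASSOCID_ANY, SAE_CONNID_ANY);
}
#endif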
1907
0a7de745 1908#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1909
1910/*
1911 * sosendcheck will lock the socket buffer if it isn't locked and
1912 * verify that there is space for the data being inserted.
1913 *
1914 * Returns: 0 Success
1915 * EPIPE
1916 * sblock:EWOULDBLOCK
1917 * sblock:EINTR
1918 * sbwait:EBADF
1919 * sbwait:EINTR
1920 * [so_error]:???
91447636 1921 */
1922int
1923sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
f427ee49 1924 int32_t clen, int32_t atomic, int flags, int *sblocked)
91447636 1925{
0a7de745 1926 int error = 0;
b0d623f7 1927 int32_t space;
0a7de745 1928 int assumelock = 0;
1929
1930restart:
1931 if (*sblocked == 0) {
3a60a9f5 1932 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1933 so->so_send_filt_thread != 0 &&
1934 so->so_send_filt_thread == current_thread()) {
1935 /*
1936 * We're being called recursively from a filter,
1937 * allow this to continue. Radar 4150520.
1938 * Don't set sblocked because we don't want
1939 * to perform an unlock later.
1940 */
1941 assumelock = 1;
2d21ac55 1942 } else {
1943 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1944 if (error) {
0a7de745 1945 if (so->so_flags & SOF_DEFUNCT) {
6d2010ae 1946 goto defunct;
1947 }
1948 return error;
3a60a9f5
A
1949 }
1950 *sblocked = 1;
1951 }
91447636 1952 }
1953
1954 /*
1955 * If a send attempt is made on a socket that has been marked
1956 * as inactive (disconnected), reject the request.
2d21ac55 1957 */
1958 if (so->so_flags & SOF_DEFUNCT) {
1959defunct:
1960 error = EPIPE;
1961 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1962 __func__, proc_selfpid(), proc_best_name(current_proc()),
3e170ce0 1963 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1964 SOCK_DOM(so), SOCK_TYPE(so), error);
0a7de745 1965 return error;
6d2010ae 1966 }
2d21ac55 1967
1968 if (so->so_state & SS_CANTSENDMORE) {
1969#if CONTENT_FILTER
1970 /*
1971 * Can re-inject data of half closed connections
1972 */
1973 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1974 so->so_snd.sb_cfil_thread == current_thread() &&
1975 cfil_sock_data_pending(&so->so_snd) != 0) {
fe8ab488 1976 CFIL_LOG(LOG_INFO,
1977 "so %llx ignore SS_CANTSENDMORE",
1978 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1979 } else
fe8ab488 1980#endif /* CONTENT_FILTER */
0a7de745 1981 return EPIPE;
fe8ab488 1982 }
1983 if (so->so_error) {
1984 error = so->so_error;
1985 so->so_error = 0;
0a7de745 1986 return error;
91447636 1987 }
2d21ac55 1988
91447636 1989 if ((so->so_state & SS_ISCONNECTED) == 0) {
2d21ac55 1990 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
fe8ab488 1991 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
3e170ce0 1992 (resid != 0 || clen == 0) &&
1993 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1994 return ENOTCONN;
1995 }
cb323159 1996 } else if (addr == 0) {
1997 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1998 ENOTCONN : EDESTADDRREQ;
2d21ac55 1999 }
91447636 2000 }
3e170ce0 2001
f427ee49 2002 space = sbspace(&so->so_snd);
39236c6e 2003
0a7de745 2004 if (flags & MSG_OOB) {
91447636 2005 space += 1024;
0a7de745 2006 }
91447636 2007 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2008 clen > so->so_snd.sb_hiwat) {
2009 return EMSGSIZE;
2010 }
39236c6e 2011
316670eb 2012 if ((space < resid + clen &&
2013 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2014 space < clen)) ||
316670eb 2015 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2016 /*
2017 * don't block the connectx call when there's more data
2018 * than can be copied.
2019 */
2020 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2021 if (space == 0) {
0a7de745 2022 return EWOULDBLOCK;
2023 }
2024 if (space < (int32_t)so->so_snd.sb_lowat) {
0a7de745 2025 return 0;
2026 }
2027 }
2028 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2029 assumelock) {
0a7de745 2030 return EWOULDBLOCK;
3a60a9f5 2031 }
0a7de745 2032 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
6d2010ae 2033 *sblocked = 0;
2034 error = sbwait(&so->so_snd);
2035 if (error) {
0a7de745 2036 if (so->so_flags & SOF_DEFUNCT) {
6d2010ae 2037 goto defunct;
2038 }
2039 return error;
2040 }
2041 goto restart;
2042 }
0a7de745 2043 return 0;
2044}
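/*
 * Illustrative userspace sketch (editorial): how sosendcheck()'s
 * blocking rules look from user code.  On a non-blocking socket, a
 * full send buffer surfaces as EWOULDBLOCK (the SS_NBIO/MSG_NBIO
 * branch above); a blocking socket parks in sbwait() instead.  Waiting
 * for POLLOUT approximates sbwait(): it fires once sbspace() rises
 * past the send low-water mark again.
 */
#if 0
#include <sys/socket.h>
#include <poll.h>
#include <errno.h>

static ssize_t
send_all(int fd, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t n = send(fd, buf + off, len - off, 0);
		if (n >= 0) {
			off += (size_t)n;
			continue;
		}
		if (errno == EWOULDBLOCK || errno == EAGAIN) {
			struct pollfd pfd = { .fd = fd, .events = POLLOUT };
			(void)poll(&pfd, 1, -1);	/* userspace sbwait() */
			continue;
		}
		return -1;	/* e.g. EPIPE once SS_CANTSENDMORE is set */
	}
	return (ssize_t)off;
}
#endif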
2045
2046/*
2047 * Send on a socket.
2048 * If send must go all at once and message is larger than
2049 * send buffering, then hard error.
2050 * Lock against other senders.
2051 * If must go all at once and not enough room now, then
2052 * inform user that this would block and do nothing.
2053 * Otherwise, if nonblocking, send as much as possible.
2054 * The data to be sent is described by "uio" if nonzero,
2055 * otherwise by the mbuf chain "top" (which must be null
2056 * if uio is not). Data provided in mbuf chain must be small
2057 * enough to send all at once.
2058 *
2059 * Returns nonzero on error, timeout or signal; callers
2060 * must check for short counts if EINTR/ERESTART are returned.
2061 * Data and control buffers are freed on return.
2062 *
2063 * Returns: 0 Success
2064 * EOPNOTSUPP
2065 * EINVAL
2066 * ENOBUFS
2067 * uiomove:EFAULT
2068 * sosendcheck:EPIPE
2069 * sosendcheck:EWOULDBLOCK
2070 * sosendcheck:EINTR
2071 * sosendcheck:EBADF
2072 * sosendcheck:EINTR
2073 * sosendcheck:??? [value from so_error]
2074 * <pru_send>:ECONNRESET[TCP]
2075 * <pru_send>:EINVAL[TCP]
2076 * <pru_send>:ENOBUFS[TCP]
2077 * <pru_send>:EADDRINUSE[TCP]
2078 * <pru_send>:EADDRNOTAVAIL[TCP]
2079 * <pru_send>:EAFNOSUPPORT[TCP]
2080 * <pru_send>:EACCES[TCP]
2081 * <pru_send>:EAGAIN[TCP]
2082 * <pru_send>:EPERM[TCP]
2083 * <pru_send>:EMSGSIZE[TCP]
2084 * <pru_send>:EHOSTUNREACH[TCP]
2085 * <pru_send>:ENETUNREACH[TCP]
2086 * <pru_send>:ENETDOWN[TCP]
2087 * <pru_send>:ENOMEM[TCP]
2088 * <pru_send>:ENOBUFS[TCP]
2089 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2090 * <pru_send>:EINVAL[AF_UNIX]
2091 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2092 * <pru_send>:EPIPE[AF_UNIX]
2093 * <pru_send>:ENOTCONN[AF_UNIX]
2094 * <pru_send>:EISCONN[AF_UNIX]
2095 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2096 * <sf_data_out>:??? [whatever a filter author chooses]
2097 *
2098 * Notes: Other <pru_send> returns depend on the protocol family; all
2099 * <sf_data_out> returns depend on what the filter author causes
2100 * their filter to return.
2101 */
2102int
2103sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2104 struct mbuf *top, struct mbuf *control, int flags)
2105{
2106 struct mbuf **mp;
39236c6e 2107 struct mbuf *m, *freelist = NULL;
3e170ce0 2108 user_ssize_t space, len, resid, orig_resid;
91447636 2109 int clen = 0, error, dontroute, mlen, sendflags;
1c79356b 2110 int atomic = sosendallatonce(so) || top;
91447636 2111 int sblocked = 0;
1c79356b 2112 struct proc *p = current_proc();
2113 uint16_t headroom = 0;
2114 boolean_t en_tracing = FALSE;
1c79356b 2115
0a7de745 2116 if (uio != NULL) {
91447636 2117 resid = uio_resid(uio);
0a7de745 2118 } else {
1c79356b 2119 resid = top->m_pkthdr.len;
0a7de745 2120 }
39236c6e 2121
2122 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2123 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1c79356b 2124
91447636 2125 socket_lock(so, 1);
fe8ab488 2126
 2127	/*
 2128	 * Trace if tracing is enabled, the socket is a network (vs. unix)
 2129	 * socket, and the traffic is non-loopback
2130 */
2131 if (ENTR_SHOULDTRACE &&
2132 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2133 struct inpcb *inp = sotoinpcb(so);
2134 if (inp->inp_last_outifp != NULL &&
2135 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2136 en_tracing = TRUE;
2137 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2138 VM_KERNEL_ADDRPERM(so),
2139 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2140 (int64_t)resid);
2141 orig_resid = resid;
2142 }
2143 }
2144
2145 /*
2146 * Re-injection should not affect process accounting
2147 */
2148 if ((flags & MSG_SKIPCFIL) == 0) {
2149 so_update_last_owner_locked(so, p);
2150 so_update_policy(so);
2151
fe8ab488 2152#if NECP
3e170ce0 2153 so_update_necp_policy(so, NULL, addr);
2154#endif /* NECP */
2155 }
3e170ce0 2156
2157 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2158 error = EOPNOTSUPP;
5ba3f43e 2159 goto out_locked;
2d21ac55 2160 }
91447636 2161
2162 /*
2163 * In theory resid should be unsigned.
2164 * However, space must be signed, as it might be less than 0
2165 * if we over-committed, and we must use a signed comparison
2166 * of space and resid. On the other hand, a negative resid
2167 * causes us to loop sending 0-length segments to the protocol.
2168 *
39236c6e 2169 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
39236c6e 2170 *
fe8ab488 2171 * Note: We limit resid to be a positive int value as we use
39236c6e 2172 * imin() to set bytes_to_copy -- radr://14558484
1c79356b 2173 */
2174 if (resid < 0 || resid > INT_MAX ||
2175 (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
1c79356b 2176 error = EINVAL;
5ba3f43e 2177 goto out_locked;
2178 }
2179
2180 dontroute = (flags & MSG_DONTROUTE) &&
2181 (so->so_options & SO_DONTROUTE) == 0 &&
1c79356b 2182 (so->so_proto->pr_flags & PR_ATOMIC);
b0d623f7 2183 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
39236c6e 2184
0a7de745 2185 if (control != NULL) {
1c79356b 2186 clen = control->m_len;
0a7de745 2187 }
1c79356b 2188
0a7de745 2189 if (soreserveheadroom != 0) {
3e170ce0 2190 headroom = so->so_pktheadroom;
0a7de745 2191 }
3e170ce0 2192
1c79356b 2193 do {
2d21ac55 2194 error = sosendcheck(so, addr, resid, clen, atomic, flags,
f427ee49 2195 &sblocked);
0a7de745 2196 if (error) {
5ba3f43e 2197 goto out_locked;
0a7de745 2198 }
39236c6e 2199
1c79356b 2200 mp = &top;
f427ee49 2201 space = sbspace(&so->so_snd) - clen;
39236c6e 2202 space += ((flags & MSG_OOB) ? 1024 : 0);
fa4905b1 2203
1c79356b 2204 do {
2d21ac55 2205 if (uio == NULL) {
2206 /*
2207 * Data is prepackaged in "top".
2208 */
2209 resid = 0;
0a7de745 2210 if (flags & MSG_EOR) {
1c79356b 2211 top->m_flags |= M_EOR;
0a7de745 2212 }
91447636 2213 } else {
2214 int chainlength;
2215 int bytes_to_copy;
2216 boolean_t jumbocl;
fe8ab488 2217 boolean_t bigcl;
3e170ce0 2218 int bytes_to_alloc;
2d21ac55 2219
b0d623f7 2220 bytes_to_copy = imin(resid, space);
2d21ac55 2221
3e170ce0 2222 bytes_to_alloc = bytes_to_copy;
0a7de745 2223 if (top == NULL) {
3e170ce0 2224 bytes_to_alloc += headroom;
0a7de745 2225 }
3e170ce0 2226
0a7de745 2227 if (sosendminchain > 0) {
91447636 2228 chainlength = 0;
0a7de745 2229 } else {
91447636 2230 chainlength = sosendmaxchain;
0a7de745 2231 }
2d21ac55 2232
fe8ab488 2233 /*
 2234	 * Use big 4 KB clusters when the outgoing interface
 2235	 * does not prefer 2 KB clusters
fe8ab488 2236 */
3e170ce0 2237 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
fe8ab488 2238 sosendbigcl_ignore_capab;
3e170ce0 2239
2240 /*
2241 * Attempt to use larger than system page-size
2242 * clusters for large writes only if there is
2243 * a jumbo cluster pool and if the socket is
2244 * marked accordingly.
2245 */
2246 jumbocl = sosendjcl && njcl > 0 &&
2247 ((so->so_flags & SOF_MULTIPAGES) ||
2248 sosendjcl_ignore_capab) &&
2249 bigcl;
2d21ac55 2250
91447636 2251 socket_unlock(so, 0);
2d21ac55 2252
2253 do {
2254 int num_needed;
39236c6e 2255 int hdrs_needed = (top == NULL) ? 1 : 0;
2d21ac55 2256
91447636 2257				/*
 2258				 * Try to maintain a local cache of mbuf
 2259				 * clusters needed to complete this
 2260				 * write; the list is further limited to
 2261				 * the number that are currently needed
 2262				 * to fill the socket. This mechanism
 2263				 * allows a large number of mbufs/
 2264				 * clusters to be grabbed under a single
 2265				 * mbuf lock... if we can't get any
 2266				 * clusters, then fall back to trying
 2267				 * for mbufs. If we fail early (or
 2268				 * miscalculate the number needed), make
 2269				 * sure to release any clusters we
 2270				 * haven't yet consumed.
91447636 2271				 */
2d21ac55 2272 if (freelist == NULL &&
3e170ce0 2273 bytes_to_alloc > MBIGCLBYTES &&
6d2010ae 2274 jumbocl) {
2d21ac55 2275 num_needed =
3e170ce0 2276 bytes_to_alloc / M16KCLBYTES;
2d21ac55 2277
3e170ce0 2278 if ((bytes_to_alloc -
2d21ac55 2279 (num_needed * M16KCLBYTES))
0a7de745 2280 >= MINCLSIZE) {
2d21ac55 2281 num_needed++;
0a7de745 2282 }
91447636 2283
2284 freelist =
2285 m_getpackets_internal(
2286 (unsigned int *)&num_needed,
2287 hdrs_needed, M_WAIT, 0,
2288 M16KCLBYTES);
2289 /*
2290 * Fall back to 4K cluster size
2291 * if allocation failed
2292 */
2293 }
2294
2295 if (freelist == NULL &&
3e170ce0 2296 bytes_to_alloc > MCLBYTES &&
fe8ab488 2297 bigcl) {
2d21ac55 2298 num_needed =
3e170ce0 2299 bytes_to_alloc / MBIGCLBYTES;
2d21ac55 2300
3e170ce0 2301 if ((bytes_to_alloc -
6d2010ae 2302 (num_needed * MBIGCLBYTES)) >=
0a7de745 2303 MINCLSIZE) {
91447636 2304 num_needed++;
0a7de745 2305 }
2306
2307 freelist =
2308 m_getpackets_internal(
2309 (unsigned int *)&num_needed,
2310 hdrs_needed, M_WAIT, 0,
2311 MBIGCLBYTES);
2312 /*
2313 * Fall back to cluster size
2314 * if allocation failed
2315 */
91447636 2316 }
2d21ac55 2317
 2318				/*
 2319				 * Allocate a cluster, as we want to
 2320				 * avoid splitting the data across more
 2321				 * than one segment; using MINCLSIZE
 2322				 * would lead us to allocate two mbufs
 2323				 */
2324 if (soreserveheadroom != 0 &&
2325 freelist == NULL &&
2326 ((top == NULL &&
2327 bytes_to_alloc > _MHLEN) ||
2328 bytes_to_alloc > _MLEN)) {
2329 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2330 MCLBYTES;
2331 freelist =
2332 m_getpackets_internal(
2333 (unsigned int *)&num_needed,
2334 hdrs_needed, M_WAIT, 0,
2335 MCLBYTES);
2336 /*
2337 * Fall back to a single mbuf
2338 * if allocation failed
2339 */
2340 } else if (freelist == NULL &&
2341 bytes_to_alloc > MINCLSIZE) {
2d21ac55 2342 num_needed =
3e170ce0 2343 bytes_to_alloc / MCLBYTES;
2d21ac55 2344
3e170ce0 2345 if ((bytes_to_alloc -
2d21ac55 2346 (num_needed * MCLBYTES)) >=
0a7de745 2347 MINCLSIZE) {
91447636 2348 num_needed++;
0a7de745 2349 }
2350
2351 freelist =
2352 m_getpackets_internal(
2353 (unsigned int *)&num_needed,
2354 hdrs_needed, M_WAIT, 0,
2355 MCLBYTES);
2356 /*
2357 * Fall back to a single mbuf
2358 * if allocation failed
2359 */
91447636 2360 }
2361 /*
2362 * For datagram protocols, leave
2363 * headroom for protocol headers
2364 * in the first cluster of the chain
2365 */
2366 if (freelist != NULL && atomic &&
2367 top == NULL && headroom > 0) {
2368 freelist->m_data += headroom;
2369 }
39037602 2370
2371 /*
2372 * Fall back to regular mbufs without
2373 * reserving the socket headroom
2374 */
91447636 2375 if (freelist == NULL) {
2376 if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2377 if (top == NULL) {
2378 MGETHDR(freelist,
2379 M_WAIT, MT_DATA);
2380 } else {
2381 MGET(freelist,
2382 M_WAIT, MT_DATA);
2383 }
0a7de745 2384 }
2385
2386 if (freelist == NULL) {
2387 error = ENOBUFS;
2388 socket_lock(so, 0);
5ba3f43e 2389 goto out_locked;
2390 }
2391 /*
2392 * For datagram protocols,
2393 * leave room for protocol
2394 * headers in first mbuf.
91447636 2395 */
39236c6e 2396 if (atomic && top == NULL &&
2397 bytes_to_copy < MHLEN) {
2398 MH_ALIGN(freelist,
2399 bytes_to_copy);
2400 }
2401 }
2402 m = freelist;
2403 freelist = m->m_next;
2404 m->m_next = NULL;
2d21ac55 2405
0a7de745 2406 if ((m->m_flags & M_EXT)) {
3e170ce0 2407 mlen = m->m_ext.ext_size -
d9a64523 2408 M_LEADINGSPACE(m);
0a7de745 2409 } else if ((m->m_flags & M_PKTHDR)) {
2d21ac55 2410 mlen =
d9a64523 2411 MHLEN - M_LEADINGSPACE(m);
0a7de745 2412 } else {
d9a64523 2413 mlen = MLEN - M_LEADINGSPACE(m);
0a7de745 2414 }
b0d623f7 2415 len = imin(mlen, bytes_to_copy);
2416
2417 chainlength += len;
2d21ac55 2418
91447636 2419 space -= len;
fa4905b1 2420
2d21ac55 2421 error = uiomove(mtod(m, caddr_t),
b0d623f7 2422 len, uio);
2d21ac55 2423
91447636 2424 resid = uio_resid(uio);
2d21ac55 2425
2426 m->m_len = len;
2427 *mp = m;
2428 top->m_pkthdr.len += len;
0a7de745 2429 if (error) {
91447636 2430 break;
0a7de745 2431 }
2432 mp = &m->m_next;
2433 if (resid <= 0) {
0a7de745 2434 if (flags & MSG_EOR) {
91447636 2435 top->m_flags |= M_EOR;
0a7de745 2436 }
2437 break;
2438 }
2439 bytes_to_copy = min(resid, space);
2440 } while (space > 0 &&
2441 (chainlength < sosendmaxchain || atomic ||
2442 resid < MINCLSIZE));
2443
91447636 2444 socket_lock(so, 0);
2d21ac55 2445
0a7de745 2446 if (error) {
5ba3f43e 2447 goto out_locked;
0a7de745 2448 }
91447636 2449 }
2d21ac55 2450
0a7de745 2451 if (dontroute) {
2d21ac55 2452 so->so_options |= SO_DONTROUTE;
0a7de745 2453 }
2d21ac55 2454
2455 /*
2456 * Compute flags here, for pru_send and NKEs
2457 *
2458 * If the user set MSG_EOF, the protocol
2459 * understands this flag and nothing left to
2460 * send then use PRU_SEND_EOF instead of PRU_SEND.
2461 */
2d21ac55 2462 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2d21ac55 2463 ((flags & MSG_EOF) &&
2464 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2465 (resid <= 0)) ? PRUS_EOF :
2466 /* If there is more to send set PRUS_MORETOCOME */
2467 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2468
2469 if ((flags & MSG_SKIPCFIL) == 0) {
2470 /*
2471 * Socket filter processing
2472 */
2473 error = sflt_data_out(so, addr, &top,
2474 &control, (sendflags & MSG_OOB) ?
2475 sock_data_filt_flag_oob : 0);
2476 if (error) {
2477 if (error == EJUSTRETURN) {
2478 error = 0;
2a1bd2d3 2479 goto packet_consumed;
fe8ab488 2480 }
5ba3f43e 2481 goto out_locked;
91447636 2482 }
2483#if CONTENT_FILTER
2484 /*
2485 * Content filter processing
2486 */
2487 error = cfil_sock_data_out(so, addr, top,
d9a64523 2488 control, sendflags);
2489 if (error) {
2490 if (error == EJUSTRETURN) {
2491 error = 0;
c3c9b80d 2492 goto packet_consumed;
0a7de745 2493 }
5ba3f43e 2494 goto out_locked;
2495 }
2496#endif /* CONTENT_FILTER */
1c79356b 2497 }
6d2010ae 2498 error = (*so->so_proto->pr_usrreqs->pru_send)
2499 (so, sendflags, top, addr, control, p);
2500
2a1bd2d3 2501packet_consumed:
0a7de745 2502 if (dontroute) {
2d21ac55 2503 so->so_options &= ~SO_DONTROUTE;
0a7de745 2504 }
2505
2506 clen = 0;
f427ee49 2507 control = NULL;
39236c6e 2508 top = NULL;
2d21ac55 2509 mp = &top;
0a7de745 2510 if (error) {
5ba3f43e 2511 goto out_locked;
0a7de745 2512 }
1c79356b
A
2513 } while (resid && space > 0);
2514 } while (resid);
2515
5ba3f43e 2516out_locked:
2517 if (sblocked) {
2518 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2519 } else {
3a60a9f5 2520 socket_unlock(so, 1);
2521 }
2522 if (top != NULL) {
1c79356b 2523 m_freem(top);
2524 }
2525 if (control != NULL) {
1c79356b 2526 m_freem(control);
2527 }
2528 if (freelist != NULL) {
2d21ac55 2529 m_freem_list(freelist);
0a7de745 2530 }
1c79356b 2531
5ba3f43e 2532 soclearfastopen(so);
2533
2534 if (en_tracing) {
2535 /* resid passed here is the bytes left in uio */
2536 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2537 VM_KERNEL_ADDRPERM(so),
2538 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2539 (int64_t)(orig_resid - resid));
2540 }
2541 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2542 so->so_snd.sb_cc, space, error);
1c79356b 2543
0a7de745 2544 return error;
2545}
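/*
 * Illustrative userspace sketch (editorial): one practical consequence
 * of the sosendcheck()/sosend() size checks.  For atomic protocols a
 * single send larger than the send buffer's high-water mark fails with
 * EMSGSIZE rather than blocking, so a UDP sender that needs large
 * datagrams raises SO_SNDBUF first (subject to system-wide limits).
 */
#if 0
#include <sys/socket.h>

static int
allow_big_datagrams(int udp_fd, int bytes)
{
	return setsockopt(udp_fd, SOL_SOCKET, SO_SNDBUF,
	    &bytes, sizeof(bytes));
}
#endif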
2546
2547int
2548sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2549{
cb323159 2550 struct mbuf *m0 = NULL, *control_end = NULL;
2551
2552 socket_lock_assert_owned(so);
2553
 2554	/*
 2555	 * top must point to the mbuf chain to be sent.
 2556	 * If control is not NULL, top must be a packet header.
 2557	 */
2558 VERIFY(top != NULL &&
0a7de745 2559 (control == NULL || top->m_flags & M_PKTHDR));
2560
2561 /*
2562 * If control is not passed in, see if we can get it
2563 * from top.
2564 */
2565 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2566 // Locate start of control if present and start of data
2567 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2568 if (m0->m_flags & M_PKTHDR) {
2569 top = m0;
2570 break;
2571 } else if (m0->m_type == MT_CONTROL) {
2572 if (control == NULL) {
2573 // Found start of control
2574 control = m0;
2575 }
2576 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2577 // Found end of control
2578 control_end = m0;
2579 }
2580 }
2581 }
0a7de745 2582 if (control_end != NULL) {
d9a64523 2583 control_end->m_next = NULL;
0a7de745 2584 }
2585 }
2586
2587 int error = (*so->so_proto->pr_usrreqs->pru_send)
0a7de745 2588 (so, sendflags, top, addr, control, current_proc());
2589
2590 return error;
2591}
2592
 2593/*
 2594 * Supported only on connected sockets (no address), without ancillary
 2595 * data (control mbuf), for atomic protocols
 2596 */
fe8ab488 2597int
3e170ce0 2598sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2599{
2600 struct mbuf *m, *freelist = NULL;
2601 user_ssize_t len, resid;
2602 int error, dontroute, mlen;
2603 int atomic = sosendallatonce(so);
2604 int sblocked = 0;
2605 struct proc *p = current_proc();
2606 u_int uiofirst = 0;
2607 u_int uiolast = 0;
2608 struct mbuf *top = NULL;
2609 uint16_t headroom = 0;
2610 boolean_t bigcl;
2611
2612 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2613 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2614
2615 if (so->so_type != SOCK_DGRAM) {
2616 error = EINVAL;
2617 goto out;
2618 }
2619 if (atomic == 0) {
2620 error = EINVAL;
2621 goto out;
2622 }
2623 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2624 error = EPROTONOSUPPORT;
2625 goto out;
2626 }
2627 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2628 error = EINVAL;
2629 goto out;
2630 }
3e170ce0 2631 resid = uio_array_resid(uioarray, uiocnt);
2632
2633 /*
2634 * In theory resid should be unsigned.
2635 * However, space must be signed, as it might be less than 0
2636 * if we over-committed, and we must use a signed comparison
2637 * of space and resid. On the other hand, a negative resid
2638 * causes us to loop sending 0-length segments to the protocol.
2639 *
2640 * Note: We limit resid to be a positive int value as we use
2641 * imin() to set bytes_to_copy -- radr://14558484
2642 */
2643 if (resid < 0 || resid > INT_MAX) {
2644 error = EINVAL;
2645 goto out;
2646 }
2647
2648 socket_lock(so, 1);
2649 so_update_last_owner_locked(so, p);
2650 so_update_policy(so);
3e170ce0 2651
fe8ab488 2652#if NECP
3e170ce0 2653 so_update_necp_policy(so, NULL, NULL);
fe8ab488 2654#endif /* NECP */
3e170ce0 2655
fe8ab488
A
2656 dontroute = (flags & MSG_DONTROUTE) &&
2657 (so->so_options & SO_DONTROUTE) == 0 &&
2658 (so->so_proto->pr_flags & PR_ATOMIC);
2659 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2660
f427ee49 2661 error = sosendcheck(so, NULL, resid, 0, atomic, flags, &sblocked);
0a7de745 2662 if (error) {
fe8ab488 2663 goto release;
0a7de745 2664 }
fe8ab488 2665
2666 /*
2667 * Use big 4 KB clusters when the outgoing interface does not prefer
2668 * 2 KB clusters
2669 */
2670 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2671
0a7de745 2672 if (soreserveheadroom != 0) {
3e170ce0 2673 headroom = so->so_pktheadroom;
0a7de745 2674 }
3e170ce0 2675
2676 do {
2677 int i;
2678 int num_needed = 0;
2679 int chainlength;
2680 size_t maxpktlen = 0;
2681 int bytes_to_alloc;
fe8ab488 2682
0a7de745 2683 if (sosendminchain > 0) {
3e170ce0 2684 chainlength = 0;
0a7de745 2685 } else {
3e170ce0 2686 chainlength = sosendmaxchain;
0a7de745 2687 }
fe8ab488 2688
3e170ce0 2689 socket_unlock(so, 0);
fe8ab488 2690
 2691		/*
 2692		 * Find a set of uios that fit in a reasonable number
2693 * of mbuf packets
2694 */
2695 for (i = uiofirst; i < uiocnt; i++) {
2696 struct uio *auio = uioarray[i];
fe8ab488 2697
3e170ce0 2698 len = uio_resid(auio);
fe8ab488 2699
3e170ce0 2700 /* Do nothing for empty messages */
0a7de745 2701 if (len == 0) {
3e170ce0 2702 continue;
0a7de745 2703 }
fe8ab488 2704
2705 num_needed += 1;
2706 uiolast += 1;
fe8ab488 2707
0a7de745 2708 if (len > maxpktlen) {
3e170ce0 2709 maxpktlen = len;
0a7de745 2710 }
fe8ab488 2711
3e170ce0 2712 chainlength += len;
0a7de745 2713 if (chainlength > sosendmaxchain) {
fe8ab488 2714 break;
0a7de745 2715 }
2716 }
2717 /*
2718 * Nothing left to send
2719 */
2720 if (num_needed == 0) {
2721 socket_lock(so, 0);
2722 break;
2723 }
2724 /*
2725 * Allocate buffer large enough to include headroom space for
2726 * network and link header
39037602 2727 *
2728 */
2729 bytes_to_alloc = maxpktlen + headroom;
2730
2731 /*
2732 * Allocate a single contiguous buffer of the smallest available
2733 * size when possible
2734 */
2735 if (bytes_to_alloc > MCLBYTES &&
2736 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2737 freelist = m_getpackets_internal(
2738 (unsigned int *)&num_needed,
2739 num_needed, M_WAIT, 1,
2740 MBIGCLBYTES);
2741 } else if (bytes_to_alloc > _MHLEN &&
2742 bytes_to_alloc <= MCLBYTES) {
2743 freelist = m_getpackets_internal(
2744 (unsigned int *)&num_needed,
2745 num_needed, M_WAIT, 1,
2746 MCLBYTES);
3e170ce0 2747 } else {
fe8ab488 2748 freelist = m_allocpacket_internal(
2749 (unsigned int *)&num_needed,
2750 bytes_to_alloc, NULL, M_WAIT, 1, 0);
3e170ce0 2751 }
39037602 2752
2753 if (freelist == NULL) {
2754 socket_lock(so, 0);
2755 error = ENOMEM;
2756 goto release;
2757 }
2758 /*
2759 * Copy each uio of the set into its own mbuf packet
2760 */
2761 for (i = uiofirst, m = freelist;
2762 i < uiolast && m != NULL;
2763 i++) {
2764 int bytes_to_copy;
2765 struct mbuf *n;
2766 struct uio *auio = uioarray[i];
fe8ab488 2767
2768 bytes_to_copy = uio_resid(auio);
2769
2770 /* Do nothing for empty messages */
0a7de745 2771 if (bytes_to_copy == 0) {
3e170ce0 2772 continue;
0a7de745 2773 }
fe8ab488 2774 /*
2775 * Leave headroom for protocol headers
2776 * in the first mbuf of the chain
fe8ab488 2777 */
2778 m->m_data += headroom;
2779
2780 for (n = m; n != NULL; n = n->m_next) {
0a7de745 2781 if ((m->m_flags & M_EXT)) {
3e170ce0 2782 mlen = m->m_ext.ext_size -
d9a64523 2783 M_LEADINGSPACE(m);
0a7de745 2784 } else if ((m->m_flags & M_PKTHDR)) {
3e170ce0 2785 mlen =
d9a64523 2786 MHLEN - M_LEADINGSPACE(m);
0a7de745 2787 } else {
d9a64523 2788 mlen = MLEN - M_LEADINGSPACE(m);
0a7de745 2789 }
3e170ce0 2790 len = imin(mlen, bytes_to_copy);
fe8ab488 2791
2792 /*
2793 * Note: uiomove() decrements the iovec
2794 * length
2795 */
2796 error = uiomove(mtod(n, caddr_t),
2797 len, auio);
0a7de745 2798 if (error != 0) {
fe8ab488 2799 break;
0a7de745 2800 }
2801 n->m_len = len;
2802 m->m_pkthdr.len += len;
fe8ab488 2803
3e170ce0 2804 VERIFY(m->m_pkthdr.len <= maxpktlen);
fe8ab488 2805
2806 bytes_to_copy -= len;
2807 resid -= len;
2808 }
2809 if (m->m_pkthdr.len == 0) {
2810 printf(
2811 "%s:%d so %llx pkt %llx type %u len null\n",
2812 __func__, __LINE__,
2813 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2814 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2815 m->m_type);
3e170ce0 2816 }
0a7de745 2817 if (error != 0) {
3e170ce0 2818 break;
0a7de745 2819 }
3e170ce0 2820 m = m->m_nextpkt;
2821 }
2822
2823 socket_lock(so, 0);
2824
0a7de745 2825 if (error) {
3e170ce0 2826 goto release;
0a7de745 2827 }
2828 top = freelist;
2829 freelist = NULL;
2830
0a7de745 2831 if (dontroute) {
fe8ab488 2832 so->so_options |= SO_DONTROUTE;
0a7de745 2833 }
2834
2835 if ((flags & MSG_SKIPCFIL) == 0) {
2836 struct mbuf **prevnextp = NULL;
3e170ce0 2837
fe8ab488
A
2838 for (i = uiofirst, m = top;
2839 i < uiolast && m != NULL;
2840 i++) {
2841 struct mbuf *nextpkt = m->m_nextpkt;
2842
2843 /*
2844 * Socket filter processing
2845 */
2846 error = sflt_data_out(so, NULL, &m,
2847 NULL, 0);
0a7de745 2848 if (error != 0 && error != EJUSTRETURN) {
fe8ab488 2849 goto release;
0a7de745 2850 }
3e170ce0 2851
2852#if CONTENT_FILTER
2853 if (error == 0) {
2854 /*
2855 * Content filter processing
2856 */
2857 error = cfil_sock_data_out(so, NULL, m,
2858 NULL, 0);
0a7de745 2859 if (error != 0 && error != EJUSTRETURN) {
fe8ab488 2860 goto release;
0a7de745 2861 }
2862 }
2863#endif /* CONTENT_FILTER */
2864 /*
2865 * Remove packet from the list when
2866 * swallowed by a filter
2867 */
2868 if (error == EJUSTRETURN) {
2869 error = 0;
0a7de745 2870 if (prevnextp != NULL) {
fe8ab488 2871 *prevnextp = nextpkt;
0a7de745 2872 } else {
fe8ab488 2873 top = nextpkt;
0a7de745 2874 }
2875 }
2876
fe8ab488 2877 m = nextpkt;
0a7de745 2878 if (m != NULL) {
fe8ab488 2879 prevnextp = &m->m_nextpkt;
0a7de745 2880 }
2881 }
2882 }
0a7de745 2883 if (top != NULL) {
fe8ab488 2884 error = (*so->so_proto->pr_usrreqs->pru_send_list)
3e170ce0 2885 (so, 0, top, NULL, NULL, p);
0a7de745 2886 }
fe8ab488 2887
0a7de745 2888 if (dontroute) {
fe8ab488 2889 so->so_options &= ~SO_DONTROUTE;
0a7de745 2890 }
fe8ab488 2891
2892 top = NULL;
2893 uiofirst = uiolast;
2894 } while (resid > 0 && error == 0);
2895release:
2896 if (sblocked) {
2897 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2898 } else {
fe8ab488 2899 socket_unlock(so, 1);
0a7de745 2900 }
fe8ab488 2901out:
0a7de745 2902 if (top != NULL) {
fe8ab488 2903 m_freem(top);
2904 }
2905 if (freelist != NULL) {
fe8ab488 2906 m_freem_list(freelist);
0a7de745 2907 }
2908
2909 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2910 so->so_snd.sb_cc, 0, error);
2911
0a7de745 2912 return error;
2913}
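/*
 * Illustrative userspace sketch (editorial): sosend_list() moves a
 * whole batch of datagrams under a single socket-buffer lock and
 * allocation pass.  On Darwin it backs a batched-send call (sendmsg_x
 * is the likely SPI name; treat that as an assumption, since it is
 * not public API).  The portable equivalent is simply a sendmsg(2)
 * loop over a connected datagram socket:
 */
#if 0
#include <sys/socket.h>

static int
send_batch(int fd, struct msghdr *msgs, unsigned int cnt)
{
	unsigned int i;

	for (i = 0; i < cnt; i++) {
		if (sendmsg(fd, &msgs[i], 0) == -1) {
			return -1;	/* caller inspects errno */
		}
	}
	return 0;
}
#endif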
2914
2915/*
2916 * May return ERESTART when packet is dropped by MAC policy check
2917 */
2918static int
2919soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2920 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2921{
2922 int error = 0;
2923 struct mbuf *m = *mp;
2924 struct mbuf *nextrecord = *nextrecordp;
2925
2926 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2927#if CONFIG_MACF_SOCKET_SUBSET
2928 /*
2929 * Call the MAC framework for policy checking if we're in
2930 * the user process context and the socket isn't connected.
2931 */
2932 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2933 struct mbuf *m0 = m;
2934 /*
2935 * Dequeue this record (temporarily) from the receive
2936 * list since we're about to drop the socket's lock
2937 * where a new record may arrive and be appended to
2938 * the list. Upon MAC policy failure, the record
2939 * will be freed. Otherwise, we'll add it back to
2940 * the head of the list. We cannot rely on SB_LOCK
2941 * because append operation uses the socket's lock.
2942 */
2943 do {
2944 m->m_nextpkt = NULL;
2945 sbfree(&so->so_rcv, m);
2946 m = m->m_next;
2947 } while (m != NULL);
2948 m = m0;
2949 so->so_rcv.sb_mb = nextrecord;
2950 SB_EMPTY_FIXUP(&so->so_rcv);
2951 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2952 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2953 socket_unlock(so, 0);
2954
2955 if (mac_socket_check_received(proc_ucred(p), so,
2956 mtod(m, struct sockaddr *)) != 0) {
2957 /*
2958 * MAC policy failure; free this record and
2959 * process the next record (or block until
2960 * one is available). We have adjusted sb_cc
2961 * and sb_mbcnt above so there is no need to
2962 * call sbfree() again.
2963 */
2964 m_freem(m);
2965 /*
2966 * Clear SB_LOCK but don't unlock the socket.
2967 * Process the next record or wait for one.
2968 */
2969 socket_lock(so, 0);
2970 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2971 error = ERESTART;
2972 goto done;
2973 }
2974 socket_lock(so, 0);
2975 /*
2976 * If the socket has been defunct'd, drop it.
2977 */
2978 if (so->so_flags & SOF_DEFUNCT) {
2979 m_freem(m);
2980 error = ENOTCONN;
2981 goto done;
2982 }
2983 /*
2984 * Re-adjust the socket receive list and re-enqueue
2985 * the record in front of any packets which may have
2986 * been appended while we dropped the lock.
2987 */
0a7de745 2988 for (m = m0; m->m_next != NULL; m = m->m_next) {
3e170ce0 2989 sballoc(&so->so_rcv, m);
0a7de745 2990 }
3e170ce0
A
2991 sballoc(&so->so_rcv, m);
2992 if (so->so_rcv.sb_mb == NULL) {
2993 so->so_rcv.sb_lastrecord = m0;
2994 so->so_rcv.sb_mbtail = m;
2995 }
2996 m = m0;
2997 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2998 so->so_rcv.sb_mb = m;
2999 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3000 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3001 }
3002#endif /* CONFIG_MACF_SOCKET_SUBSET */
3003 if (psa != NULL) {
3004 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3005 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3006 error = EWOULDBLOCK;
3007 goto done;
3008 }
3009 }
3010 if (flags & MSG_PEEK) {
3011 m = m->m_next;
3012 } else {
3013 sbfree(&so->so_rcv, m);
3014 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3015 panic("%s: about to create invalid socketbuf",
3016 __func__);
3017 /* NOTREACHED */
3018 }
3019 MFREE(m, so->so_rcv.sb_mb);
3020 m = so->so_rcv.sb_mb;
3021 if (m != NULL) {
3022 m->m_nextpkt = nextrecord;
3023 } else {
3024 so->so_rcv.sb_mb = nextrecord;
3025 SB_EMPTY_FIXUP(&so->so_rcv);
3026 }
3027 }
3028done:
3029 *mp = m;
3030 *nextrecordp = nextrecord;
3031
0a7de745 3032 return error;
3e170ce0
A
3033}
3034
 3035/*
 3036 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created,
 3037 * so clear the data portion in order not to leak the file pointers.
 3038 */
3039static void
3040sopeek_scm_rights(struct mbuf *rights)
3041{
3042 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3043
3044 if (cm->cmsg_type == SCM_RIGHTS) {
3045 memset(cm + 1, 0, cm->cmsg_len - sizeof(*cm));
3046 }
3047}
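/*
 * Illustrative userspace sketch (editorial): the observable effect of
 * sopeek_scm_rights().  Peeking a unix-domain message that carries
 * SCM_RIGHTS returns the control header, but the descriptor payload
 * is zero-filled; no file descriptors are created until a real
 * (non-PEEK) receive externalizes them.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>

static int
peek_has_rights(int fd)
{
	char data[1], cbuf[CMSG_SPACE(sizeof(int) * 8)];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;

	if (recvmsg(fd, &msg, MSG_PEEK) == -1) {
		return -1;
	}
	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
			return 1;	/* present, but payload reads as zeros */
		}
	}
	return 0;
}
#endif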
3048
3e170ce0
A
3049/*
3050 * Process one or more MT_CONTROL mbufs present before any data mbufs
3051 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3052 * just copy the data; if !MSG_PEEK, we call into the protocol to
3053 * perform externalization.
3054 */
3055static int
3056soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3057 struct mbuf **mp, struct mbuf **nextrecordp)
3058{
3059 int error = 0;
3060 struct mbuf *cm = NULL, *cmn;
3061 struct mbuf **cme = &cm;
3062 struct sockbuf *sb_rcv = &so->so_rcv;
3063 struct mbuf **msgpcm = NULL;
3064 struct mbuf *m = *mp;
3065 struct mbuf *nextrecord = *nextrecordp;
3066 struct protosw *pr = so->so_proto;
3067
3068 /*
3069 * Externalizing the control messages would require us to
3070 * drop the socket's lock below. Once we re-acquire the
3071 * lock, the mbuf chain might change. In order to preserve
3072 * consistency, we unlink all control messages from the
3073 * first mbuf chain in one shot and link them separately
3074 * onto a different chain.
3075 */
3076 do {
3077 if (flags & MSG_PEEK) {
3078 if (controlp != NULL) {
3079 if (*controlp == NULL) {
3080 msgpcm = controlp;
3081 }
3082 *controlp = m_copy(m, 0, m->m_len);
3083
3084 /*
3085 * If we failed to allocate an mbuf,
3086 * release any previously allocated
3087 * mbufs for control data. Return
3088 * an error. Keep the mbufs in the
3089 * socket as this is using
3090 * MSG_PEEK flag.
3091 */
3092 if (*controlp == NULL) {
3093 m_freem(*msgpcm);
3094 error = ENOBUFS;
3095 goto done;
3096 }
3097
3098 sopeek_scm_rights(*controlp);
3099
3100 controlp = &(*controlp)->m_next;
3101 }
3102 m = m->m_next;
3103 } else {
3104 m->m_nextpkt = NULL;
3105 sbfree(sb_rcv, m);
3106 sb_rcv->sb_mb = m->m_next;
3107 m->m_next = NULL;
3108 *cme = m;
3109 cme = &(*cme)->m_next;
3110 m = sb_rcv->sb_mb;
3111 }
3112 } while (m != NULL && m->m_type == MT_CONTROL);
3113
3114 if (!(flags & MSG_PEEK)) {
3115 if (sb_rcv->sb_mb != NULL) {
3116 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3117 } else {
3118 sb_rcv->sb_mb = nextrecord;
3119 SB_EMPTY_FIXUP(sb_rcv);
3120 }
0a7de745 3121 if (nextrecord == NULL) {
3e170ce0 3122 sb_rcv->sb_lastrecord = m;
0a7de745 3123 }
3124 }
3125
3126 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3127 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3128
3129 while (cm != NULL) {
3130 int cmsg_type;
3131
3132 cmn = cm->m_next;
3133 cm->m_next = NULL;
3134 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3135
3136 /*
3137 * Call the protocol to externalize SCM_RIGHTS message
3138 * and return the modified message to the caller upon
3139 * success. Otherwise, all other control messages are
3140 * returned unmodified to the caller. Note that we
3141 * only get into this loop if MSG_PEEK is not set.
3142 */
3143 if (pr->pr_domain->dom_externalize != NULL &&
3144 cmsg_type == SCM_RIGHTS) {
3145 /*
3146 * Release socket lock: see 3903171. This
3147 * would also allow more records to be appended
3148 * to the socket buffer. We still have SB_LOCK
3149 * set on it, so we can be sure that the head
3150 * of the mbuf chain won't change.
3151 */
3152 socket_unlock(so, 0);
3153 error = (*pr->pr_domain->dom_externalize)(cm);
3154 socket_lock(so, 0);
3155 } else {
3156 error = 0;
3157 }
3158
3159 if (controlp != NULL && error == 0) {
3160 *controlp = cm;
3161 controlp = &(*controlp)->m_next;
3162 } else {
3163 (void) m_free(cm);
3164 }
3165 cm = cmn;
3166 }
3167 /*
3168 * Update the value of nextrecord in case we received new
3169 * records when the socket was unlocked above for
3170 * externalizing SCM_RIGHTS.
3171 */
0a7de745 3172 if (m != NULL) {
3e170ce0 3173 nextrecord = sb_rcv->sb_mb->m_nextpkt;
0a7de745 3174 } else {
3e170ce0 3175 nextrecord = sb_rcv->sb_mb;
0a7de745 3176 }
3177
3178done:
3179 *mp = m;
3180 *nextrecordp = nextrecord;
3181
0a7de745 3182 return error;
3183}
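/*
 * Illustrative userspace sketch (editorial): the receiving end of the
 * dom_externalize step above.  A non-PEEK recvmsg(2) is what turns an
 * SCM_RIGHTS payload into real descriptors in the caller's table; the
 * usual pattern is a CMSG walk over the returned control data.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
recv_one_fd(int sock)
{
	char data[1], cbuf[CMSG_SPACE(sizeof(int))];
	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = cbuf, .msg_controllen = sizeof(cbuf),
	};
	struct cmsghdr *cm;
	int fd = -1;

	if (recvmsg(sock, &msg, 0) == -1) {
		return -1;
	}
	for (cm = CMSG_FIRSTHDR(&msg); cm != NULL; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
			memcpy(&fd, CMSG_DATA(cm), sizeof(fd));
			break;
		}
	}
	return fd;	/* -1 if no descriptor was attached */
}
#endif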
3184
3185/*
3186 * If we have less data than requested, block awaiting more
3187 * (subject to any timeout) if:
3188 * 1. the current count is less than the low water mark, or
3189 * 2. MSG_WAITALL is set, and it is possible to do the entire
 3190 * receive operation at once if we block (resid <= hiwat), and
 3191 * 3. MSG_DONTWAIT is not set.
3192 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3193 * we have to do the receive in sections, and thus risk returning
3194 * a short count if a timeout or signal occurs after we start.
3195 */
3196static boolean_t
3197so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3198{
3199 struct protosw *pr = so->so_proto;
3200
3201 /* No mbufs in the receive-queue? Wait! */
3202 if (m == NULL) {
3203 return true;
3204 }
3205
3206 /* Not enough data in the receive socket-buffer - we may have to wait */
3207 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(uio) &&
3208 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3209 /*
 3210		 * Application did set the low-water mark, so we should wait for
3211 * this data to be present.
3212 */
3213 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3214 return true;
3215 }
3216
3217 /*
3218 * Application wants all the data - so let's try to do the
3219 * receive-operation at once by waiting for everything to
3220 * be there.
3221 */
3222 if ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat) {
3223 return true;
3224 }
3225 }
3226
3227 return false;
3228}
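/*
 * Illustrative userspace sketch (editorial): the two wait conditions
 * in so_should_wait() map onto familiar socket knobs.  SO_RCVLOWAT
 * sets the low-water mark tested in condition 1, and MSG_WAITALL
 * requests condition 2 (block until the full request can be
 * satisfied, provided it fits below sb_hiwat).
 */
#if 0
#include <sys/socket.h>

static int
set_rcv_low_water(int fd, int bytes)
{
	return setsockopt(fd, SOL_SOCKET, SO_RCVLOWAT, &bytes, sizeof(bytes));
}

static ssize_t
recv_exactly(int fd, void *buf, size_t len)
{
	/* MSG_WAITALL: short reads only on EOF, signal, or error. */
	return recv(fd, buf, len, MSG_WAITALL);
}
#endif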
3229
3230/*
3231 * Implement receive operations on a socket.
3232 * We depend on the way that records are added to the sockbuf
3233 * by sbappend*. In particular, each record (mbufs linked through m_next)
3234 * must begin with an address if the protocol so specifies,
3235 * followed by an optional mbuf or mbufs containing ancillary data,
3236 * and then zero or more mbufs of data.
3237 * In order to avoid blocking network interrupts for the entire time here,
3238 * we splx() while doing the actual copy to user space.
3239 * Although the sockbuf is locked, new data may still be appended,
3240 * and thus we must maintain consistency of the sockbuf during that time.
3241 *
3242 * The caller may receive the data as a single mbuf chain by supplying
3243 * an mbuf **mp0 for use in returning the chain. The uio is then used
3244 * only for the count in uio_resid.
3245 *
3246 * Returns: 0 Success
3247 * ENOBUFS
3248 * ENOTCONN
3249 * EWOULDBLOCK
3250 * uiomove:EFAULT
3251 * sblock:EWOULDBLOCK
3252 * sblock:EINTR
3253 * sbwait:EBADF
3254 * sbwait:EINTR
3255 * sodelayed_copy:EFAULT
3256 * <pru_rcvoob>:EINVAL[TCP]
3257 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3258 * <pru_rcvoob>:???
3259 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3260 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3261 * <pr_domain->dom_externalize>:???
3262 *
3263 * Notes: Additional return values from calls through <pru_rcvoob> and
3264 * <pr_domain->dom_externalize> depend on protocols other than
3265 * TCP or AF_UNIX, which are documented above.
3266 */
3267int
3268soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3269 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1c79356b 3270{
3271 struct mbuf *m, **mp, *ml = NULL;
3272 struct mbuf *nextrecord, *free_list;
3273 int flags, error, offset;
3274 user_ssize_t len;
1c79356b 3275 struct protosw *pr = so->so_proto;
3e170ce0 3276 int moff, type = 0;
3277 user_ssize_t orig_resid = uio_resid(uio);
3278 user_ssize_t delayed_copy_len;
55e303ae 3279 int can_delay;
55e303ae 3280 struct proc *p = current_proc();
3e170ce0 3281 boolean_t en_tracing = FALSE;
1c79356b 3282
3283 /*
3284 * Sanity check on the length passed by caller as we are making 'int'
3285 * comparisons
3286 */
3287 if (orig_resid < 0 || orig_resid > INT_MAX) {
3288 return EINVAL;
3289 }
fe8ab488 3290
3291 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3292 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3293 so->so_rcv.sb_hiwat);
3294
91447636 3295 socket_lock(so, 1);
6d2010ae 3296 so_update_last_owner_locked(so, p);
39236c6e 3297 so_update_policy(so);
1c79356b 3298
91447636 3299#ifdef MORE_LOCKING_DEBUG
3300 if (so->so_usecount == 1) {
3301 panic("%s: so=%x no other reference on socket\n", __func__, so);
3302 /* NOTREACHED */
3303 }
91447636 3304#endif
1c79356b 3305 mp = mp0;
0a7de745 3306 if (psa != NULL) {
39236c6e 3307 *psa = NULL;
3308 }
3309 if (controlp != NULL) {
39236c6e 3310 *controlp = NULL;
3311 }
3312 if (flagsp != NULL) {
3313 flags = *flagsp & ~MSG_EOR;
3314 } else {
1c79356b 3315 flags = 0;
0a7de745 3316 }
3317
3318 /*
3319 * If a recv attempt is made on a previously-accepted socket
3320 * that has been marked as inactive (disconnected), reject
3321 * the request.
3322 */
3323 if (so->so_flags & SOF_DEFUNCT) {
3324 struct sockbuf *sb = &so->so_rcv;
3325
6d2010ae 3326 error = ENOTCONN;
3327 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3328 __func__, proc_pid(p), proc_best_name(p),
3329 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3330 SOCK_DOM(so), SOCK_TYPE(so), error);
3331 /*
3332 * This socket should have been disconnected and flushed
3333 * prior to being returned from sodefunct(); there should
3334 * be no data on its receive list, so panic otherwise.
2d21ac55 3335 */
0a7de745 3336 if (so->so_state & SS_DEFUNCT) {
6d2010ae 3337 sb_empty_assert(sb, __func__);
0a7de745 3338 }
2d21ac55 3339 socket_unlock(so, 1);
0a7de745 3340 return error;
3341 }
3342
3343 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3344 pr->pr_usrreqs->pru_preconnect) {
3345 /*
 3346		 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
 3347		 * call write() right after this. *If* the app calls a read
 3348		 * we do not want to block this read indefinitely. Thus,
3349 * we trigger a connect so that the session gets initiated.
3350 */
3351 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3352
3353 if (error) {
3354 socket_unlock(so, 1);
0a7de745 3355 return error;
3356 }
3357 }
3358
3359 if (ENTR_SHOULDTRACE &&
3360 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3361 /*
3362 * enable energy tracing for inet sockets that go over
3363 * non-loopback interfaces only.
3364 */
3365 struct inpcb *inp = sotoinpcb(so);
3366 if (inp->inp_last_outifp != NULL &&
3367 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3368 en_tracing = TRUE;
3369 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3370 VM_KERNEL_ADDRPERM(so),
3371 ((so->so_state & SS_NBIO) ?
3372 kEnTrFlagNonBlocking : 0),
3373 (int64_t)orig_resid);
3374 }
3375 }
3376
3377 /*
3378 * When SO_WANTOOBFLAG is set we try to get out-of-band data
 3379	 * regardless of the flags argument. Here is the case where
 3380	 * out-of-band data is not inline.
3381 */
3382 if ((flags & MSG_OOB) ||
3383 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3384 (so->so_options & SO_OOBINLINE) == 0 &&
3385 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1c79356b 3386 m = m_get(M_WAIT, MT_DATA);
55e303ae 3387 if (m == NULL) {
91447636 3388 socket_unlock(so, 1);
3389 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3390 ENOBUFS, 0, 0, 0, 0);
0a7de745 3391 return ENOBUFS;
55e303ae 3392 }
1c79356b 3393 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
0a7de745 3394 if (error) {
1c79356b 3395 goto bad;
0a7de745 3396 }
91447636 3397 socket_unlock(so, 0);
3398 do {
3399 error = uiomove(mtod(m, caddr_t),
b0d623f7 3400 imin(uio_resid(uio), m->m_len), uio);
1c79356b 3401 m = m_free(m);
39236c6e 3402 } while (uio_resid(uio) && error == 0 && m != NULL);
91447636 3403 socket_lock(so, 0);
1c79356b 3404bad:
0a7de745 3405 if (m != NULL) {
1c79356b 3406 m_freem(m);
0a7de745 3407 }
39236c6e 3408
3409 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3410 if (error == EWOULDBLOCK || error == EINVAL) {
2d21ac55 3411 /*
9bccf70c 3412 * Let's try to get normal data:
2d21ac55
A
3413 * EWOULDBLOCK: out-of-band data not
3414 * receive yet. EINVAL: out-of-band data
3415 * already read.
9bccf70c
A
3416 */
3417 error = 0;
3418 goto nooob;
39236c6e 3419 } else if (error == 0 && flagsp != NULL) {
9bccf70c 3420 *flagsp |= MSG_OOB;
3421 }
3422 }
91447636 3423 socket_unlock(so, 1);
3e170ce0
A
3424 if (en_tracing) {
3425 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3426 VM_KERNEL_ADDRPERM(so), 0,
3427 (int64_t)(orig_resid - uio_resid(uio)));
3428 }
2d21ac55
A
3429 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3430 0, 0, 0, 0);
39236c6e 3431
0a7de745 3432 return error;
1c79356b
A
3433 }
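
	/*
	 * Illustrative userspace sketch (not part of this file): with
	 * SO_OOBINLINE off, urgent data is fetched out-of-line through the
	 * pru_rcvoob path above; the EWOULDBLOCK/EINVAL fallback is what a
	 * caller observes as "no OOB byte available":
	 *
	 *	char oob;
	 *	if (recv(s, &oob, 1, MSG_OOB) < 0 && errno == EINVAL) {
	 *		// urgent byte already read; fall back to normal data
	 *	}
	 */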
nooob:
	if (mp != NULL) {
		*mp = NULL;
	}

	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	free_list = NULL;
	delayed_copy_len = 0;
restart:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
	}
#endif
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return 0;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		socket_unlock(so, 1);
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		return error;
	}

	m = so->so_rcv.sb_mb;
	if (so_should_wait(so, uio, m, flags)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL) {
				goto dontblock;
			}
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
#if CONTENT_FILTER
			/*
			 * Deal with half closed connections
			 */
			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
			    cfil_sock_data_pending(&so->so_rcv) != 0) {
				CFIL_LOG(LOG_INFO,
				    "so %llx ignore SS_CANTRCVMORE",
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
			} else
#endif /* CONTENT_FILTER */
			if (m != NULL) {
				goto dontblock;
			} else {
				goto release;
			}
		}
		for (; m != NULL; m = m->m_next) {
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio_resid(uio) == 0) {
			goto release;
		}

		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("Waiting for socket data\n");
		}
#endif

		/*
		 * Depending on the protocol (e.g. TCP), the following
		 * might cause the socket lock to be dropped and later
		 * be reacquired, and more data could have arrived and
		 * have been appended to the receive socket buffer by
		 * the time it returns. Therefore, we sleep in sbwait()
		 * below only if the wait condition is still true.
		 */
		if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}

		error = 0;
		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
			error = sbwait(&so->so_rcv);
		}

#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug) {
			printf("SORECEIVE - sbwait returned %d\n", error);
		}
#endif
		if (so->so_usecount < 1) {
			panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
			    __func__, so, so->so_usecount);
			/* NOTREACHED */
		}
		if (error) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
			    0, 0, 0, 0);
			if (en_tracing) {
				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
				    VM_KERNEL_ADDRPERM(so), 0,
				    (int64_t)(orig_resid - uio_resid(uio)));
			}
			return error;
		}
		goto restart;
	}
dontblock:
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
		    mp0 == NULL);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
		orig_resid = 0;
	}

	if (m != NULL) {
		if (!(flags & MSG_PEEK)) {
			/*
			 * We get here because m points to an mbuf following
			 * any MT_SONAME or MT_CONTROL mbufs which have been
			 * processed above. In any case, m should be pointing
			 * to the head of the mbuf chain, and the nextrecord
			 * should be either NULL or equal to m->m_nextpkt.
			 * See comments above about SB_LOCK.
			 */
			if (m != so->so_rcv.sb_mb ||
			    m->m_nextpkt != nextrecord) {
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
				/* NOTREACHED */
			}
			if (nextrecord == NULL) {
				so->so_rcv.sb_lastrecord = m;
			}
		}
		type = m->m_type;
		if (type == MT_OOBDATA) {
			flags |= MSG_OOB;
		}
	} else {
		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;

	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
		can_delay = 1;
	} else {
		can_delay = 0;
	}

	while (m != NULL &&
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA) {
				break;
			}
		} else if (type == MT_OOBDATA) {
			break;
		}

		if (m->m_type != MT_OOBDATA && m->m_type != MT_DATA &&
		    m->m_type != MT_HEADER) {
			break;
		}
		/*
		 * Make sure to always set MSG_OOB event when getting
		 * out of band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		if (so->so_oobmark && len > so->so_oobmark - offset) {
			len = so->so_oobmark - offset;
		}
		if (len > m->m_len - moff) {
			len = m->m_len - moff;
		}
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * Only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it
				 * worthwhile to drop and retake the lock;
				 * can_delay reflects the state of the two
				 * latter constraints, and moff should always
				 * be zero in these cases.
				 */
				delayed_copy_len += len;
			} else {
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);

					if (error) {
						goto release;
					}
					/*
					 * We can only get here if MSG_PEEK is
					 * not set; therefore, m should point
					 * at the head of the rcv queue. If it
					 * doesn't, something drastically
					 * changed while we were out from
					 * behind the lock in sodelayed_copy,
					 * perhaps a RST on the stream. In any
					 * event, the stream has been
					 * interrupted. It's probably best just
					 * to return whatever data we've moved
					 * and let the caller sort it out...
					 */
					if (m != so->so_rcv.sb_mb) {
						break;
					}
				}
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
				    (int)len, uio);
				socket_lock(so, 0);

				if (error) {
					goto release;
				}
			}
		} else {
			uio_setresid(uio, (uio_resid(uio) - len));
		}
		if (len == m->m_len - moff) {
			if (m->m_flags & M_EOR) {
				flags |= MSG_EOR;
			}
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;

				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					if (free_list == NULL) {
						free_list = m;
					} else {
						ml->m_next = m;
					}
					ml = m;
					so->so_rcv.sb_mb = m = m->m_next;
					ml->m_next = NULL;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL) {
						so->so_rcv.sb_lastrecord = m;
					}
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT) {
						copy_flag = M_DONTWAIT;
					} else {
						copy_flag = M_WAIT;
					}
					*mp = m_copym(m, 0, len, copy_flag);
					/*
					 * Failed to allocate an mbuf?
					 * Adjust uio_resid back, it was
					 * adjusted down by len bytes which
					 * we didn't copy over.
					 */
					if (*mp == NULL) {
						uio_setresid(uio,
						    (uio_resid(uio) + len));
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark) {
					break;
				}
			}
		}
		if (flags & MSG_EOR) {
			break;
		}
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error. Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
			    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
			    )) {
				goto release;
			}

			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns. Therefore, we only sleep in
			 * sbwait() below if the socket buffer is empty, in
			 * order to avoid a false sleep.
			 */
			if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
			}

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
				error = 0;
				goto release;
			}
			/*
			 * We have to wait until after we get back from the
			 * sbwait to do the copy, because we will drop the
			 * lock if we have enough data that has been delayed;
			 * by dropping the lock we open up a window allowing
			 * the netisr thread to process the incoming packets
			 * and to change the state of this socket. We're
			 * issuing the sbwait because the socket is empty and
			 * we're expecting the netisr thread to wake us up
			 * when more packets arrive; if we allowed that
			 * processing to happen and then did the sbwait, we
			 * could stall forever with packets sitting in the
			 * socket if no further packets arrive from the
			 * remote side.
			 *
			 * We want to copy before we've collected all the
			 * data to satisfy this request, to allow the copy
			 * to overlap the incoming packet processing on an
			 * MP system.
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);

				if (error) {
					goto release;
				}
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL) {
				nextrecord = m->m_nextpkt;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%p ref=%d on socket\n",
		    __func__, so, so->so_usecount);
		/* NOTREACHED */
	}
#endif

	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		} else {
			flags |= MSG_TRUNC;
			if ((flags & MSG_PEEK) == 0) {
				(void) sbdroprecord(&so->so_rcv);
			}
		}
	}

	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			so->so_rcv.sb_mb = nextrecord;
			/*
			 * First part is an inline SB_EMPTY_FIXUP(). Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL) {
				so->so_rcv.sb_lastrecord = nextrecord;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
		}
	}

	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		if (error) {
			goto release;
		}
	}
	if (free_list != NULL) {
		m_freem_list(free_list);
		free_list = NULL;
	}

	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
		goto restart;
	}

	if (flagsp != NULL) {
		*flagsp |= flags;
	}
release:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: release so=%p ref=%d on socket\n", __func__,
		    so, so->so_usecount);
		/* NOTREACHED */
	}
#endif
	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
	}

	if (free_list != NULL) {
		m_freem_list(free_list);
	}

	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - uio_resid(uio)));
	}
	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);

	return error;
}
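
/*
 * Illustrative userspace sketch (not part of this file): MSG_WAITALL maps
 * onto the inner wait loop of soreceive() above, so a stream read returns
 * short only on error, EOF, or a signal/timeout:
 *
 *	char buf[4096];
 *	ssize_t n = recv(s, buf, sizeof(buf), MSG_WAITALL);
 *	// n < (ssize_t)sizeof(buf) implies EOF, a signal, or an error
 */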

/*
 * Returns:	0			Success
 *		uiomove:EFAULT
 */
static int
sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
    user_ssize_t *resid)
{
	int error = 0;
	struct mbuf *m;

	m = *free_list;

	socket_unlock(so, 0);

	while (m != NULL && error == 0) {
		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
		m = m->m_next;
	}
	m_freem_list(*free_list);

	*free_list = NULL;
	*resid = 0;

	socket_lock(so, 0);

	return error;
}
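
/*
 * sodelayed_copy() is the tail end of soreceive()'s deferred-copy scheme:
 * whole mbufs are unlinked from the receive buffer while the socket lock
 * is held and chained onto free_list, and only once enough bytes have
 * accumulated (see the sorecvmincopy and sb_hiwat/2 checks in soreceive()
 * above) is the lock dropped for one batched uiomove() pass, letting
 * inbound packet processing overlap the copyout.
 */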

static int
sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
    u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
{
#pragma unused(so)
	int error = 0;
	struct mbuf *ml, *m;
	int i = 0;
	struct uio *auio;

	for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
	    ml = ml->m_nextpkt, i++) {
		auio = msgarray[i].uio;
		for (m = ml; m != NULL; m = m->m_next) {
			error = uiomove(mtod(m, caddr_t), m->m_len, auio);
			if (error != 0) {
				goto out;
			}
		}
	}
out:
	m_freem_list(*free_list);

	*free_list = NULL;
	*resid = 0;

	return error;
}

int
soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
    int *flagsp)
{
	struct mbuf *m;
	struct mbuf *nextrecord;
	struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
	int error;
	user_ssize_t len, pktlen, delayed_copy_len = 0;
	struct protosw *pr = so->so_proto;
	user_ssize_t resid;
	struct proc *p = current_proc();
	struct uio *auio = NULL;
	int npkts = 0;
	int sblocked = 0;
	struct sockaddr **psa = NULL;
	struct mbuf **controlp = NULL;
	int can_delay;
	int flags;
	struct mbuf *free_others = NULL;

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
	    so, uiocnt,
	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);

	/*
	 * Sanity checks:
	 * - Only supports don't-wait flags
	 * - Only supports datagram sockets (could be extended to raw)
	 * - Must be atomic
	 * - Protocol must support packet chains
	 * - The uio array must not be NULL (should we panic?)
	 */
	if (flagsp != NULL) {
		flags = *flagsp;
	} else {
		flags = 0;
	}
	if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
	    MSG_NBIO)) {
		printf("%s invalid flags 0x%x\n", __func__, flags);
		error = EINVAL;
		goto out;
	}
	if (so->so_type != SOCK_DGRAM) {
		error = EINVAL;
		goto out;
	}
	if (sosendallatonce(so) == 0) {
		error = EINVAL;
		goto out;
	}
	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (msgarray == NULL) {
		printf("%s uioarray is NULL\n", __func__);
		error = EINVAL;
		goto out;
	}
	if (uiocnt == 0) {
		printf("%s uiocnt is 0\n", __func__);
		error = EINVAL;
		goto out;
	}
	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	resid = recv_msg_array_resid(msgarray, uiocnt);
	if (resid < 0 || resid > INT_MAX) {
		error = EINVAL;
		goto out;
	}

	if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
		can_delay = 1;
	} else {
		can_delay = 0;
	}

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT) {
			sb_empty_assert(sb, __func__);
		}
		goto release;
	}

next:
	/*
	 * The uio may be empty
	 */
	if (npkts >= uiocnt) {
		error = 0;
		goto release;
	}
restart:
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller. This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed. The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE)) {
		error = 0;
		goto release;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		goto release;
	}
	sblocked = 1;

	m = so->so_rcv.sb_mb;
	/*
	 * Block awaiting more datagrams if needed
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0) {
				so->so_error = 0;
			}
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}
		if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT | MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		/*
		 * Do not block if we got some data
		 */
		if (free_list != NULL) {
			error = 0;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
		sblocked = 0;

		error = sbwait(&so->so_rcv);
		if (error) {
			goto release;
		}
		goto restart;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");

	/*
	 * Consume the current uio index as we have a datagram
	 */
	auio = msgarray[npkts].uio;
	resid = uio_resid(auio);
	msgarray[npkts].which |= SOCK_MSG_DATA;
	psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
	    &msgarray[npkts].psa : NULL;
	controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
	    &msgarray[npkts].controlp : NULL;
	npkts += 1;
	nextrecord = m->m_nextpkt;

	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
		if (error == ERESTART) {
			goto restart;
		} else if (error != 0) {
			goto release;
		}
	}

	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0) {
			goto release;
		}
	}

	if (m->m_pkthdr.len == 0) {
		printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
		    __func__, __LINE__,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
		    m->m_type);
	}

	/*
	 * Loop to copy the mbufs of the current record
	 * Support zero length packets
	 */
	ml = NULL;
	pktlen = 0;
	while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
		if (m->m_len == 0) {
			panic("%p m_len zero", m);
		}
		if (m->m_type == 0) {
			panic("%p m_type zero", m);
		}
		/*
		 * Clip to the residual length
		 */
		if (len > m->m_len) {
			len = m->m_len;
		}
		pktlen += len;
		/*
		 * Copy the mbufs via the uio or delay the copy
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (len > 0 && can_delay == 0) {
			socket_unlock(so, 0);
			error = uiomove(mtod(m, caddr_t), (int)len, auio);
			socket_lock(so, 0);
			if (error) {
				goto release;
			}
		} else {
			delayed_copy_len += len;
		}

		if (len == m->m_len) {
			/*
			 * m was entirely copied
			 */
			sbfree(&so->so_rcv, m);
			nextrecord = m->m_nextpkt;
			m->m_nextpkt = NULL;

			/*
			 * Set the first packet to the head of the free list
			 */
			if (free_list == NULL) {
				free_list = m;
			}
			/*
			 * Link current packet to tail of free list
			 */
			if (ml == NULL) {
				if (free_tail != NULL) {
					free_tail->m_nextpkt = m;
				}
				free_tail = m;
			}
			/*
			 * Link current mbuf to last mbuf of current packet
			 */
			if (ml != NULL) {
				ml->m_next = m;
			}
			ml = m;

			/*
			 * Move next buf to head of socket buffer
			 */
			so->so_rcv.sb_mb = m = ml->m_next;
			ml->m_next = NULL;

			if (m != NULL) {
				m->m_nextpkt = nextrecord;
				if (nextrecord == NULL) {
					so->so_rcv.sb_lastrecord = m;
				}
			} else {
				so->so_rcv.sb_mb = nextrecord;
				SB_EMPTY_FIXUP(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
		} else {
			/*
			 * Stop the loop on partial copy
			 */
			break;
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%llx ref=%d on socket\n",
		    __func__,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
		/* NOTREACHED */
	}
#endif
	/*
	 * Tell the caller we made a partial copy
	 */
	if (m != NULL) {
		if (so->so_options & SO_DONTTRUNC) {
			/*
			 * Copyout first the freelist then the partial mbuf
			 */
			socket_unlock(so, 0);
			if (delayed_copy_len) {
				error = sodelayed_copy_list(so, msgarray,
				    uiocnt, &free_list, &delayed_copy_len);
			}

			if (error == 0) {
				error = uiomove(mtod(m, caddr_t), (int)len,
				    auio);
			}
			socket_lock(so, 0);
			if (error) {
				goto release;
			}

			m->m_data += len;
			m->m_len -= len;
			so->so_rcv.sb_cc -= len;
			flags |= MSG_RCVMORE;
		} else {
			(void) sbdroprecord(&so->so_rcv);
			nextrecord = so->so_rcv.sb_mb;
			m = NULL;
			flags |= MSG_TRUNC;
		}
	}

	if (m == NULL) {
		so->so_rcv.sb_mb = nextrecord;
		/*
		 * First part is an inline SB_EMPTY_FIXUP(). Second
		 * part makes sure sb_lastrecord is up-to-date if
		 * there is still data in the socket buffer.
		 */
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_mbtail = NULL;
			so->so_rcv.sb_lastrecord = NULL;
		} else if (nextrecord->m_nextpkt == NULL) {
			so->so_rcv.sb_lastrecord = nextrecord;
		}
		SB_MB_CHECK(&so->so_rcv);
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");

	/*
	 * We can continue to the next packet as long as:
	 * - We haven't exhausted the uio array
	 * - There was no error
	 * - A packet was not truncated
	 * - We can still receive more data
	 */
	if (npkts < uiocnt && error == 0 &&
	    (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
		sblocked = 0;

		goto next;
	}
	if (flagsp != NULL) {
		*flagsp |= flags;
	}

release:
	/*
	 * pru_rcvd may cause more data to be received if the socket lock
	 * is dropped, so we set MSG_HAVEMORE now based on what we know.
	 * That way the caller won't be surprised if it receives less data
	 * than requested.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
		flags |= MSG_HAVEMORE;
	}

	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
		(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}

	if (sblocked) {
		sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
	} else {
		socket_unlock(so, 1);
	}

	if (delayed_copy_len) {
		error = sodelayed_copy_list(so, msgarray, uiocnt,
		    &free_list, &delayed_copy_len);
	}
out:
	/*
	 * Amortize the cost of freeing the mbufs
	 */
	if (free_list != NULL) {
		m_freem_list(free_list);
	}
	if (free_others != NULL) {
		m_freem_list(free_others);
	}

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
	    0, 0, 0, 0);
	return error;
}
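
/*
 * Illustrative userspace sketch (not part of this file): soreceive_list()
 * backs the private Darwin recvmsg_x(2) batch-receive call; the msghdr_x
 * layout shown here is an assumption, see <sys/socket.h> for the
 * authoritative definition:
 *
 *	struct msghdr_x msgs[8] = {};
 *	struct iovec iovs[8];
 *	for (int i = 0; i < 8; i++) {
 *		iovs[i] = (struct iovec){ .iov_base = bufs[i],
 *		    .iov_len = sizeof(bufs[i]) };
 *		msgs[i].msg_iov = &iovs[i];
 *		msgs[i].msg_iovlen = 1;
 *	}
 *	ssize_t n = recvmsg_x(s, msgs, 8, MSG_DONTWAIT);
 *	// n is the count of datagrams received, one per msghdr_x
 */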

static int
so_statistics_event_to_nstat_event(int64_t *input_options,
    uint64_t *nstat_event)
{
	int error = 0;
	switch (*input_options) {
	case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
		*nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
		break;
	case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
		*nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
		break;
#if (DEBUG || DEVELOPMENT)
	case SO_STATISTICS_EVENT_RESERVED_1:
		*nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
		break;
	case SO_STATISTICS_EVENT_RESERVED_2:
		*nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
		break;
#endif /* (DEBUG || DEVELOPMENT) */
	default:
		error = EINVAL;
		break;
	}
	return error;
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		ENOTCONN
 *	<pru_shutdown>:EINVAL
 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:EHOSTUNREACH[TCP]
 *	<pru_shutdown>:ENETUNREACH[TCP]
 *	<pru_shutdown>:ENETDOWN[TCP]
 *	<pru_shutdown>:ENOMEM[TCP]
 *	<pru_shutdown>:EACCES[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:???[TCP]	[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_shutdown>:???	[other protocol families]
 */
int
soshutdown(struct socket *so, int how)
{
	int error;

	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);

	switch (how) {
	case SHUT_RD:
	case SHUT_WR:
	case SHUT_RDWR:
		socket_lock(so, 1);
		if ((so->so_state &
		    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
			error = ENOTCONN;
		} else {
			error = soshutdownlock(so, how);
		}
		socket_unlock(so, 1);
		break;
	default:
		error = EINVAL;
		break;
	}

	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);

	return error;
}
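
/*
 * Illustrative userspace sketch (not part of this file): only sockets in
 * a connected, connecting, or disconnecting state pass the check in
 * soshutdown() above:
 *
 *	if (shutdown(s, SHUT_WR) == 0) {
 *		// FIN queued; reads remain valid until the peer closes
 *	} else if (errno == ENOTCONN) {
 *		// socket was not connected, or already fully shut down
 *	}
 */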

int
soshutdownlock_final(struct socket *so, int how)
{
	struct protosw *pr = so->so_proto;
	int error = 0;

	sflt_notify(so, sock_evt_shutdown, &how);

	if (how != SHUT_WR) {
		if ((so->so_state & SS_CANTRCVMORE) != 0) {
			/* read already shut down */
			error = ENOTCONN;
			goto done;
		}
		sorflush(so);
	}
	if (how != SHUT_RD) {
		if ((so->so_state & SS_CANTSENDMORE) != 0) {
			/* write already shut down */
			error = ENOTCONN;
			goto done;
		}
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
	}
done:
	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
	return error;
}

int
soshutdownlock(struct socket *so, int how)
{
	int error = 0;

#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		error = cfil_sock_shutdown(so, &how);
		if (error == EJUSTRETURN) {
			error = 0;
			goto done;
		} else if (error != 0) {
			goto done;
		}
	}
#endif /* CONTENT_FILTER */

	error = soshutdownlock_final(so, how);

done:
	return error;
}

void
sowflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_snd;

	/*
	 * Obtain lock on the socket buffer (SB_LOCK). This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags |= SB_DROP;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;

	sbunlock(sb, TRUE);	/* keep socket locked */

	selthreadclear(&sb->sb_sel);
	sbrelease(sb);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	sflt_notify(so, sock_evt_flush_read, NULL);

	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK). This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function. In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof(asb));
	asb.sb_cc = sb->sb_cc;
	asb.sb_hiwat = sb->sb_hiwat;
	asb.sb_mbcnt = sb->sb_mbcnt;
	asb.sb_mbmax = sb->sb_mbmax;
	asb.sb_ctl = sb->sb_ctl;
	asb.sb_lowat = sb->sb_lowat;
	asb.sb_mb = sb->sb_mb;
	asb.sb_mbtail = sb->sb_mbtail;
	asb.sb_lastrecord = sb->sb_lastrecord;
	asb.sb_so = sb->sb_so;
	asb.sb_flags = sb->sb_flags;
	asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
	asb.sb_flags |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc = 0;
	sb->sb_hiwat = 0;
	sb->sb_mbcnt = 0;
	sb->sb_mbmax = 0;
	sb->sb_ctl = 0;
	sb->sb_lowat = 0;
	sb->sb_mb = NULL;
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	sb->sb_timeo.tv_sec = 0;
	sb->sb_timeo.tv_usec = 0;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
	sb->sb_flags |= SB_DROP;

	sbunlock(sb, TRUE);	/* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented. Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
	}

	sbrelease(&asb);
}

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Returns:	0			Success
 *		EINVAL
 *	copyin:EFAULT
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL. On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen) {
		return EINVAL;
	}
	if (valsize > len) {
		sopt->sopt_valsize = valsize = len;
	}

	if (sopt->sopt_p != kernproc) {
		return copyin(sopt->sopt_val, buf, valsize);
	}

	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
	return 0;
}
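
/*
 * Illustrative sketch (not part of this file): a protocol-level
 * pr_ctloutput() handler typically consumes a fixed-size integer option
 * through sooptcopyin() exactly as the socket-level cases below do; the
 * "example" names are hypothetical:
 *
 *	static int
 *	example_ctloutput(struct socket *so, struct sockopt *sopt)
 *	{
 *		int optval, error;
 *
 *		error = sooptcopyin(sopt, &optval, sizeof(optval),
 *		    sizeof(optval));
 *		if (error != 0) {
 *			return error;
 *		}
 *		// validate optval, then update the pcb under the lock
 *		return 0;
 *	}
 */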

/*
 * sooptcopyin_timeval
 *	Copy in a timeval value into tv_p, and take into account whether
 *	the calling process is 64-bit or 32-bit. Moved the sanity checking
 *	code here so that we can verify the 64-bit tv_sec value before we
 *	lose the top 32 bits when assigning tv64.tv_sec to tv_p->tv_sec.
 */
static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
{
	int error;

	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval tv64;

		if (sopt->sopt_valsize < sizeof(tv64)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
			if (error != 0) {
				return error;
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
			    sizeof(tv64));
		}
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
			return EDOM;
		}

		tv_p->tv_sec = tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		struct user32_timeval tv32;

		if (sopt->sopt_valsize < sizeof(tv32)) {
			return EINVAL;
		}

		sopt->sopt_valsize = sizeof(tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
			if (error != 0) {
				return error;
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
			    sizeof(tv32));
		}
#ifndef __LP64__
		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
			return EDOM;
		}
#endif
		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
	return 0;
}
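
/*
 * Illustrative userspace sketch (not part of this file): SO_SNDTIMEO and
 * SO_RCVTIMEO values are parsed by sooptcopyin_timeval() above, which is
 * why an out-of-range microseconds field fails with EDOM:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *	// a tv_usec outside [0, 1000000) would be rejected with EDOM
 */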

int
soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
    boolean_t ignore_delegate)
{
	kauth_cred_t cred = NULL;
	proc_t ep = PROC_NULL;
	uid_t uid;
	int error = 0;

	if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
		ep = proc_find(so->e_pid);
		if (ep) {
			cred = kauth_cred_proc_ref(ep);
		}
	}

	uid = kauth_cred_getuid(cred ? cred : so->so_cred);

	/* uid is 0 for root */
	if (uid != 0 || !allow_root) {
		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
	}
	if (cred) {
		kauth_cred_unref(&cred);
	}
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		ENOPROTOOPT
 *		ENOBUFS
 *		EDOM
 *	sooptcopyin:EINVAL
 *	sooptcopyin:EFAULT
 *	sooptcopyin_timeval:EINVAL
 *	sooptcopyin_timeval:EFAULT
 *	sooptcopyin_timeval:EDOM
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	sflt_attach_private:???	[whatever a filter author chooses]
 *	<sf_setoption>:???	[whatever a filter author chooses]
 *
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family;
 *		all <sf_setoption> returns depend on what the filter author
 *		causes their filter to return.
 */
int
sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	int64_t long_optval;
	struct linger l;
	struct timeval tv;

	if (sopt->sopt_dir != SOPT_SET) {
		sopt->sopt_dir = SOPT_SET;
	}

	if (dolock) {
		socket_lock(so, 1);
	}

	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
		/* the socket has been shutdown, no more sockopt's */
		error = EINVAL;
		goto out;
	}

	error = sflt_setsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed. A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine. Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0) {
			goto out;
		}

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
			if (error != 0) {
				goto out;
			}

			so->so_linger = (sopt->sopt_name == SO_LINGER) ?
			    l.l_linger : l.l_linger * hz;
			if (l.l_onoff != 0) {
				so->so_options |= SO_LINGER;
			} else {
				so->so_options &= ~SO_LINGER;
			}
			break;
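
			/*
			 * Illustrative userspace sketch (not part of this
			 * file): both options carry a struct linger; per the
			 * assignment above, SO_LINGER stores l_linger as-is
			 * while SO_LINGER_SEC scales it by hz:
			 *
			 *	struct linger lg = { .l_onoff = 1,
			 *	    .l_linger = 5 };
			 *	setsockopt(s, SOL_SOCKET, SO_LINGER_SEC,
			 *	    &lg, sizeof(lg));
			 */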

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}
			if (optval) {
				so->so_options |= sopt->sopt_name;
			} else {
				so->so_options &= ~sopt->sopt_name;
			}
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0) {
				goto out;
			}

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto out;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF: {
				struct sockbuf *sb =
				    (sopt->sopt_name == SO_SNDBUF) ?
				    &so->so_snd : &so->so_rcv;
				if (sbreserve(sb, (u_int32_t)optval) == 0) {
					error = ENOBUFS;
					goto out;
				}
				sb->sb_flags |= SB_USRSIZE;
				sb->sb_flags &= ~SB_AUTOSIZE;
				sb->sb_idealsize = (u_int32_t)optval;
				break;
			}
			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT: {
				int space = sbspace(&so->so_snd);
				u_int32_t hiwat = so->so_snd.sb_hiwat;

				if (so->so_snd.sb_flags & SB_UNIX) {
					struct unpcb *unp =
					    (struct unpcb *)(so->so_pcb);
					if (unp != NULL &&
					    unp->unp_conn != NULL) {
						hiwat += unp->unp_conn->unp_cc;
					}
				}

				so->so_snd.sb_lowat =
				    (optval > hiwat) ?
				    hiwat : optval;

				if (space >= so->so_snd.sb_lowat) {
					sowwakeup(so);
				}
				break;
			}
			case SO_RCVLOWAT: {
				int64_t data_len;
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				data_len = so->so_rcv.sb_cc
				    - so->so_rcv.sb_ctl;
				if (data_len >= so->so_rcv.sb_lowat) {
					sorwakeup(so);
				}
				break;
			}
			}
			break;
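
			/*
			 * Illustrative userspace sketch (not part of this
			 * file): an explicit SO_RCVBUF pins the buffer size
			 * (SB_USRSIZE) and disables auto-tuning
			 * (SB_AUTOSIZE), per the code above:
			 *
			 *	int sz = 256 * 1024;
			 *	setsockopt(s, SOL_SOCKET, SO_RCVBUF,
			 *	    &sz, sizeof(sz));	// ENOBUFS if too big
			 */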
5127
5128 case SO_SNDTIMEO:
5129 case SO_RCVTIMEO:
2d21ac55 5130 error = sooptcopyin_timeval(sopt, &tv);
0a7de745 5131 if (error != 0) {
39236c6e 5132 goto out;
0a7de745 5133 }
1c79356b 5134
1c79356b
A
5135 switch (sopt->sopt_name) {
5136 case SO_SNDTIMEO:
91447636 5137 so->so_snd.sb_timeo = tv;
1c79356b
A
5138 break;
5139 case SO_RCVTIMEO:
91447636 5140 so->so_rcv.sb_timeo = tv;
1c79356b
A
5141 break;
5142 }
5143 break;
5144
39236c6e 5145 case SO_NKE: {
9bccf70c 5146 struct so_nke nke;
1c79356b 5147
0a7de745
A
5148 error = sooptcopyin(sopt, &nke, sizeof(nke),
5149 sizeof(nke));
5150 if (error != 0) {
39236c6e 5151 goto out;
0a7de745 5152 }
1c79356b 5153
6d2010ae 5154 error = sflt_attach_internal(so, nke.nke_handle);
1c79356b
A
5155 break;
5156 }
5157
9bccf70c 5158 case SO_NOSIGPIPE:
0a7de745
A
5159 error = sooptcopyin(sopt, &optval, sizeof(optval),
5160 sizeof(optval));
5161 if (error != 0) {
39236c6e 5162 goto out;
0a7de745
A
5163 }
5164 if (optval != 0) {
2d21ac55 5165 so->so_flags |= SOF_NOSIGPIPE;
0a7de745 5166 } else {
2d21ac55 5167 so->so_flags &= ~SOF_NOSIGPIPE;
0a7de745 5168 }
9bccf70c
A
5169 break;
5170
55e303ae 5171 case SO_NOADDRERR:
0a7de745
A
5172 error = sooptcopyin(sopt, &optval, sizeof(optval),
5173 sizeof(optval));
5174 if (error != 0) {
39236c6e 5175 goto out;
0a7de745
A
5176 }
5177 if (optval != 0) {
2d21ac55 5178 so->so_flags |= SOF_NOADDRAVAIL;
0a7de745 5179 } else {
2d21ac55 5180 so->so_flags &= ~SOF_NOADDRAVAIL;
0a7de745 5181 }
2d21ac55
A
5182 break;
5183
5184 case SO_REUSESHAREUID:
0a7de745
A
5185 error = sooptcopyin(sopt, &optval, sizeof(optval),
5186 sizeof(optval));
5187 if (error != 0) {
39236c6e 5188 goto out;
0a7de745
A
5189 }
5190 if (optval != 0) {
2d21ac55 5191 so->so_flags |= SOF_REUSESHAREUID;
0a7de745 5192 } else {
2d21ac55 5193 so->so_flags &= ~SOF_REUSESHAREUID;
0a7de745 5194 }
2d21ac55 5195 break;
39236c6e 5196
2d21ac55
A
5197 case SO_NOTIFYCONFLICT:
5198 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5199 error = EPERM;
39236c6e 5200 goto out;
2d21ac55 5201 }
0a7de745
A
5202 error = sooptcopyin(sopt, &optval, sizeof(optval),
5203 sizeof(optval));
5204 if (error != 0) {
39236c6e 5205 goto out;
0a7de745
A
5206 }
5207 if (optval != 0) {
2d21ac55 5208 so->so_flags |= SOF_NOTIFYCONFLICT;
0a7de745 5209 } else {
2d21ac55 5210 so->so_flags &= ~SOF_NOTIFYCONFLICT;
0a7de745 5211 }
2d21ac55 5212 break;
39236c6e 5213
2d21ac55 5214 case SO_RESTRICTIONS:
0a7de745
A
5215 error = sooptcopyin(sopt, &optval, sizeof(optval),
5216 sizeof(optval));
5217 if (error != 0) {
39236c6e 5218 goto out;
0a7de745 5219 }
39236c6e
A
5220
5221 error = so_set_restrictions(so, optval);
2d21ac55
A
5222 break;
5223
fe8ab488
A
5224 case SO_AWDL_UNRESTRICTED:
5225 if (SOCK_DOM(so) != PF_INET &&
5226 SOCK_DOM(so) != PF_INET6) {
5227 error = EOPNOTSUPP;
5228 goto out;
5229 }
5230 error = sooptcopyin(sopt, &optval, sizeof(optval),
5231 sizeof(optval));
0a7de745 5232 if (error != 0) {
fe8ab488 5233 goto out;
0a7de745 5234 }
fe8ab488 5235 if (optval != 0) {
39037602 5236 error = soopt_cred_check(so,
cb323159 5237 PRIV_NET_RESTRICTED_AWDL, false, false);
0a7de745 5238 if (error == 0) {
fe8ab488 5239 inp_set_awdl_unrestricted(
0a7de745
A
5240 sotoinpcb(so));
5241 }
5242 } else {
fe8ab488 5243 inp_clear_awdl_unrestricted(sotoinpcb(so));
0a7de745 5244 }
fe8ab488 5245 break;
39037602
A
5246 case SO_INTCOPROC_ALLOW:
5247 if (SOCK_DOM(so) != PF_INET6) {
5248 error = EOPNOTSUPP;
5249 goto out;
5250 }
5251 error = sooptcopyin(sopt, &optval, sizeof(optval),
5252 sizeof(optval));
0a7de745 5253 if (error != 0) {
39037602 5254 goto out;
0a7de745 5255 }
743345f9 5256 if (optval != 0 &&
0a7de745 5257 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
39037602 5258 error = soopt_cred_check(so,
cb323159 5259 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
0a7de745 5260 if (error == 0) {
39037602 5261 inp_set_intcoproc_allowed(
0a7de745
A
5262 sotoinpcb(so));
5263 }
5264 } else if (optval == 0) {
39037602 5265 inp_clear_intcoproc_allowed(sotoinpcb(so));
0a7de745 5266 }
39037602 5267 break;
fe8ab488 5268
2d21ac55 5269 case SO_LABEL:
2d21ac55 5270 error = EOPNOTSUPP;
55e303ae
A
5271 break;
5272
4a3eedf9 5273 case SO_UPCALLCLOSEWAIT:
0a7de745
A
5274 error = sooptcopyin(sopt, &optval, sizeof(optval),
5275 sizeof(optval));
5276 if (error != 0) {
39236c6e 5277 goto out;
0a7de745
A
5278 }
5279 if (optval != 0) {
4a3eedf9 5280 so->so_flags |= SOF_UPCALLCLOSEWAIT;
0a7de745 5281 } else {
4a3eedf9 5282 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
0a7de745 5283 }
4a3eedf9 5284 break;
4a3eedf9 5285
b0d623f7 5286 case SO_RANDOMPORT:
0a7de745
A
5287 error = sooptcopyin(sopt, &optval, sizeof(optval),
5288 sizeof(optval));
5289 if (error != 0) {
39236c6e 5290 goto out;
0a7de745
A
5291 }
5292 if (optval != 0) {
b0d623f7 5293 so->so_flags |= SOF_BINDRANDOMPORT;
0a7de745 5294 } else {
b0d623f7 5295 so->so_flags &= ~SOF_BINDRANDOMPORT;
0a7de745 5296 }
b0d623f7
A
5297 break;
5298
5299 case SO_NP_EXTENSIONS: {
5300 struct so_np_extensions sonpx;
5301
0a7de745
A
5302 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5303 sizeof(sonpx));
5304 if (error != 0) {
39236c6e 5305 goto out;
0a7de745 5306 }
b0d623f7
A
5307 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5308 error = EINVAL;
39236c6e 5309 goto out;
b0d623f7
A
5310 }
5311 /*
5312 * Only one bit defined for now
5313 */
5314 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
0a7de745 5315 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
b0d623f7 5316 so->so_flags |= SOF_NPX_SETOPTSHUT;
0a7de745 5317 } else {
b0d623f7 5318 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
0a7de745 5319 }
b0d623f7
A
5320 }
5321 break;
5322 }
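npx_mask selects which extension bits the call may touch and npx_flags supplies their new values; SONPX_SETOPTSHUT, the only bit defined, permits setsockopt() calls after shutdown. A sketch assuming the caller can use this Darwin-private option (allow_setopt_after_shutdown is hypothetical):

#include <sys/socket.h>

static int
allow_setopt_after_shutdown(int fd)
{
	struct so_np_extensions sonpx;

	sonpx.npx_flags = SONPX_SETOPTSHUT;	/* new value for the bit */
	sonpx.npx_mask = SONPX_SETOPTSHUT;	/* which bit may change */
	return setsockopt(fd, SOL_SOCKET, SO_NP_EXTENSIONS,
	    &sonpx, sizeof(sonpx));
}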
5323
d41d1dae 5324 case SO_TRAFFIC_CLASS: {
0a7de745
A
5325 error = sooptcopyin(sopt, &optval, sizeof(optval),
5326 sizeof(optval));
5327 if (error != 0) {
39236c6e 5328 goto out;
0a7de745 5329 }
39037602
A
5330 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5331 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5332 error = so_set_net_service_type(so, netsvc);
5333 goto out;
5334 }
6d2010ae 5335 error = so_set_traffic_class(so, optval);
0a7de745 5336 if (error != 0) {
39236c6e 5337 goto out;
0a7de745 5338 }
39037602
A
5339 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5340 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
6d2010ae 5341 break;
d41d1dae 5342 }
6d2010ae
A
5343
5344 case SO_RECV_TRAFFIC_CLASS: {
0a7de745
A
5345 error = sooptcopyin(sopt, &optval, sizeof(optval),
5346 sizeof(optval));
5347 if (error != 0) {
39236c6e 5348 goto out;
0a7de745
A
5349 }
5350 if (optval == 0) {
6d2010ae 5351 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
0a7de745 5352 } else {
6d2010ae 5353 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
0a7de745 5354 }
6d2010ae
A
5355 break;
5356 }
316670eb 5357
39037602 5358#if (DEVELOPMENT || DEBUG)
6d2010ae
A
5359 case SO_TRAFFIC_CLASS_DBG: {
5360 struct so_tcdbg so_tcdbg;
316670eb
A
5361
5362 error = sooptcopyin(sopt, &so_tcdbg,
0a7de745
A
5363 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5364 if (error != 0) {
39236c6e 5365 goto out;
0a7de745 5366 }
6d2010ae 5367 error = so_set_tcdbg(so, &so_tcdbg);
0a7de745 5368 if (error != 0) {
39236c6e 5369 goto out;
0a7de745 5370 }
6d2010ae
A
5371 break;
5372 }
39037602 5373#endif /* (DEVELOPMENT || DEBUG) */
316670eb
A
5374
5375 case SO_PRIVILEGED_TRAFFIC_CLASS:
5376 error = priv_check_cred(kauth_cred_get(),
5377 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
0a7de745 5378 if (error != 0) {
39236c6e 5379 goto out;
0a7de745
A
5380 }
5381 error = sooptcopyin(sopt, &optval, sizeof(optval),
5382 sizeof(optval));
5383 if (error != 0) {
39236c6e 5384 goto out;
0a7de745
A
5385 }
5386 if (optval == 0) {
316670eb 5387 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
0a7de745 5388 } else {
316670eb 5389 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
0a7de745 5390 }
316670eb
A
5391 break;
5392
a39ff7e2
A
5393#if (DEVELOPMENT || DEBUG)
5394 case SO_DEFUNCTIT:
5395 error = sosetdefunct(current_proc(), so, 0, FALSE);
0a7de745 5396 if (error == 0) {
a39ff7e2 5397 error = sodefunct(current_proc(), so, 0);
0a7de745 5398 }
a39ff7e2
A
5399
5400 break;
5401#endif /* (DEVELOPMENT || DEBUG) */
5402
6d2010ae 5403 case SO_DEFUNCTOK:
0a7de745
A
5404 error = sooptcopyin(sopt, &optval, sizeof(optval),
5405 sizeof(optval));
6d2010ae 5406 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
0a7de745 5407 if (error == 0) {
6d2010ae 5408 error = EBADF;
0a7de745 5409 }
39236c6e 5410 goto out;
6d2010ae
A
5411 }
5412 /*
5413 * Any process can set SO_DEFUNCTOK (clear
5414 * SOF_NODEFUNCT), but only root can clear
5415 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5416 */
5417 if (optval == 0 &&
5418 kauth_cred_issuser(kauth_cred_get()) == 0) {
5419 error = EPERM;
39236c6e 5420 goto out;
6d2010ae 5421 }
0a7de745 5422 if (optval) {
6d2010ae 5423 so->so_flags &= ~SOF_NODEFUNCT;
0a7de745 5424 } else {
6d2010ae 5425 so->so_flags |= SOF_NODEFUNCT;
0a7de745 5426 }
6d2010ae 5427
39236c6e
A
5428 if (SOCK_DOM(so) == PF_INET ||
5429 SOCK_DOM(so) == PF_INET6) {
5430 char s[MAX_IPv6_STR_LEN];
5431 char d[MAX_IPv6_STR_LEN];
5432 struct inpcb *inp = sotoinpcb(so);
5433
39037602
A
5434 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5435 "[%s %s:%d -> %s:%d] is now marked "
5436 "as %seligible for "
39236c6e 5437 "defunct\n", __func__, proc_selfpid(),
39037602 5438 proc_best_name(current_proc()),
3e170ce0 5439 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39236c6e
A
5440 (SOCK_TYPE(so) == SOCK_STREAM) ?
5441 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5442 ((SOCK_DOM(so) == PF_INET) ?
5443 (void *)&inp->inp_laddr.s_addr :
0a7de745 5444 (void *)&inp->in6p_laddr), s, sizeof(s)),
39236c6e
A
5445 ntohs(inp->in6p_lport),
5446 inet_ntop(SOCK_DOM(so),
5447 (SOCK_DOM(so) == PF_INET) ?
5448 (void *)&inp->inp_faddr.s_addr :
0a7de745 5449 (void *)&inp->in6p_faddr, d, sizeof(d)),
39236c6e
A
5450 ntohs(inp->in6p_fport),
5451 (so->so_flags & SOF_NODEFUNCT) ?
39037602 5452 "not " : "");
39236c6e 5453 } else {
39037602
A
5454 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5455 "is now marked as %seligible for "
5456 "defunct\n",
39236c6e 5457 __func__, proc_selfpid(),
39037602 5458 proc_best_name(current_proc()),
3e170ce0 5459 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39236c6e
A
5460 SOCK_DOM(so), SOCK_TYPE(so),
5461 (so->so_flags & SOF_NODEFUNCT) ?
39037602 5462 "not " : "");
39236c6e 5463 }
6d2010ae
A
5464 break;
5465
5466 case SO_ISDEFUNCT:
5467 /* This option is not settable */
5468 error = EINVAL;
5469 break;
d41d1dae 5470
316670eb 5471 case SO_OPPORTUNISTIC:
0a7de745
A
5472 error = sooptcopyin(sopt, &optval, sizeof(optval),
5473 sizeof(optval));
5474 if (error == 0) {
316670eb 5475 error = so_set_opportunistic(so, optval);
0a7de745 5476 }
316670eb
A
5477 break;
5478
5479 case SO_FLUSH:
5480 /* This option is handled by lower layer(s) */
5481 error = 0;
5482 break;
5483
5484 case SO_RECV_ANYIF:
0a7de745
A
5485 error = sooptcopyin(sopt, &optval, sizeof(optval),
5486 sizeof(optval));
5487 if (error == 0) {
316670eb 5488 error = so_set_recv_anyif(so, optval);
0a7de745 5489 }
316670eb
A
5490 break;
5491
39236c6e
A
5492 case SO_TRAFFIC_MGT_BACKGROUND: {
5493 /* This option is handled by lower layer(s) */
5494 error = 0;
5495 break;
5496 }
5497
5498#if FLOW_DIVERT
5499 case SO_FLOW_DIVERT_TOKEN:
5500 error = flow_divert_token_set(so, sopt);
5501 break;
0a7de745 5502#endif /* FLOW_DIVERT */
39236c6e
A
5503
5504
5505 case SO_DELEGATED:
0a7de745
A
5506 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5507 sizeof(optval))) != 0) {
39236c6e 5508 break;
0a7de745 5509 }
39236c6e 5510
cb323159 5511 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
39236c6e
A
5512 break;
5513
5514 case SO_DELEGATED_UUID: {
5515 uuid_t euuid;
5516
0a7de745
A
5517 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5518 sizeof(euuid))) != 0) {
39236c6e 5519 break;
0a7de745 5520 }
39236c6e 5521
cb323159 5522 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
39236c6e
A
5523 break;
5524 }
3e170ce0 5525
fe8ab488
A
5526#if NECP
5527 case SO_NECP_ATTRIBUTES:
5528 error = necp_set_socket_attributes(so, sopt);
5529 break;
fe8ab488 5530
cb323159 5531 case SO_NECP_CLIENTUUID: {
5ba3f43e
A
5532 if (SOCK_DOM(so) == PF_MULTIPATH) {
5533 /* Handled by MPTCP itself */
fe8ab488
A
5534 break;
5535 }
5536
5ba3f43e
A
5537 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5538 error = EINVAL;
fe8ab488 5539 goto out;
5ba3f43e
A
5540 }
5541
5542 struct inpcb *inp = sotoinpcb(so);
5543 if (!uuid_is_null(inp->necp_client_uuid)) {
5544 // Clear out the old client UUID if present
5545 necp_inpcb_remove_cb(inp);
5546 }
5547
5548 error = sooptcopyin(sopt, &inp->necp_client_uuid,
0a7de745 5549 sizeof(uuid_t), sizeof(uuid_t));
5ba3f43e
A
5550 if (error != 0) {
5551 goto out;
5552 }
5553
5554 if (uuid_is_null(inp->necp_client_uuid)) {
5555 error = EINVAL;
5556 goto out;
5557 }
5558
cb323159
A
5559 pid_t current_pid = proc_pid(current_proc());
5560 error = necp_client_register_socket_flow(current_pid,
5ba3f43e
A
5561 inp->necp_client_uuid, inp);
5562 if (error != 0) {
5563 uuid_clear(inp->necp_client_uuid);
5564 goto out;
5565 }
5566
5567 if (inp->inp_lport != 0) {
cb323159 5568 // There is a bound local port, so this is not
5ba3f43e 5569 // a fresh socket. Assign to the client.
cb323159 5570 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5ba3f43e
A
5571 }
5572
fe8ab488 5573 break;
cb323159
A
5574 }
5575 case SO_NECP_LISTENUUID: {
5576 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5577 error = EINVAL;
5578 goto out;
5579 }
5580
5581 struct inpcb *inp = sotoinpcb(so);
5582 if (!uuid_is_null(inp->necp_client_uuid)) {
5583 error = EINVAL;
5584 goto out;
5585 }
5586
5587 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5588 sizeof(uuid_t), sizeof(uuid_t));
5589 if (error != 0) {
5590 goto out;
5591 }
5592
5593 if (uuid_is_null(inp->necp_client_uuid)) {
5594 error = EINVAL;
5595 goto out;
5596 }
5597
5598 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5599 inp->necp_client_uuid, inp);
5600 if (error != 0) {
5601 uuid_clear(inp->necp_client_uuid);
5602 goto out;
5603 }
5604
5605 // Mark that the port registration is held by NECP
5606 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5607
5608 break;
5609 }
5ba3f43e 5610#endif /* NECP */
39236c6e 5611
3e170ce0 5612 case SO_EXTENDED_BK_IDLE:
0a7de745
A
5613 error = sooptcopyin(sopt, &optval, sizeof(optval),
5614 sizeof(optval));
5615 if (error == 0) {
3e170ce0 5616 error = so_set_extended_bk_idle(so, optval);
0a7de745 5617 }
3e170ce0
A
5618 break;
5619
490019cf
A
5620 case SO_MARK_CELLFALLBACK:
5621 error = sooptcopyin(sopt, &optval, sizeof(optval),
5622 sizeof(optval));
0a7de745 5623 if (error != 0) {
490019cf 5624 goto out;
0a7de745 5625 }
490019cf
A
5626 if (optval < 0) {
5627 error = EINVAL;
5628 goto out;
5629 }
0a7de745 5630 if (optval == 0) {
490019cf 5631 so->so_flags1 &= ~SOF1_CELLFALLBACK;
0a7de745 5632 } else {
490019cf 5633 so->so_flags1 |= SOF1_CELLFALLBACK;
0a7de745 5634 }
490019cf 5635 break;
39037602 5636
cb323159
A
5637 case SO_STATISTICS_EVENT:
5638 error = sooptcopyin(sopt, &long_optval,
5639 sizeof(long_optval), sizeof(long_optval));
5640 if (error != 0) {
5641 goto out;
5642 }
5643 u_int64_t nstat_event = 0;
5644 error = so_statistics_event_to_nstat_event(
5645 &long_optval, &nstat_event);
5646 if (error != 0) {
5647 goto out;
5648 }
5649 nstat_pcb_event(sotoinpcb(so), nstat_event);
5650 break;
5651
39037602
A
5652 case SO_NET_SERVICE_TYPE: {
5653 error = sooptcopyin(sopt, &optval, sizeof(optval),
5654 sizeof(optval));
0a7de745 5655 if (error != 0) {
39037602 5656 goto out;
0a7de745 5657 }
39037602
A
5658 error = so_set_net_service_type(so, optval);
5659 break;
5660 }
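SO_NET_SERVICE_TYPE is the public entry point into the traffic-class machinery: the NET_SERVICE_TYPE_* value supplied here is what so_set_net_service_type() validates and records. Sketch (mark_interactive_video is a hypothetical helper):

#include <sys/socket.h>

static int
mark_interactive_video(int fd)
{
	int nst = NET_SERVICE_TYPE_VI;	/* interactive-video service class */

	return setsockopt(fd, SOL_SOCKET, SO_NET_SERVICE_TYPE,
	    &nst, sizeof(nst));
}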
5661
5662 case SO_QOSMARKING_POLICY_OVERRIDE:
5663 error = priv_check_cred(kauth_cred_get(),
5664 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
0a7de745 5665 if (error != 0) {
39037602 5666 goto out;
0a7de745 5667 }
39037602
A
5668 error = sooptcopyin(sopt, &optval, sizeof(optval),
5669 sizeof(optval));
0a7de745 5670 if (error != 0) {
39037602 5671 goto out;
0a7de745
A
5672 }
5673 if (optval == 0) {
39037602 5674 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
0a7de745 5675 } else {
39037602 5676 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
0a7de745 5677 }
39037602
A
5678 break;
5679
cb323159
A
5680 case SO_MPKL_SEND_INFO: {
5681 struct so_mpkl_send_info so_mpkl_send_info;
5682
5683 error = sooptcopyin(sopt, &so_mpkl_send_info,
5684 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5685 if (error != 0) {
5686 goto out;
5687 }
5688 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5689 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5690
5691 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5692 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5693 } else {
5694 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5695 }
5696 break;
5697 }
f427ee49
A
5698 case SO_WANT_KEV_SOCKET_CLOSED: {
5699 error = sooptcopyin(sopt, &optval, sizeof(optval),
5700 sizeof(optval));
5701 if (error != 0) {
5702 goto out;
5703 }
5704 if (optval == 0) {
5705 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5706 } else {
5707 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5708 }
5709 break;
5710 }
1c79356b
A
5711 default:
5712 error = ENOPROTOOPT;
5713 break;
5714 }
39236c6e
A
5715 if (error == 0 && so->so_proto != NULL &&
5716 so->so_proto->pr_ctloutput != NULL) {
5717 (void) so->so_proto->pr_ctloutput(so, sopt);
1c79356b
A
5718 }
5719 }
39236c6e 5720out:
0a7de745 5721 if (dolock) {
39236c6e 5722 socket_unlock(so, 1);
0a7de745
A
5723 }
5724 return error;
1c79356b
A
5725}
5726
2d21ac55 5727/* Helper routines for getsockopt */
1c79356b 5728int
2d21ac55 5729sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
1c79356b 5730{
0a7de745
A
5731 int error;
5732 size_t valsize;
1c79356b
A
5733
5734 error = 0;
5735
5736 /*
5737 * Documented get behavior is that we always return a value,
5738 * possibly truncated to fit in the user's buffer.
5739 * Traditional behavior is that we always tell the user
5740 * precisely how much we copied, rather than something useful
5741 * like the total amount we had available for her.
 5742 * Note that this interface is not idempotent; the entire answer must
 5743 * be generated ahead of time.
5744 */
5745 valsize = min(len, sopt->sopt_valsize);
5746 sopt->sopt_valsize = valsize;
91447636 5747 if (sopt->sopt_val != USER_ADDR_NULL) {
0a7de745 5748 if (sopt->sopt_p != kernproc) {
1c79356b 5749 error = copyout(buf, sopt->sopt_val, valsize);
0a7de745 5750 } else {
91447636 5751 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
0a7de745 5752 }
1c79356b 5753 }
0a7de745 5754 return error;
2d21ac55
A
5755}
5756
5757static int
39236c6e 5758sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
2d21ac55 5759{
0a7de745
A
5760 int error;
5761 size_t len;
5762 struct user64_timeval tv64 = {};
5763 struct user32_timeval tv32 = {};
5764 const void * val;
5765 size_t valsize;
b0d623f7 5766
2d21ac55
A
5767 error = 0;
5768 if (proc_is64bit(sopt->sopt_p)) {
0a7de745 5769 len = sizeof(tv64);
2d21ac55
A
5770 tv64.tv_sec = tv_p->tv_sec;
5771 tv64.tv_usec = tv_p->tv_usec;
5772 val = &tv64;
5773 } else {
0a7de745 5774 len = sizeof(tv32);
b0d623f7
A
5775 tv32.tv_sec = tv_p->tv_sec;
5776 tv32.tv_usec = tv_p->tv_usec;
5777 val = &tv32;
2d21ac55
A
5778 }
5779 valsize = min(len, sopt->sopt_valsize);
5780 sopt->sopt_valsize = valsize;
5781 if (sopt->sopt_val != USER_ADDR_NULL) {
0a7de745 5782 if (sopt->sopt_p != kernproc) {
2d21ac55 5783 error = copyout(val, sopt->sopt_val, valsize);
0a7de745 5784 } else {
2d21ac55 5785 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
0a7de745 5786 }
2d21ac55 5787 }
0a7de745 5788 return error;
1c79356b
A
5789}
5790
2d21ac55
A
5791/*
5792 * Return: 0 Success
5793 * ENOPROTOOPT
5794 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5795 * <pr_ctloutput>:???
5796 * <sf_getoption>:???
5797 */
1c79356b 5798int
39236c6e 5799sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
1c79356b 5800{
0a7de745
A
5801 int error, optval;
5802 struct linger l;
5803 struct timeval tv;
1c79356b 5804
0a7de745 5805 if (sopt->sopt_dir != SOPT_GET) {
2d21ac55 5806 sopt->sopt_dir = SOPT_GET;
0a7de745 5807 }
9bccf70c 5808
0a7de745 5809 if (dolock) {
39236c6e 5810 socket_lock(so, 1);
0a7de745 5811 }
2d21ac55 5812
6d2010ae 5813 error = sflt_getsockopt(so, sopt);
39236c6e 5814 if (error != 0) {
0a7de745 5815 if (error == EJUSTRETURN) {
6d2010ae 5816 error = 0;
0a7de745 5817 }
39236c6e 5818 goto out;
1c79356b 5819 }
39236c6e 5820
1c79356b 5821 if (sopt->sopt_level != SOL_SOCKET) {
39236c6e
A
5822 if (so->so_proto != NULL &&
5823 so->so_proto->pr_ctloutput != NULL) {
2d21ac55 5824 error = (*so->so_proto->pr_ctloutput)(so, sopt);
39236c6e 5825 goto out;
91447636 5826 }
39236c6e 5827 error = ENOPROTOOPT;
1c79356b 5828 } else {
39236c6e
A
5829 /*
5830 * Allow socket-level (SOL_SOCKET) options to be filtered by
5831 * the protocol layer, if needed. A zero value returned from
5832 * the handler means use default socket-level processing as
5833 * done by the rest of this routine. Otherwise, any other
5834 * return value indicates that the option is unsupported.
5835 */
5836 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
0a7de745 5837 pru_socheckopt(so, sopt)) != 0) {
39236c6e 5838 goto out;
0a7de745 5839 }
39236c6e
A
5840
5841 error = 0;
1c79356b
A
5842 switch (sopt->sopt_name) {
5843 case SO_LINGER:
91447636 5844 case SO_LINGER_SEC:
39236c6e 5845 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
2d21ac55
A
5846 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5847 so->so_linger : so->so_linger / hz;
0a7de745 5848 error = sooptcopyout(sopt, &l, sizeof(l));
1c79356b
A
5849 break;
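As the case above shows, SO_LINGER reports l_linger in the clock ticks that so_linger is stored in, while SO_LINGER_SEC divides by hz to report seconds. Sketch (get_linger_seconds is a hypothetical helper):

#include <sys/socket.h>

static int
get_linger_seconds(int fd, struct linger *l)
{
	socklen_t len = sizeof(*l);

	/* SO_LINGER_SEC yields l_linger in seconds; plain SO_LINGER
	 * would yield the same value in clock ticks. */
	return getsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, l, &len);
}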
5850
5851 case SO_USELOOPBACK:
5852 case SO_DONTROUTE:
5853 case SO_DEBUG:
5854 case SO_KEEPALIVE:
5855 case SO_REUSEADDR:
5856 case SO_REUSEPORT:
5857 case SO_BROADCAST:
5858 case SO_OOBINLINE:
5859 case SO_TIMESTAMP:
6d2010ae 5860 case SO_TIMESTAMP_MONOTONIC:
d9a64523 5861 case SO_TIMESTAMP_CONTINUOUS:
1c79356b
A
5862 case SO_DONTTRUNC:
5863 case SO_WANTMORE:
9bccf70c 5864 case SO_WANTOOBFLAG:
fe8ab488 5865 case SO_NOWAKEFROMSLEEP:
39037602 5866 case SO_NOAPNFALLBK:
1c79356b
A
5867 optval = so->so_options & sopt->sopt_name;
5868integer:
0a7de745 5869 error = sooptcopyout(sopt, &optval, sizeof(optval));
1c79356b
A
5870 break;
5871
5872 case SO_TYPE:
5873 optval = so->so_type;
5874 goto integer;
5875
5876 case SO_NREAD:
2d21ac55
A
5877 if (so->so_proto->pr_flags & PR_ATOMIC) {
5878 int pkt_total;
5879 struct mbuf *m1;
1c79356b 5880
2d21ac55
A
5881 pkt_total = 0;
5882 m1 = so->so_rcv.sb_mb;
39236c6e
A
5883 while (m1 != NULL) {
5884 if (m1->m_type == MT_DATA ||
5885 m1->m_type == MT_HEADER ||
0a7de745 5886 m1->m_type == MT_OOBDATA) {
1c79356b 5887 pkt_total += m1->m_len;
0a7de745 5888 }
1c79356b
A
5889 m1 = m1->m_next;
5890 }
5891 optval = pkt_total;
2d21ac55
A
5892 } else {
5893 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5894 }
1c79356b 5895 goto integer;
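For atomic protocols such as UDP the loop above sums only the data mbufs of the first record, so SO_NREAD reports the size of the next datagram rather than everything buffered; for stream sockets it reports readable bytes minus control data. Sketch (next_read_size is a hypothetical helper):

#include <sys/socket.h>

static int
next_read_size(int fd)
{
	int nread = 0;
	socklen_t len = sizeof(nread);

	if (getsockopt(fd, SOL_SOCKET, SO_NREAD, &nread, &len) == -1)
		return -1;
	return nread;	/* next datagram size (UDP) or readable bytes (TCP) */
}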
39236c6e 5896
fe8ab488
A
5897 case SO_NUMRCVPKT:
5898 if (so->so_proto->pr_flags & PR_ATOMIC) {
5899 int cnt = 0;
5900 struct mbuf *m1;
5901
5902 m1 = so->so_rcv.sb_mb;
5903 while (m1 != NULL) {
cb323159 5904 cnt += 1;
fe8ab488
A
5905 m1 = m1->m_nextpkt;
5906 }
5907 optval = cnt;
5908 goto integer;
5909 } else {
cb323159 5910 error = ENOPROTOOPT;
fe8ab488
A
5911 break;
5912 }
5913
91447636
A
5914 case SO_NWRITE:
5915 optval = so->so_snd.sb_cc;
2d21ac55 5916 goto integer;
39236c6e 5917
1c79356b
A
5918 case SO_ERROR:
5919 optval = so->so_error;
5920 so->so_error = 0;
5921 goto integer;
5922
fe8ab488
A
5923 case SO_SNDBUF: {
5924 u_int32_t hiwat = so->so_snd.sb_hiwat;
1c79356b 5925
fe8ab488
A
5926 if (so->so_snd.sb_flags & SB_UNIX) {
5927 struct unpcb *unp =
5928 (struct unpcb *)(so->so_pcb);
5929 if (unp != NULL && unp->unp_conn != NULL) {
5930 hiwat += unp->unp_conn->unp_cc;
5931 }
5932 }
5933
5934 optval = hiwat;
5935 goto integer;
5936 }
1c79356b
A
5937 case SO_RCVBUF:
5938 optval = so->so_rcv.sb_hiwat;
5939 goto integer;
5940
5941 case SO_SNDLOWAT:
5942 optval = so->so_snd.sb_lowat;
5943 goto integer;
5944
5945 case SO_RCVLOWAT:
5946 optval = so->so_rcv.sb_lowat;
5947 goto integer;
5948
5949 case SO_SNDTIMEO:
5950 case SO_RCVTIMEO:
91447636 5951 tv = (sopt->sopt_name == SO_SNDTIMEO ?
2d21ac55 5952 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1c79356b 5953
2d21ac55
A
5954 error = sooptcopyout_timeval(sopt, &tv);
5955 break;
1c79356b 5956
91447636
A
5957 case SO_NOSIGPIPE:
5958 optval = (so->so_flags & SOF_NOSIGPIPE);
5959 goto integer;
9bccf70c 5960
55e303ae 5961 case SO_NOADDRERR:
91447636
A
5962 optval = (so->so_flags & SOF_NOADDRAVAIL);
5963 goto integer;
55e303ae 5964
2d21ac55
A
5965 case SO_REUSESHAREUID:
5966 optval = (so->so_flags & SOF_REUSESHAREUID);
5967 goto integer;
5968
39236c6e 5969
2d21ac55
A
5970 case SO_NOTIFYCONFLICT:
5971 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5972 goto integer;
39236c6e 5973
2d21ac55 5974 case SO_RESTRICTIONS:
39236c6e 5975 optval = so_get_restrictions(so);
2d21ac55
A
5976 goto integer;
5977
fe8ab488 5978 case SO_AWDL_UNRESTRICTED:
3e170ce0 5979 if (SOCK_DOM(so) == PF_INET ||
fe8ab488
A
5980 SOCK_DOM(so) == PF_INET6) {
5981 optval = inp_get_awdl_unrestricted(
0a7de745 5982 sotoinpcb(so));
fe8ab488 5983 goto integer;
0a7de745 5984 } else {
fe8ab488 5985 error = EOPNOTSUPP;
0a7de745 5986 }
fe8ab488
A
5987 break;
5988
39037602
A
5989 case SO_INTCOPROC_ALLOW:
5990 if (SOCK_DOM(so) == PF_INET6) {
5991 optval = inp_get_intcoproc_allowed(
0a7de745 5992 sotoinpcb(so));
39037602 5993 goto integer;
0a7de745 5994 } else {
39037602 5995 error = EOPNOTSUPP;
0a7de745 5996 }
39037602
A
5997 break;
5998
2d21ac55 5999 case SO_LABEL:
2d21ac55 6000 error = EOPNOTSUPP;
2d21ac55
A
6001 break;
6002
6003 case SO_PEERLABEL:
2d21ac55 6004 error = EOPNOTSUPP;
2d21ac55
A
6005 break;
6006
4a3eedf9
A
6007#ifdef __APPLE_API_PRIVATE
6008 case SO_UPCALLCLOSEWAIT:
6009 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6010 goto integer;
6011#endif
b0d623f7
A
6012 case SO_RANDOMPORT:
6013 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6014 goto integer;
6015
6016 case SO_NP_EXTENSIONS: {
527f9951 6017 struct so_np_extensions sonpx = {};
b0d623f7 6018
39236c6e
A
6019 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6020 SONPX_SETOPTSHUT : 0;
b0d623f7 6021 sonpx.npx_mask = SONPX_MASK_VALID;
4a3eedf9 6022
39236c6e 6023 error = sooptcopyout(sopt, &sonpx,
0a7de745 6024 sizeof(struct so_np_extensions));
39236c6e 6025 break;
b0d623f7 6026 }
6d2010ae 6027
d41d1dae
A
6028 case SO_TRAFFIC_CLASS:
6029 optval = so->so_traffic_class;
6030 goto integer;
316670eb 6031
6d2010ae
A
6032 case SO_RECV_TRAFFIC_CLASS:
6033 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6034 goto integer;
6035
39037602 6036#if (DEVELOPMENT || DEBUG)
39236c6e 6037 case SO_TRAFFIC_CLASS_DBG:
6d2010ae
A
6038 error = sogetopt_tcdbg(so, sopt);
6039 break;
39037602 6040#endif /* (DEVELOPMENT || DEBUG) */
316670eb
A
6041
6042 case SO_PRIVILEGED_TRAFFIC_CLASS:
6043 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6044 goto integer;
6045
6d2010ae
A
6046 case SO_DEFUNCTOK:
6047 optval = !(so->so_flags & SOF_NODEFUNCT);
6048 goto integer;
6049
6050 case SO_ISDEFUNCT:
6051 optval = (so->so_flags & SOF_DEFUNCT);
6052 goto integer;
d41d1dae 6053
316670eb
A
6054 case SO_OPPORTUNISTIC:
6055 optval = so_get_opportunistic(so);
6056 goto integer;
6057
6058 case SO_FLUSH:
6059 /* This option is not gettable */
6060 error = EINVAL;
6061 break;
6062
6063 case SO_RECV_ANYIF:
6064 optval = so_get_recv_anyif(so);
6065 goto integer;
6066
39236c6e
A
6067 case SO_TRAFFIC_MGT_BACKGROUND:
6068 /* This option is handled by lower layer(s) */
6069 if (so->so_proto != NULL &&
6070 so->so_proto->pr_ctloutput != NULL) {
6071 (void) so->so_proto->pr_ctloutput(so, sopt);
6072 }
6073 break;
6074
6075#if FLOW_DIVERT
6076 case SO_FLOW_DIVERT_TOKEN:
6077 error = flow_divert_token_get(so, sopt);
6078 break;
0a7de745 6079#endif /* FLOW_DIVERT */
3e170ce0 6080
fe8ab488
A
6081#if NECP
6082 case SO_NECP_ATTRIBUTES:
6083 error = necp_get_socket_attributes(so, sopt);
6084 break;
5ba3f43e 6085
cb323159 6086 case SO_NECP_CLIENTUUID: {
5ba3f43e
A
6087 uuid_t *ncu;
6088
6089 if (SOCK_DOM(so) == PF_MULTIPATH) {
6090 ncu = &mpsotomppcb(so)->necp_client_uuid;
6091 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6092 ncu = &sotoinpcb(so)->necp_client_uuid;
6093 } else {
6094 error = EINVAL;
6095 goto out;
6096 }
6097
6098 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6099 break;
6100 }
cb323159
A
6101
6102 case SO_NECP_LISTENUUID: {
6103 uuid_t *nlu;
6104
6105 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6106 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6107 nlu = &sotoinpcb(so)->necp_client_uuid;
6108 } else {
6109 error = ENOENT;
6110 goto out;
6111 }
6112 } else {
6113 error = EINVAL;
6114 goto out;
6115 }
6116
6117 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6118 break;
6119 }
fe8ab488
A
6120#endif /* NECP */
6121
6122#if CONTENT_FILTER
6123 case SO_CFIL_SOCK_ID: {
6124 cfil_sock_id_t sock_id;
6125
6126 sock_id = cfil_sock_id_from_socket(so);
6127
3e170ce0 6128 error = sooptcopyout(sopt, &sock_id,
0a7de745 6129 sizeof(cfil_sock_id_t));
fe8ab488
A
6130 break;
6131 }
0a7de745 6132#endif /* CONTENT_FILTER */
fe8ab488 6133
3e170ce0
A
6134 case SO_EXTENDED_BK_IDLE:
6135 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6136 goto integer;
490019cf
A
6137 case SO_MARK_CELLFALLBACK:
6138 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6139 ? 1 : 0;
6140 goto integer;
39037602 6141 case SO_NET_SERVICE_TYPE: {
0a7de745 6142 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
39037602 6143 optval = so->so_netsvctype;
0a7de745 6144 } else {
39037602 6145 optval = NET_SERVICE_TYPE_BE;
0a7de745 6146 }
39037602
A
6147 goto integer;
6148 }
6149 case SO_NETSVC_MARKING_LEVEL:
6150 optval = so_get_netsvc_marking_level(so);
6151 goto integer;
6152
cb323159
A
6153 case SO_MPKL_SEND_INFO: {
6154 struct so_mpkl_send_info so_mpkl_send_info;
6155
6156 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6157 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6158 error = sooptcopyout(sopt, &so_mpkl_send_info,
6159 sizeof(struct so_mpkl_send_info));
6160 break;
6161 }
1c79356b
A
6162 default:
6163 error = ENOPROTOOPT;
6164 break;
6165 }
1c79356b 6166 }
39236c6e 6167out:
0a7de745 6168 if (dolock) {
39236c6e 6169 socket_unlock(so, 1);
0a7de745
A
6170 }
6171 return error;
1c79356b 6172}
39236c6e
A
6173
6174/*
 6175 * The size limit on our soopt_getm() differs from FreeBSD's.
6d2010ae
A
6176 * We limit the size of options to MCLBYTES. This will have to change
6177 * if we need to define options that need more space than MCLBYTES.
6178 */
1c79356b 6179int
9bccf70c 6180soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1c79356b
A
6181{
6182 struct mbuf *m, *m_prev;
6183 int sopt_size = sopt->sopt_valsize;
b0d623f7 6184 int how;
1c79356b 6185
0a7de745
A
6186 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6187 return EMSGSIZE;
6188 }
a3d08fcd 6189
b0d623f7
A
6190 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6191 MGET(m, how, MT_DATA);
0a7de745
A
6192 if (m == NULL) {
6193 return ENOBUFS;
6194 }
1c79356b 6195 if (sopt_size > MLEN) {
b0d623f7 6196 MCLGET(m, how);
1c79356b
A
6197 if ((m->m_flags & M_EXT) == 0) {
6198 m_free(m);
0a7de745 6199 return ENOBUFS;
1c79356b
A
6200 }
6201 m->m_len = min(MCLBYTES, sopt_size);
6202 } else {
6203 m->m_len = min(MLEN, sopt_size);
6204 }
6205 sopt_size -= m->m_len;
6206 *mp = m;
6207 m_prev = m;
6208
6d2010ae 6209 while (sopt_size > 0) {
b0d623f7 6210 MGET(m, how, MT_DATA);
39236c6e 6211 if (m == NULL) {
1c79356b 6212 m_freem(*mp);
0a7de745 6213 return ENOBUFS;
1c79356b
A
6214 }
6215 if (sopt_size > MLEN) {
b0d623f7 6216 MCLGET(m, how);
1c79356b
A
6217 if ((m->m_flags & M_EXT) == 0) {
6218 m_freem(*mp);
6d2010ae 6219 m_freem(m);
0a7de745 6220 return ENOBUFS;
1c79356b
A
6221 }
6222 m->m_len = min(MCLBYTES, sopt_size);
6223 } else {
6224 m->m_len = min(MLEN, sopt_size);
6225 }
6226 sopt_size -= m->m_len;
6227 m_prev->m_next = m;
6228 m_prev = m;
6229 }
0a7de745 6230 return 0;
1c79356b
A
6231}
6232
6d2010ae 6233/* copyin sopt data into mbuf chain */
1c79356b 6234int
9bccf70c 6235soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
1c79356b
A
6236{
6237 struct mbuf *m0 = m;
6238
0a7de745
A
6239 if (sopt->sopt_val == USER_ADDR_NULL) {
6240 return 0;
6241 }
1c79356b 6242 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
b0d623f7 6243 if (sopt->sopt_p != kernproc) {
1c79356b
A
6244 int error;
6245
2d21ac55
A
6246 error = copyin(sopt->sopt_val, mtod(m, char *),
6247 m->m_len);
1c79356b
A
6248 if (error != 0) {
6249 m_freem(m0);
0a7de745 6250 return error;
1c79356b 6251 }
2d21ac55
A
6252 } else {
6253 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6254 mtod(m, char *), m->m_len);
6255 }
1c79356b 6256 sopt->sopt_valsize -= m->m_len;
2d21ac55 6257 sopt->sopt_val += m->m_len;
1c79356b
A
6258 m = m->m_next;
6259 }
39236c6e
A
 6260 /* the chain should have been allocated large enough by ip6_sooptmcopyin() */
6261 if (m != NULL) {
9bccf70c 6262 panic("soopt_mcopyin");
39236c6e
A
6263 /* NOTREACHED */
6264 }
0a7de745 6265 return 0;
1c79356b
A
6266}
6267
6d2010ae 6268/* copyout mbuf chain data into soopt */
1c79356b 6269int
9bccf70c 6270soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
1c79356b
A
6271{
6272 struct mbuf *m0 = m;
6273 size_t valsize = 0;
6274
0a7de745
A
6275 if (sopt->sopt_val == USER_ADDR_NULL) {
6276 return 0;
6277 }
1c79356b 6278 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
b0d623f7 6279 if (sopt->sopt_p != kernproc) {
1c79356b
A
6280 int error;
6281
2d21ac55
A
6282 error = copyout(mtod(m, char *), sopt->sopt_val,
6283 m->m_len);
1c79356b
A
6284 if (error != 0) {
6285 m_freem(m0);
0a7de745 6286 return error;
1c79356b 6287 }
2d21ac55
A
6288 } else {
6289 bcopy(mtod(m, char *),
6290 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6291 }
6292 sopt->sopt_valsize -= m->m_len;
6293 sopt->sopt_val += m->m_len;
6294 valsize += m->m_len;
6295 m = m->m_next;
1c79356b
A
6296 }
6297 if (m != NULL) {
 6298 /* a large enough option buffer should have been supplied from user-land */
6299 m_freem(m0);
0a7de745 6300 return EINVAL;
1c79356b
A
6301 }
6302 sopt->sopt_valsize = valsize;
0a7de745 6303 return 0;
1c79356b
A
6304}
6305
9bccf70c 6306void
2d21ac55 6307sohasoutofband(struct socket *so)
9bccf70c 6308{
0a7de745 6309 if (so->so_pgid < 0) {
9bccf70c 6310 gsignal(-so->so_pgid, SIGURG);
0a7de745 6311 } else if (so->so_pgid > 0) {
2d21ac55 6312 proc_signal(so->so_pgid, SIGURG);
0a7de745 6313 }
9bccf70c 6314 selwakeup(&so->so_rcv.sb_sel);
39037602
A
6315 if (so->so_rcv.sb_flags & SB_KNOTE) {
6316 KNOTE(&so->so_rcv.sb_sel.si_note,
6317 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6318 }
9bccf70c
A
6319}
6320
6321int
39236c6e 6322sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
9bccf70c 6323{
39236c6e 6324#pragma unused(cred)
9bccf70c
A
6325 struct proc *p = current_proc();
6326 int revents = 0;
91447636
A
6327
6328 socket_lock(so, 1);
39236c6e
A
6329 so_update_last_owner_locked(so, PROC_NULL);
6330 so_update_policy(so);
9bccf70c 6331
0a7de745
A
6332 if (events & (POLLIN | POLLRDNORM)) {
6333 if (soreadable(so)) {
9bccf70c 6334 revents |= events & (POLLIN | POLLRDNORM);
0a7de745
A
6335 }
6336 }
9bccf70c 6337
0a7de745
A
6338 if (events & (POLLOUT | POLLWRNORM)) {
6339 if (sowriteable(so)) {
9bccf70c 6340 revents |= events & (POLLOUT | POLLWRNORM);
0a7de745
A
6341 }
6342 }
9bccf70c 6343
0a7de745
A
6344 if (events & (POLLPRI | POLLRDBAND)) {
6345 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
9bccf70c 6346 revents |= events & (POLLPRI | POLLRDBAND);
0a7de745
A
6347 }
6348 }
9bccf70c
A
6349
6350 if (revents == 0) {
6351 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2d21ac55
A
6352 /*
6353 * Darwin sets the flag first,
6354 * BSD calls selrecord first
6355 */
9bccf70c
A
6356 so->so_rcv.sb_flags |= SB_SEL;
6357 selrecord(p, &so->so_rcv.sb_sel, wql);
6358 }
6359
6360 if (events & (POLLOUT | POLLWRNORM)) {
2d21ac55
A
6361 /*
6362 * Darwin sets the flag first,
6363 * BSD calls selrecord first
6364 */
9bccf70c
A
6365 so->so_snd.sb_flags |= SB_SEL;
6366 selrecord(p, &so->so_snd.sb_sel, wql);
6367 }
6368 }
6369
91447636 6370 socket_unlock(so, 1);
0a7de745 6371 return revents;
9bccf70c 6372}
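From user space this logic surfaces through poll(): readability maps to POLLIN/POLLRDNORM, writability to POLLOUT/POLLWRNORM, and an urgent mark to POLLPRI/POLLRDBAND, with selrecord() parking the caller when nothing is ready yet. Sketch (wait_readable is a hypothetical helper):

#include <poll.h>

static int
wait_readable(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI, .revents = 0 };

	return poll(&pfd, 1, timeout_ms);	/* > 0 once revents has a bit set */
}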
55e303ae 6373
55e303ae 6374int
cb323159 6375soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
55e303ae 6376{
f427ee49 6377 struct socket *so = (struct socket *)fp->fp_glob->fg_data;
39037602 6378 int result;
2d21ac55 6379
91447636 6380 socket_lock(so, 1);
39236c6e
A
6381 so_update_last_owner_locked(so, PROC_NULL);
6382 so_update_policy(so);
55e303ae
A
6383
6384 switch (kn->kn_filter) {
6385 case EVFILT_READ:
39037602 6386 kn->kn_filtid = EVFILTID_SOREAD;
55e303ae
A
6387 break;
6388 case EVFILT_WRITE:
39037602 6389 kn->kn_filtid = EVFILTID_SOWRITE;
316670eb
A
6390 break;
6391 case EVFILT_SOCK:
39037602
A
6392 kn->kn_filtid = EVFILTID_SCK;
6393 break;
6394 case EVFILT_EXCEPT:
6395 kn->kn_filtid = EVFILTID_SOEXCEPT;
55e303ae
A
6396 break;
6397 default:
91447636 6398 socket_unlock(so, 1);
cb323159 6399 knote_set_error(kn, EINVAL);
39037602 6400 return 0;
316670eb 6401 }
55e303ae 6402
39037602
A
6403 /*
6404 * call the appropriate sub-filter attach
6405 * with the socket still locked
6406 */
5ba3f43e 6407 result = knote_fops(kn)->f_attach(kn, kev);
55e303ae 6408
91447636 6409 socket_unlock(so, 1);
39037602
A
6410
6411 return result;
55e303ae
A
6412}
6413
55e303ae 6414static int
cb323159 6415filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
55e303ae 6416{
cb323159
A
6417 int retval = 0;
6418 int64_t data = 0;
b0d623f7 6419
cb323159 6420 if (so->so_options & SO_ACCEPTCONN) {
39236c6e
A
6421 /*
 6422 * Radar 6615193: handle the listen case dynamically
 6423 * for the kqueue read filter. This allows listen() to be called
6424 * after registering the kqueue EVFILT_READ.
b0d623f7
A
6425 */
6426
cb323159
A
6427 retval = !TAILQ_EMPTY(&so->so_comp);
6428 data = so->so_qlen;
6429 goto out;
b0d623f7
A
6430 }
6431
6432 /* socket isn't a listener */
3e170ce0
A
6433 /*
 6434 * NOTE_LOWAT specifies a new low-water mark in data, i.e.
6435 * the bytes of protocol data. We therefore exclude any
6436 * control bytes.
6437 */
cb323159 6438 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3e170ce0 6439
39037602
A
6440 if (kn->kn_sfflags & NOTE_OOB) {
6441 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6442 kn->kn_fflags |= NOTE_OOB;
cb323159
A
6443 data -= so->so_oobmark;
6444 retval = 1;
6445 goto out;
91447636 6446 }
04b8595b 6447 }
3e170ce0 6448
04b8595b 6449 if ((so->so_state & SS_CANTRCVMORE)
fe8ab488 6450#if CONTENT_FILTER
04b8595b 6451 && cfil_sock_data_pending(&so->so_rcv) == 0
fe8ab488 6452#endif /* CONTENT_FILTER */
0a7de745 6453 ) {
04b8595b
A
6454 kn->kn_flags |= EV_EOF;
6455 kn->kn_fflags = so->so_error;
cb323159
A
6456 retval = 1;
6457 goto out;
91447636
A
6458 }
6459
0a7de745 6460 if (so->so_error) { /* temporary udp error */
cb323159
A
6461 retval = 1;
6462 goto out;
91447636
A
6463 }
6464
0a7de745 6465 int64_t lowwat = so->so_rcv.sb_lowat;
3e170ce0
A
6466 /*
6467 * Ensure that when NOTE_LOWAT is used, the derived
 6468 * low-water mark is bounded by the receive buffer's
 6469 * high- and low-water marks.
6470 */
39236c6e 6471 if (kn->kn_sfflags & NOTE_LOWAT) {
0a7de745 6472 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6d2010ae 6473 lowwat = so->so_rcv.sb_hiwat;
0a7de745 6474 } else if (kn->kn_sdata > lowwat) {
6d2010ae 6475 lowwat = kn->kn_sdata;
0a7de745 6476 }
6d2010ae 6477 }
39236c6e 6478
ea3f0419
A
6479 /*
6480 * While the `data` field is the amount of data to read,
6481 * 0-sized packets need to wake up the kqueue, see 58140856,
6482 * so we need to take control bytes into account too.
6483 */
6484 retval = (so->so_rcv.sb_cc >= lowwat);
3e170ce0 6485
cb323159
A
6486out:
6487 if (retval && kev) {
6488 knote_fill_kevent(kn, kev, data);
6489 }
6490 return retval;
55e303ae
A
6491}
6492
39037602 6493static int
cb323159 6494filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
39037602 6495{
f427ee49 6496 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602
A
6497
6498 /* socket locked */
6499
6500 /*
6501 * If the caller explicitly asked for OOB results (e.g. poll())
6502 * from EVFILT_READ, then save that off in the hookid field
6503 * and reserve the kn_flags EV_OOBAND bit for output only.
6504 */
6505 if (kn->kn_filter == EVFILT_READ &&
6506 kn->kn_flags & EV_OOBAND) {
6507 kn->kn_flags &= ~EV_OOBAND;
cb323159 6508 kn->kn_hook32 = EV_OOBAND;
39037602 6509 } else {
cb323159 6510 kn->kn_hook32 = 0;
39037602 6511 }
0a7de745 6512 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
39037602 6513 so->so_rcv.sb_flags |= SB_KNOTE;
0a7de745 6514 }
39037602
A
6515
6516 /* indicate if event is already fired */
cb323159 6517 return filt_soread_common(kn, NULL, so);
39037602
A
6518}
6519
55e303ae 6520static void
39037602 6521filt_sordetach(struct knote *kn)
55e303ae 6522{
f427ee49 6523 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602 6524
91447636 6525 socket_lock(so, 1);
0a7de745
A
6526 if (so->so_rcv.sb_flags & SB_KNOTE) {
6527 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
39037602 6528 so->so_rcv.sb_flags &= ~SB_KNOTE;
0a7de745
A
6529 }
6530 }
39037602
A
6531 socket_unlock(so, 1);
6532}
6533
6534/*ARGSUSED*/
6535static int
6536filt_soread(struct knote *kn, long hint)
6537{
f427ee49 6538 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602
A
6539 int retval;
6540
0a7de745 6541 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
39037602 6542 socket_lock(so, 1);
0a7de745 6543 }
39037602 6544
cb323159 6545 retval = filt_soread_common(kn, NULL, so);
39037602 6546
0a7de745 6547 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
39037602 6548 socket_unlock(so, 1);
0a7de745 6549 }
39037602
A
6550
6551 return retval;
6552}
6553
6554static int
cb323159 6555filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
39037602 6556{
f427ee49 6557 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602
A
6558 int retval;
6559
6560 socket_lock(so, 1);
6561
6562 /* save off the new input fflags and data */
6563 kn->kn_sfflags = kev->fflags;
6564 kn->kn_sdata = kev->data;
39037602
A
6565
6566 /* determine if changes result in fired events */
cb323159 6567 retval = filt_soread_common(kn, NULL, so);
55e303ae 6568
91447636 6569 socket_unlock(so, 1);
39037602
A
6570
6571 return retval;
6572}
6573
6574static int
cb323159 6575filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
39037602 6576{
f427ee49 6577 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602
A
6578 int retval;
6579
6580 socket_lock(so, 1);
cb323159 6581 retval = filt_soread_common(kn, kev, so);
39037602
A
6582 socket_unlock(so, 1);
6583
6584 return retval;
55e303ae
A
6585}
6586
316670eb
A
6587int
6588so_wait_for_if_feedback(struct socket *so)
6589{
39236c6e 6590 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
316670eb
A
6591 (so->so_state & SS_ISCONNECTED)) {
6592 struct inpcb *inp = sotoinpcb(so);
0a7de745
A
6593 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6594 return 1;
6595 }
316670eb 6596 }
0a7de745 6597 return 0;
316670eb
A
6598}
6599
55e303ae 6600static int
cb323159 6601filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
55e303ae 6602{
316670eb 6603 int ret = 0;
cb323159 6604 int64_t data = sbspace(&so->so_snd);
91447636 6605
55e303ae 6606 if (so->so_state & SS_CANTSENDMORE) {
2d21ac55 6607 kn->kn_flags |= EV_EOF;
55e303ae 6608 kn->kn_fflags = so->so_error;
cb323159
A
6609 ret = 1;
6610 goto out;
55e303ae 6611 }
cb323159 6612
0a7de745 6613 if (so->so_error) { /* temporary udp error */
cb323159
A
6614 ret = 1;
6615 goto out;
91447636 6616 }
cb323159 6617
3e170ce0 6618 if (!socanwrite(so)) {
cb323159
A
6619 ret = 0;
6620 goto out;
91447636 6621 }
cb323159 6622
3e170ce0 6623 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
cb323159
A
6624 ret = 1;
6625 goto out;
3e170ce0 6626 }
cb323159 6627
0a7de745 6628 int64_t lowwat = so->so_snd.sb_lowat;
cb323159 6629
39236c6e 6630 if (kn->kn_sfflags & NOTE_LOWAT) {
0a7de745 6631 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6d2010ae 6632 lowwat = so->so_snd.sb_hiwat;
0a7de745 6633 } else if (kn->kn_sdata > lowwat) {
6d2010ae 6634 lowwat = kn->kn_sdata;
0a7de745 6635 }
6d2010ae 6636 }
cb323159
A
6637
6638 if (data >= lowwat) {
39037602
A
6639 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6640#if (DEBUG || DEVELOPMENT)
6641 && so_notsent_lowat_check == 1
6642#endif /* DEBUG || DEVELOPMENT */
6643 ) {
6644 if ((SOCK_DOM(so) == PF_INET ||
6645 SOCK_DOM(so) == PF_INET6) &&
6646 so->so_type == SOCK_STREAM) {
fe8ab488
A
6647 ret = tcp_notsent_lowat_check(so);
6648 }
6649#if MPTCP
6650 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6651 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6652 ret = mptcp_notsent_lowat_check(so);
6653 }
6654#endif
6655 else {
cb323159
A
6656 ret = 1;
6657 goto out;
fe8ab488 6658 }
316670eb
A
6659 } else {
6660 ret = 1;
6661 }
6662 }
0a7de745 6663 if (so_wait_for_if_feedback(so)) {
316670eb 6664 ret = 0;
0a7de745 6665 }
cb323159
A
6666
6667out:
6668 if (ret && kev) {
6669 knote_fill_kevent(kn, kev, data);
6670 }
0a7de745 6671 return ret;
316670eb
A
6672}
6673
39037602 6674static int
cb323159 6675filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
39037602 6676{
f427ee49 6677 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602
A
6678
6679 /* socket locked */
0a7de745 6680 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
39037602 6681 so->so_snd.sb_flags |= SB_KNOTE;
0a7de745 6682 }
39037602
A
6683
 6684 /* determine if it's already fired */
cb323159 6685 return filt_sowrite_common(kn, NULL, so);
39037602
A
6686}
6687
316670eb 6688static void
39037602 6689filt_sowdetach(struct knote *kn)
316670eb 6690{
f427ee49 6691 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
316670eb 6692 socket_lock(so, 1);
39236c6e 6693
0a7de745
A
6694 if (so->so_snd.sb_flags & SB_KNOTE) {
6695 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
39037602 6696 so->so_snd.sb_flags &= ~SB_KNOTE;
0a7de745
A
6697 }
6698 }
316670eb
A
6699 socket_unlock(so, 1);
6700}
6701
39037602 6702/*ARGSUSED*/
316670eb 6703static int
39037602 6704filt_sowrite(struct knote *kn, long hint)
316670eb 6705{
f427ee49 6706 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602 6707 int ret;
316670eb 6708
0a7de745 6709 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
316670eb 6710 socket_lock(so, 1);
0a7de745 6711 }
39037602 6712
cb323159 6713 ret = filt_sowrite_common(kn, NULL, so);
39037602 6714
0a7de745 6715 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
39037602 6716 socket_unlock(so, 1);
0a7de745 6717 }
39037602
A
6718
6719 return ret;
6720}
6721
6722static int
cb323159 6723filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
39037602 6724{
f427ee49 6725 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602
A
6726 int ret;
6727
6728 socket_lock(so, 1);
6729
 6730 /* save off the new input fflags and data */
6731 kn->kn_sfflags = kev->fflags;
6732 kn->kn_sdata = kev->data;
39037602
A
6733
6734 /* determine if these changes result in a triggered event */
cb323159 6735 ret = filt_sowrite_common(kn, NULL, so);
39037602
A
6736
6737 socket_unlock(so, 1);
6738
6739 return ret;
6740}
6741
6742static int
cb323159 6743filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
39037602 6744{
f427ee49 6745 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602
A
6746 int ret;
6747
6748 socket_lock(so, 1);
cb323159 6749 ret = filt_sowrite_common(kn, kev, so);
39037602 6750 socket_unlock(so, 1);
cb323159 6751
39037602
A
6752 return ret;
6753}
6754
6755static int
cb323159
A
6756filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6757 struct socket *so, long ev_hint)
39037602
A
6758{
6759 int ret = 0;
cb323159 6760 int64_t data = 0;
39037602 6761 uint32_t level_trigger = 0;
316670eb 6762
39236c6e 6763 if (ev_hint & SO_FILT_HINT_CONNRESET) {
3e170ce0 6764 kn->kn_fflags |= NOTE_CONNRESET;
39236c6e
A
6765 }
6766 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
3e170ce0 6767 kn->kn_fflags |= NOTE_TIMEOUT;
39236c6e
A
6768 }
6769 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
3e170ce0 6770 kn->kn_fflags |= NOTE_NOSRCADDR;
39236c6e
A
6771 }
6772 if (ev_hint & SO_FILT_HINT_IFDENIED) {
3e170ce0 6773 kn->kn_fflags |= NOTE_IFDENIED;
39236c6e
A
6774 }
6775 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
3e170ce0 6776 kn->kn_fflags |= NOTE_KEEPALIVE;
316670eb 6777 }
39236c6e 6778 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
3e170ce0 6779 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
39236c6e
A
6780 }
6781 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
3e170ce0 6782 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
39236c6e 6783 }
3e170ce0
A
6784 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6785 (so->so_state & SS_ISCONNECTED)) {
6786 kn->kn_fflags |= NOTE_CONNECTED;
6787 level_trigger |= NOTE_CONNECTED;
39236c6e 6788 }
3e170ce0
A
6789 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6790 (so->so_state & SS_ISDISCONNECTED)) {
6791 kn->kn_fflags |= NOTE_DISCONNECTED;
6792 level_trigger |= NOTE_DISCONNECTED;
39236c6e
A
6793 }
6794 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6795 if (so->so_proto != NULL &&
0a7de745 6796 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
39236c6e 6797 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
0a7de745 6798 }
39236c6e 6799 }
316670eb 6800
39037602
A
6801 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6802 tcp_notify_ack_active(so)) {
6803 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6804 }
6805
3e170ce0 6806 if ((so->so_state & SS_CANTRCVMORE)
fe8ab488 6807#if CONTENT_FILTER
3e170ce0 6808 && cfil_sock_data_pending(&so->so_rcv) == 0
fe8ab488 6809#endif /* CONTENT_FILTER */
3e170ce0 6810 ) {
316670eb 6811 kn->kn_fflags |= NOTE_READCLOSED;
3e170ce0
A
6812 level_trigger |= NOTE_READCLOSED;
6813 }
316670eb 6814
3e170ce0 6815 if (so->so_state & SS_CANTSENDMORE) {
316670eb 6816 kn->kn_fflags |= NOTE_WRITECLOSED;
3e170ce0
A
6817 level_trigger |= NOTE_WRITECLOSED;
6818 }
316670eb 6819
3e170ce0
A
6820 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6821 (so->so_flags & SOF_SUSPENDED)) {
39236c6e 6822 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
3e170ce0
A
6823
6824 /* If resume event was delivered before, reset it */
cb323159 6825 kn->kn_hook32 &= ~NOTE_RESUME;
3e170ce0 6826
316670eb 6827 kn->kn_fflags |= NOTE_SUSPEND;
3e170ce0 6828 level_trigger |= NOTE_SUSPEND;
316670eb
A
6829 }
6830
3e170ce0
A
6831 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6832 (so->so_flags & SOF_SUSPENDED) == 0) {
39236c6e 6833 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
3e170ce0
A
6834
6835 /* If suspend event was delivered before, reset it */
cb323159 6836 kn->kn_hook32 &= ~NOTE_SUSPEND;
3e170ce0 6837
316670eb 6838 kn->kn_fflags |= NOTE_RESUME;
3e170ce0 6839 level_trigger |= NOTE_RESUME;
316670eb
A
6840 }
6841
6842 if (so->so_error != 0) {
6843 ret = 1;
cb323159 6844 data = so->so_error;
316670eb
A
6845 kn->kn_flags |= EV_EOF;
6846 } else {
f427ee49 6847 u_int32_t data32 = 0;
cb323159
A
6848 get_sockev_state(so, &data32);
6849 data = data32;
316670eb
A
6850 }
6851
3e170ce0
A
6852 /* Reset any events that are not requested on this knote */
6853 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6854 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6855
 6856 /* Find the level-triggered events that are already delivered */
cb323159 6857 level_trigger &= kn->kn_hook32;
3e170ce0
A
6858 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6859
 6860 /* Do not deliver level-triggered events more than once */
0a7de745 6861 if ((kn->kn_fflags & ~level_trigger) != 0) {
316670eb 6862 ret = 1;
0a7de745 6863 }
316670eb 6864
cb323159
A
6865 if (ret && kev) {
6866 /*
6867 * Store the state of the events being delivered. This
6868 * state can be used to deliver level triggered events
 6869 * at least once and still avoid waking up the application
6870 * multiple times as long as the event is active.
6871 */
6872 if (kn->kn_fflags != 0) {
6873 kn->kn_hook32 |= (kn->kn_fflags &
6874 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6875 }
6876
6877 /*
6878 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
 6879 * only one of them and remember which one was
 6880 * delivered last.
6881 */
6882 if (kn->kn_fflags & NOTE_SUSPEND) {
6883 kn->kn_hook32 &= ~NOTE_RESUME;
6884 }
6885 if (kn->kn_fflags & NOTE_RESUME) {
6886 kn->kn_hook32 &= ~NOTE_SUSPEND;
6887 }
6888
6889 knote_fill_kevent(kn, kev, data);
6890 }
0a7de745 6891 return ret;
316670eb
A
6892}
6893
39037602 6894static int
cb323159 6895filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
39037602 6896{
f427ee49 6897 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602
A
6898
6899 /* socket locked */
cb323159 6900 kn->kn_hook32 = 0;
0a7de745 6901 if (KNOTE_ATTACH(&so->so_klist, kn)) {
39037602 6902 so->so_flags |= SOF_KNOTE;
0a7de745 6903 }
39037602
A
6904
6905 /* determine if event already fired */
cb323159 6906 return filt_sockev_common(kn, NULL, so, 0);
39037602
A
6907}
6908
3e170ce0 6909static void
39037602 6910filt_sockdetach(struct knote *kn)
3e170ce0 6911{
f427ee49 6912 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602 6913 socket_lock(so, 1);
3e170ce0 6914
0a7de745
A
6915 if ((so->so_flags & SOF_KNOTE) != 0) {
6916 if (KNOTE_DETACH(&so->so_klist, kn)) {
39037602 6917 so->so_flags &= ~SOF_KNOTE;
0a7de745
A
6918 }
6919 }
39037602
A
6920 socket_unlock(so, 1);
6921}
6922
6923static int
6924filt_sockev(struct knote *kn, long hint)
6925{
6926 int ret = 0, locked = 0;
f427ee49 6927 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602
A
6928 long ev_hint = (hint & SO_FILT_HINT_EV);
6929
6930 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6931 socket_lock(so, 1);
6932 locked = 1;
3e170ce0 6933 }
39037602 6934
cb323159 6935 ret = filt_sockev_common(kn, NULL, so, ev_hint);
39037602 6936
0a7de745 6937 if (locked) {
39037602 6938 socket_unlock(so, 1);
0a7de745 6939 }
39037602
A
6940
6941 return ret;
6942}
6943
6944
6945
6946/*
6947 * filt_socktouch - update event state
6948 */
6949static int
6950filt_socktouch(
6951 struct knote *kn,
cb323159 6952 struct kevent_qos_s *kev)
39037602 6953{
f427ee49 6954 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602
A
6955 uint32_t changed_flags;
6956 int ret;
6957
6958 socket_lock(so, 1);
6959
6960 /* save off the [result] data and fflags */
cb323159 6961 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
39037602
A
6962
6963 /* save off the new input fflags and data */
6964 kn->kn_sfflags = kev->fflags;
6965 kn->kn_sdata = kev->data;
39037602
A
6966
6967 /* restrict the current results to the (smaller?) set of new interest */
6968 /*
6969 * For compatibility with previous implementations, we leave kn_fflags
6970 * as they were before.
6971 */
6972 //kn->kn_fflags &= kev->fflags;
6973
6974 /*
6975 * Since we keep track of events that are already
6976 * delivered, if any of those events are not requested
6977 * anymore the state related to them can be reset
6978 */
cb323159 6979 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
39037602
A
6980
6981 /* determine if we have events to deliver */
cb323159 6982 ret = filt_sockev_common(kn, NULL, so, 0);
39037602
A
6983
6984 socket_unlock(so, 1);
6985
6986 return ret;
6987}
6988
6989/*
6990 * filt_sockprocess - query event fired state and return data
6991 */
6992static int
cb323159 6993filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
39037602 6994{
f427ee49 6995 struct socket *so = (struct socket *)kn->kn_fp->fp_glob->fg_data;
39037602
A
6996 int ret = 0;
6997
6998 socket_lock(so, 1);
6999
cb323159 7000 ret = filt_sockev_common(kn, kev, so, 0);
39037602
A
7001
7002 socket_unlock(so, 1);
7003
7004 return ret;
3e170ce0
A
7005}
7006
316670eb 7007void
39236c6e
A
7008get_sockev_state(struct socket *so, u_int32_t *statep)
7009{
316670eb
A
7010 u_int32_t state = *(statep);
7011
39037602
A
7012 /*
 7013 * If the state variable is already in use by a previous event,
 7014 * leave it untouched.
7015 */
0a7de745 7016 if (state != 0) {
39037602 7017 return;
0a7de745 7018 }
39037602 7019
0a7de745 7020 if (so->so_state & SS_ISCONNECTED) {
316670eb 7021 state |= SOCKEV_CONNECTED;
0a7de745 7022 } else {
316670eb 7023 state &= ~(SOCKEV_CONNECTED);
0a7de745 7024 }
39236c6e 7025 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
316670eb 7026 *(statep) = state;
55e303ae
A
7027}
7028
0a7de745 7029#define SO_LOCK_HISTORY_STR_LEN \
39236c6e 7030 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
b0d623f7 7031
39236c6e
A
7032__private_extern__ const char *
7033solockhistory_nr(struct socket *so)
55e303ae 7034{
39236c6e
A
7035 size_t n = 0;
7036 int i;
7037 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7038
0a7de745 7039 bzero(lock_history_str, sizeof(lock_history_str));
39236c6e 7040 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
4ba76501 7041 n += scnprintf(lock_history_str + n,
39236c6e
A
7042 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7043 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7044 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
b0d623f7 7045 }
0a7de745 7046 return lock_history_str;
55e303ae
A
7047}
7048
cb323159
A
7049lck_mtx_t *
7050socket_getlock(struct socket *so, int flags)
7051{
7052 if (so->so_proto->pr_getlock != NULL) {
7053 return (*so->so_proto->pr_getlock)(so, flags);
7054 } else {
7055 return so->so_proto->pr_domain->dom_mtx;
7056 }
7057}
7058
5ba3f43e 7059void
2d21ac55 7060socket_lock(struct socket *so, int refcount)
91447636 7061{
b0d623f7 7062 void *lr_saved;
0c530ab8 7063
b0d623f7 7064 lr_saved = __builtin_return_address(0);
91447636
A
7065
7066 if (so->so_proto->pr_lock) {
5ba3f43e 7067 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
2d21ac55 7068 } else {
91447636 7069#ifdef MORE_LOCKING_DEBUG
5ba3f43e 7070 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
2d21ac55 7071 LCK_MTX_ASSERT_NOTOWNED);
91447636
A
7072#endif
7073 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
0a7de745 7074 if (refcount) {
91447636 7075 so->so_usecount++;
0a7de745 7076 }
b0d623f7 7077 so->lock_lr[so->next_lock_lr] = lr_saved;
0a7de745 7078 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
91447636 7079 }
5ba3f43e 7080}
91447636 7081
5ba3f43e
A
7082void
7083socket_lock_assert_owned(struct socket *so)
7084{
7085 lck_mtx_t *mutex_held;
7086
0a7de745 7087 if (so->so_proto->pr_getlock != NULL) {
5ba3f43e 7088 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
0a7de745 7089 } else {
5ba3f43e 7090 mutex_held = so->so_proto->pr_domain->dom_mtx;
0a7de745 7091 }
5ba3f43e
A
7092
7093 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
91447636
A
7094}
7095
7096int
5ba3f43e
A
7097socket_try_lock(struct socket *so)
7098{
7099 lck_mtx_t *mtx;
7100
0a7de745 7101 if (so->so_proto->pr_getlock != NULL) {
5ba3f43e 7102 mtx = (*so->so_proto->pr_getlock)(so, 0);
0a7de745 7103 } else {
5ba3f43e 7104 mtx = so->so_proto->pr_domain->dom_mtx;
0a7de745 7105 }
5ba3f43e 7106
0a7de745 7107 return lck_mtx_try_lock(mtx);
5ba3f43e
A
7108}
7109
void
socket_unlock(struct socket *so, int refcount)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = __builtin_return_address(0);

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p\n", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0) {
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}

/* Called with the socket locked; will unlock the socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}

void
soreference(struct socket *so)
{
	socket_lock(so, 1);     /* lock and take one reference on the socket */
	socket_unlock(so, 0);   /* unlock only */
}

void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
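
/*
 * Editor's illustrative sketch (not part of xnu): the refcount arguments
 * to socket_lock()/socket_unlock() give the keep-alive idiom shown by
 * soreference()/sodereference() above. Taking the lock with a nonzero
 * refcount bumps so_usecount; dropping the last use count from
 * socket_unlock() funnels into sofreelastref().
 * "example_hold_across_block" is a hypothetical name.
 */
static void
example_hold_across_block(struct socket *so)
{
	socket_lock(so, 1);     /* lock and take a use count */
	/* ... short critical section ... */
	socket_unlock(so, 0);   /* drop only the lock */

	/* ... blocking work; the use count keeps the socket alive ... */

	socket_lock(so, 0);     /* relock without a new use count */
	socket_unlock(so, 1);   /* unlock and drop the use count */
}
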
/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters. The caller must hold the
 * socket lock.
 */
void
somultipages(struct socket *so, boolean_t set)
{
	if (set) {
		so->so_flags |= SOF_MULTIPAGES;
	} else {
		so->so_flags &= ~SOF_MULTIPAGES;
	}
}

void
soif2kcl(struct socket *so, boolean_t set)
{
	if (set) {
		so->so_flags1 |= SOF1_IF_2KCL;
	} else {
		so->so_flags1 &= ~SOF1_IF_2KCL;
	}
}

int
so_isdstlocal(struct socket *so)
{
	struct inpcb *inp = (struct inpcb *)so->so_pcb;

	if (SOCK_DOM(so) == PF_INET) {
		return inaddr_local(inp->inp_faddr);
	} else if (SOCK_DOM(so) == PF_INET6) {
		return in6addr_local(&inp->in6p_faddr);
	}

	return 0;
}

int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llx [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}

int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
				"%s[%d, %s]: (target pid %d name %s level %d) "
				"so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
				"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
				" snd_fl 0x%x]\n", __func__,
				proc_selfpid(), proc_best_name(current_proc()),
				proc_pid(p), proc_best_name(p), level,
				(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
				inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_laddr.s_addr :
				(void *)&inp->in6p_laddr),
				s, sizeof(s)), ntohs(inp->in6p_lport),
				inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
				(void *)&inp->inp_faddr.s_addr :
				(void *)&inp->in6p_faddr,
				d, sizeof(d)), ntohs(inp->in6p_fport),
				(uint32_t)rcv->sb_sel.si_flags,
				(uint32_t)snd->sb_sel.si_flags,
				rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);    /* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);    /* keep socket locked */
	}

	/*
	 * Flush the buffers and disconnect. We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket. This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}
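
/*
 * Editor's illustrative sketch (not part of xnu): defuncting is a
 * two-step sequence, exactly as so_stop_extended_bk_idle() does later
 * in this file. sosetdefunct() marks the socket and sets SB_DROP on
 * both buffers; sodefunct() then flushes, shuts down and disconnects
 * it. Called with the socket locked. "example_force_defunct" is a
 * hypothetical name.
 */
static void
example_force_defunct(struct socket *so)
{
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
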
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}

/*
 * Does not attempt to account for sockets that are delegated from
 * the current process.
 */
int
so_set_extended_bk_idle(struct socket *so, int optval)
{
	int error = 0;

	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
	    SOCK_PROTO(so) != IPPROTO_TCP) {
		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
		error = EOPNOTSUPP;
	} else if (optval == 0) {
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;

		soresume(current_proc(), so, 1);
	} else {
		struct proc *p = current_proc();
		struct fileproc *fp;
		int count = 0;

		/*
		 * Unlock the socket to avoid a lock-ordering issue with
		 * the proc fd table lock.
		 */
		socket_unlock(so, 0);

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			struct socket *so2;

			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so2 = (struct socket *)fp->fp_glob->fg_data;
			if (so != so2 &&
			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
				count++;
			}
			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
				break;
			}
		}
		proc_fdunlock(p);

		socket_lock(so, 0);

		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
			error = EBUSY;
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
			error = EBUSY;
		} else {
			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
		}
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
		    "%s marked for extended bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    "is" : "not");
	}

	return error;
}
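
/*
 * Editor's illustrative sketch (not part of xnu): the extended
 * background idle lifecycle as implemented above. A process opts a TCP
 * socket in (SOF1_EXTEND_BK_IDLE_WANTED); when the socket would be
 * defuncted, sosetdefunct() defers with EOPNOTSUPP and sets
 * SOF1_EXTEND_BK_IDLE_INPROG; the lazy inpcb timer then calls
 * so_check_extended_bk_idle_time() until the grace period expires or
 * the process resumes. "example_opt_in_bk_idle" is a hypothetical name.
 */
static int
example_opt_in_bk_idle(struct socket *so, boolean_t enable)
{
	int error;

	socket_lock(so, 1);
	error = so_set_extended_bk_idle(so, enable ? 1 : 0);
	socket_unlock(so, 1);
	return error;
}
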
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}

void
so_drain_extended_bk_idle(struct socket *so)
{
	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		/*
		 * Only penalize sockets that have outstanding data.
		 */
		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
		}
	}
}

/*
 * The return value tells whether the socket is still in extended
 * background idle.
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return ret;
}

void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct fileproc *fp;
		struct socket *so;

		proc_fdlock(p);
		fdt_foreach(fp, p) {
			if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
				continue;
			}

			so = (struct socket *)fp->fp_glob->fg_data;
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}

__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
{
	int ret = 0;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (optval) {
			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
		} else {
			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
		}
	}

	return ret;
}

__private_extern__ int
so_get_recv_anyif(struct socket *so)
{
	int ret = 0;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
	}

	return ret;
}
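
/*
 * Editor's note, stated as an assumption: so_set_recv_anyif() and
 * so_get_recv_anyif() back the private SO_RECV_ANYIF socket option
 * declared in bsd/sys/socket.h. For a caller with access to that
 * option, enabling it would look like the userspace sketch below.
 */
#if 0   /* userspace sketch, not kernel code */
#include <sys/socket.h>

static int
example_enable_recv_anyif(int fd)
{
	int one = 1;

	return setsockopt(fd, SOL_SOCKET, SO_RECV_ANYIF, &one, sizeof(one));
}
#endif
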
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket. This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions. For instance, the SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
	 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		if (nocell_new - nocell_old != 0) {
			/*
			 * If deny cellular is now set, do what's needed
			 * for the INPCB.
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
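
/*
 * Editor's illustrative sketch (not part of xnu) of the trapdoor
 * semantics described above: restriction bits are only ever OR-ed into
 * so_restrictions, so a later call cannot clear them.
 */
static void
example_restrictions_are_sticky(struct socket *so)
{
	so_set_restrictions(so, SO_RESTRICT_DENY_CELLULAR); /* sticks */
	so_set_restrictions(so, 0);     /* no-op; cannot clear the bit */
	VERIFY(so_get_restrictions(so) & SO_RESTRICT_DENY_CELLULAR);
}
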
uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	       SO_RESTRICT_DENY_OUT |
	       SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}

int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed. Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared. Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc. Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));

#if defined(XNU_TARGET_OS_OSX)
		if (ep->p_responsible_pid != so->e_pid) {
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
				so->so_rpid = -1;
			}
		}
#endif
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed. Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared. Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself. Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known. Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the
	 * same as the real process.
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	FREE(socksa, M_SONAME);
	FREE(peersa, M_SONAME);
}
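
/*
 * Editor's userspace sketch, stated as an assumption: the events posted
 * by the functions above can be drained through the PF_SYSTEM /
 * SYSPROTO_EVENT interface from <sys/kern_event.h>; the filter values
 * mirror what socket_post_kev_msg() fills in. Illustrative only.
 */
#if 0   /* userspace sketch, not kernel code */
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/sys_domain.h>
#include <sys/kern_event.h>

int
main(void)
{
	char buf[1024];
	struct kern_event_msg *msg = (struct kern_event_msg *)buf;
	struct kev_request req = {
		.vendor_code = KEV_VENDOR_APPLE,
		.kev_class = KEV_NETWORK_CLASS,
		.kev_subclass = KEV_SOCKET_SUBCLASS,
	};
	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);

	if (fd < 0 || ioctl(fd, SIOCSKEVFILT, &req) < 0) {
		return 1;
	}
	for (;;) {
		ssize_t n = read(fd, buf, sizeof(buf));

		if (n < (ssize_t)sizeof(struct kern_event_msg)) {
			break;
		}
		printf("kernel event code %u (%u bytes)\n",
		    msg->event_code, msg->total_size);
	}
	close(fd);
	return 0;
}
#endif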