/* bsd/kern/uipc_socket.c (from the xnu-6153.61.1 source drop) */
/*
 * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* MAC */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))

#if DEBUG || DEVELOPMENT
#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t so_cache_hw;           /* High water mark for socache */
static u_int32_t so_cache_timeouts;     /* number of timeouts */
static u_int32_t so_cache_max_freed;    /* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;

static lck_grp_t *so_cache_mtx_grp;
static lck_attr_t *so_cache_mtx_attr;
static lck_grp_attr_t *so_cache_mtx_grp_attr;
static lck_mtx_t *so_cache_mtx;

#include <machine/limits.h>

static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SYSCTL_DECL(_kern_ipc);

#define EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

static int socket_zone = M_SOCKET;
so_gen_t so_gencnt;             /* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define DBG_LAYER_IN_BEG        NETDBG_CODE(DBG_NETSOCK, 0)
#define DBG_LAYER_IN_END        NETDBG_CODE(DBG_NETSOCK, 2)
#define DBG_LAYER_OUT_BEG       NETDBG_CODE(DBG_NETSOCK, 1)
#define DBG_LAYER_OUT_END       NETDBG_CODE(DBG_NETSOCK, 3)
#define DBG_FNC_SOSEND          NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define DBG_FNC_SOSEND_LIST     NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define DBG_FNC_SORECEIVE       NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define DBG_FNC_SORECEIVE_LIST  NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define DBG_FNC_SOSHUTDOWN      NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define MAX_SOOPTGETM_SIZE      (128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
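
/*
 * Illustrative sketch (editorial addition, not part of the xnu source):
 * knobs declared with SYSCTL_INT under _kern_ipc, like sosendjcl above,
 * are visible from userland via sysctl(3). A minimal reader:
 *
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int val = 0;
 *	size_t len = sizeof(val);
 *	if (sysctlbyname("kern.ipc.sosendjcl", &val, &len, NULL, 0) == 0)
 *		printf("sosendjcl = %d\n", val);
 */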

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above. Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable. Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger than 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);

/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define SO_IDLE_BK_IDLE_MAX_PER_PROC    1
#define SO_IDLE_BK_IDLE_TIME            600
#define SO_IDLE_BK_IDLE_RCV_HIWAT       131072

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");

void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
	if (so_cache_mtx == NULL) {
		panic("%s: unable to allocate so_cache_mtx\n", __func__);
		/* NOTREACHED */
	}
	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zinit(so_cache_zone_element_size,
	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
	sflt_init();
	socket_tclass_init();
#if MULTIPATH
	mp_pcbinit();
#endif /* MULTIPATH */
}

static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(so_cache_mtx);

		if (waitok) {
			*so = (struct socket *)zalloc(so_cache_zone);
		} else {
			*so = (struct socket *)zalloc_noblock(so_cache_zone);
		}

		if (*so == NULL) {
			return;
		}

		bzero((caddr_t)*so, sizeof(struct socket));

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
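
/*
 * Illustrative layout sketch (editorial addition, not part of the xnu
 * source): cached_sock_alloc() carves one so_cache_zone element into
 * three pieces, each extra structure aligned via ALIGN():
 *
 *	+---------------+----------------------+------------------------+
 *	| struct socket | inpcb (so_saved_pcb) | tcpcb (inp_saved_ppcb) |
 *	+---------------+----------------------+------------------------+
 *	|<------ so_cache_zone_element_size bytes, one zalloc() ------->|
 *
 * The trailing space is sized by get_inpcb_str_size() and
 * get_tcp_str_size(), which is why soalloc() only uses this cache for
 * PF_INET/SOCK_STREAM sockets.
 */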

static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(so_cache_mtx);
	}
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that.
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}

void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		(void) inp_update_policy(sotoinpcb(so));
	}
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
	}
}
#endif /* NECP */

boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(so_cache_mtx);
	return rc;
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone,
		    M_WAITOK);
		if (so != NULL) {
			bzero(so, sizeof(*so));
		}
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
		so->so_zone = socket_zone;

		/*
		 * Increment the socket allocation statistics
		 */
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);

#if CONFIG_MACF_SOCKET
		/* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return NULL;
		}
#endif /* MAC_SOCKET */
	}

	return so;
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation; make sure it's done before
	 * the socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so the protocol attachment handler must be coded carefully.
		 */
		so->so_state |= SS_NOFDREF;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);   /* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain or system
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
	           PROC_NULL);
}

int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK
	 */
done:
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}

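/*
 * Illustrative usage sketch (hypothetical caller, editorial addition):
 * kernel code creates and disposes of a TCP socket roughly as below;
 * this is the path the sock_socket() KPI ultimately takes.
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		... use so, e.g. sobindlock()/soconnectlock() below ...
 *		soclose(so);
 *	}
 */
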
/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}
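
/*
 * Illustrative usage sketch (hypothetical, editorial addition, in the
 * manner of the sock_bind() KPI): bind the socket to an IPv4 wildcard
 * address; dolock == 1 makes sobindlock() take and drop the socket lock.
 *
 *	struct sockaddr_in sin;
 *	bzero(&sin, sizeof(sin));
 *	sin.sin_len = sizeof(sin);
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(8080);		(example port)
 *	sin.sin_addr.s_addr = htonl(INADDR_ANY);
 *	error = sobindlock(so, (struct sockaddr *)&sin, 1);
 */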

void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

#if CONTENT_FILTER
	cfil_sock_detach(so);
#endif /* CONTENT_FILTER */

	/* Delete the state allocated for msg queues on a socket */
	if (so->so_flags & SOF_ENABLE_MSGS) {
		FREE(so->so_msg_state, M_TEMP);
		so->so_msg_state = NULL;
	}
	VERIFY(so->so_msg_state == NULL);

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

#if CONFIG_MACF_SOCKET
	mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		FREE_ZONE(so, sizeof(*so), so->so_zone);
	}
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue, either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

	so->so_qlimit = backlog;
out:
	socket_unlock(so, 1);
	return error;
}
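
/*
 * Worked example of the backlog clamp above (editorial note): with the
 * default somaxconn of SOMAXCONN (128), listen(fd, 0), listen(fd, -1)
 * and listen(fd, 4096) all leave so_qlimit at 128, while listen(fd, 5)
 * sets so_qlimit to 5. Raising kern.ipc.somaxconn raises the cap.
 */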

/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets. The lock ordering is first to
 * acquire the client socket before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_incqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 */
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}

void
so_release_accept_list(struct socket *head)
{
	if (head->so_proto->pr_getlock != NULL) {
		lck_mtx_t *mutex_held;

		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
		wakeup((caddr_t)&head->so_incomp);
	}
}
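
/*
 * Illustrative usage sketch (editorial addition): a caller that must walk
 * a listener's queues while per-socket locks are in effect follows the
 * pattern below; soclose_locked() and sofreelastref() are real examples.
 *
 *	socket_lock(head, 1);
 *	so_acquire_accept_list(head, NULL);
 *	... safely traverse head->so_comp / head->so_incomp ...
 *	so_release_accept_list(head);
 *	socket_unlock(head, 1);
 *
 * Note that so_acquire_accept_list() may drop and retake locks while
 * waiting for SOF1_ACCEPT_LIST_HELD, so preflight results must be
 * revalidated once the flag is held.
 */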

void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif  /* FLOW_DIVERT */

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0\n", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount) {
		soclose_wait_locked(so);
	}

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int persocklock = 0;
		int incomp_overflow_only;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		/*
		 * We can drop the lock on the listener once
		 * we've acquired the incoming list
		 */
		if (so->so_proto->pr_getlock != NULL) {
			persocklock = 1;
			so_acquire_accept_list(so, NULL);
			socket_unlock(so, 0);
		}
again:
		incomp_overflow_only = 1;

		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
			/*
			 * Radar 5350314
			 * skip sockets thrown away by tcpdropdropblreq;
			 * they will get cleaned up by the garbage collection.
			 * otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW) {
				continue;
			}

			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			/*
			 * Radar 27945981
			 * The extra reference for the list insures the
			 * validity of the socket pointer when we perform the
			 * unlock of the head above
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
				so->so_incqlen--;
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_incomp but !SS_INCOMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_comp but !SS_COMP",
				    __func__, sp);
			}

			if (persocklock) {
				socket_unlock(sp, 1);
			}
		}

		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty\n", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty\n", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (persocklock) {
			socket_lock(so, 0);
			so_release_accept_list(so);
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error) {
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			lck_mtx_t *mutex_held;

			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO)) {
				goto drop;
			}
			if (so->so_proto->pr_getlock != NULL) {
				mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
			} else {
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			}
			while (so->so_state & SS_ISCONNECTED) {
				ts.tv_sec = (so->so_linger / 100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the time fires,
					 * don't report an error
					 */
					if (error == EWOULDBLOCK) {
						error = 0;
					}
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0) {
			error = error2;
		}
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if ((so->so_flags & SOF_KNOTE) != 0) {
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
	}

	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
	evsofree(so);

	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return error;
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * if the FD is going away, but socket is
		 * retained in kernel remove its reference
		 */
		so->so_usecount--;
		if (so->so_usecount < 2) {
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
		}
	}
	socket_unlock(so, 1);
	return error;
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return error;
		}
	}
	return 0;
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return soacceptlock(so, nam, 1);
}
1c79356b
A
1594
1595int
d190cdc3 1596soacceptfilter(struct socket *so, struct socket *head)
2d21ac55
A
1597{
1598 struct sockaddr *local = NULL, *remote = NULL;
6d2010ae 1599 int error = 0;
2d21ac55
A
1600
1601 /*
39236c6e
A
1602 * Hold the lock even if this socket has not been made visible
1603 * to the filter(s). For sockets with global locks, this protects
1604 * against the head or peer going away
2d21ac55 1605 */
1606 socket_lock(so, 1);
1607 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1608 sogetaddr_locked(so, &local, 0) != 0) {
d190cdc3 1609 so->so_state &= ~SS_NOFDREF;
b0d623f7 1610 socket_unlock(so, 1);
1611 soclose(so);
1612 /* Out of resources; try it again next time */
1613 error = ECONNABORTED;
1614 goto done;
1615 }
1616
6d2010ae 1617 error = sflt_accept(head, so, local, remote);
1618
1619 /*
1620 * If we get EJUSTRETURN from one of the filters, mark this socket
1621 * as inactive and return it anyway. This newly accepted socket
1622 * will be disconnected later before we hand it off to the caller.
1623 */
1624 if (error == EJUSTRETURN) {
1625 error = 0;
1626 (void) sosetdefunct(current_proc(), so,
1627 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1628 }
1629
1630 if (error != 0) {
1631 /*
 1632	 * This may seem like a duplication of the above error
 1633	 * handling path where we return ECONNABORTED, except
1634 * the following is done while holding the lock since
1635 * the socket has been exposed to the filter(s) earlier.
1636 */
5ba3f43e 1637 so->so_state &= ~SS_NOFDREF;
1638 socket_unlock(so, 1);
1639 soclose(so);
1640 /* Propagate socket filter's error code to the caller */
1641 } else {
1642 socket_unlock(so, 1);
1643 }
1644done:
1645 /* Callee checks for NULL pointer */
1646 sock_freeaddr(remote);
1647 sock_freeaddr(local);
0a7de745 1648 return error;
2d21ac55 1649}
1c79356b 1650
1651/*
1652 * Returns: 0 Success
1653 * EOPNOTSUPP Operation not supported on socket
1654 * EISCONN Socket is connected
1655 * <pru_connect>:EADDRNOTAVAIL Address not available.
1656 * <pru_connect>:EINVAL Invalid argument
1657 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1658 * <pru_connect>:EACCES Permission denied
1659 * <pru_connect>:EADDRINUSE Address in use
1660 * <pru_connect>:EAGAIN Resource unavailable, try again
1661 * <pru_connect>:EPERM Operation not permitted
1662 * <sf_connect_out>:??? [anything a filter writer might set]
1663 */
1664int
1665soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1c79356b 1666{
1667 int error;
1668 struct proc *p = current_proc();
1c79356b 1669
0a7de745 1670 if (dolock) {
2d21ac55 1671 socket_lock(so, 1);
0a7de745 1672 }
1673
1674 so_update_last_owner_locked(so, p);
1675 so_update_policy(so);
1676
1677#if NECP
1678 so_update_necp_policy(so, NULL, nam);
1679#endif /* NECP */
1680
1681 /*
1682 * If this is a listening socket or if this is a previously-accepted
1683 * socket that has been marked as inactive, reject the connect request.
1684 */
1685 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1686 error = EOPNOTSUPP;
1687 if (so->so_flags & SOF_DEFUNCT) {
39037602 1688 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
39236c6e 1689 "(%d)\n", __func__, proc_pid(p),
39037602 1690 proc_best_name(p),
3e170ce0 1691 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1692 SOCK_DOM(so), SOCK_TYPE(so), error);
6d2010ae 1693 }
0a7de745 1694 if (dolock) {
2d21ac55 1695 socket_unlock(so, 1);
1696 }
1697 return error;
91447636 1698 }
2d21ac55 1699
39236c6e 1700 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
0a7de745 1701 if (dolock) {
2d21ac55 1702 socket_unlock(so, 1);
1703 }
1704 return EPERM;
1705 }
1706
1707 /*
1708 * If protocol is connection-based, can only connect once.
1709 * Otherwise, if connected, try to disconnect first.
1710 * This allows user to disconnect by connecting to, e.g.,
1711 * a null address.
1712 */
0a7de745 1713 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1c79356b 1714 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
2d21ac55 1715 (error = sodisconnectlocked(so)))) {
1c79356b 1716 error = EISCONN;
2d21ac55 1717 } else {
1718 /*
1719 * Run connect filter before calling protocol:
1720 * - non-blocking connect returns before completion;
1721 */
6d2010ae 1722 error = sflt_connectout(so, nam);
39236c6e 1723 if (error != 0) {
0a7de745 1724 if (error == EJUSTRETURN) {
91447636 1725 error = 0;
0a7de745 1726 }
6d2010ae 1727 } else {
1728 error = (*so->so_proto->pr_usrreqs->pru_connect)
1729 (so, nam, p);
1730 if (error != 0) {
1731 so->so_state &= ~SS_ISCONNECTING;
1732 }
91447636 1733 }
1c79356b 1734 }
0a7de745 1735 if (dolock) {
2d21ac55 1736 socket_unlock(so, 1);
1737 }
1738 return error;
1739}
1740
91447636 1741int
2d21ac55 1742soconnect(struct socket *so, struct sockaddr *nam)
91447636 1743{
0a7de745 1744 return soconnectlock(so, nam, 1);
1745}
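
/*
 * Illustrative sketch (assumes the <sys/kpi_socket.h> KPI): in-kernel
 * callers reach soconnectlock() via sock_connect(). Passing MSG_DONTWAIT
 * makes the call return EINPROGRESS instead of sleeping, matching the
 * non-blocking connect note in the filter comment above.
 */
static errno_t
example_nonblocking_connect(socket_t so, const struct sockaddr *to)
{
	errno_t err = sock_connect(so, to, MSG_DONTWAIT);

	if (err == EINPROGRESS) {
		err = 0;        /* handshake completes asynchronously */
	}
	return err;
}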
1746
1747/*
1748 * Returns: 0 Success
1749 * <pru_connect2>:EINVAL[AF_UNIX]
1750 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1751 * <pru_connect2>:??? [other protocol families]
1752 *
1753 * Notes: <pru_connect2> is not supported by [TCP].
1754 */
1c79356b 1755int
2d21ac55 1756soconnect2(struct socket *so1, struct socket *so2)
1c79356b 1757{
1c79356b 1758 int error;
91447636 1759
0c530ab8 1760 socket_lock(so1, 1);
0a7de745 1761 if (so2->so_proto->pr_lock) {
0c530ab8 1762 socket_lock(so2, 1);
0a7de745 1763 }
1764
1765 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
2d21ac55 1766
0c530ab8 1767 socket_unlock(so1, 1);
0a7de745 1768 if (so2->so_proto->pr_lock) {
0c530ab8 1769 socket_unlock(so2, 1);
1770 }
1771 return error;
1772}
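
/*
 * Usage note (sketch): soconnect2() is the path by which socketpair(2)
 * wires two freshly created AF_UNIX sockets together; from user space
 * the pair below ends up joined via pru_connect2:
 *
 *	int fds[2];
 *	(void) socketpair(AF_UNIX, SOCK_STREAM, 0, fds);
 */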
1773
39236c6e 1774int
1775soconnectxlocked(struct socket *so, struct sockaddr *src,
1776 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1777 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1778 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1779{
1780 int error;
1781
1782 so_update_last_owner_locked(so, p);
1783 so_update_policy(so);
3e170ce0 1784
1785 /*
1786 * If this is a listening socket or if this is a previously-accepted
1787 * socket that has been marked as inactive, reject the connect request.
1788 */
1789 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1790 error = EOPNOTSUPP;
1791 if (so->so_flags & SOF_DEFUNCT) {
39037602 1792 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
39236c6e 1793 "(%d)\n", __func__, proc_pid(p),
39037602 1794 proc_best_name(p),
3e170ce0 1795 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1796 SOCK_DOM(so), SOCK_TYPE(so), error);
39236c6e 1797 }
0a7de745 1798 return error;
1799 }
1800
1801 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1802 return EPERM;
1803 }
1804
1805 /*
1806 * If protocol is connection-based, can only connect once
1807 * unless PR_MULTICONN is set. Otherwise, if connected,
1808 * try to disconnect first. This allows user to disconnect
1809 * by connecting to, e.g., a null address.
1810 */
0a7de745 1811 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1812 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1813 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1814 (error = sodisconnectlocked(so)) != 0)) {
1815 error = EISCONN;
1816 } else {
1817 /*
1818 * Run connect filter before calling protocol:
1819 * - non-blocking connect returns before completion;
1820 */
813fb2f6 1821 error = sflt_connectout(so, dst);
39236c6e 1822 if (error != 0) {
1823 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1824 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
0a7de745 1825 if (error == EJUSTRETURN) {
39236c6e 1826 error = 0;
0a7de745 1827 }
1828 } else {
1829 error = (*so->so_proto->pr_usrreqs->pru_connectx)
813fb2f6 1830 (so, src, dst, p, ifscope, aid, pcid,
3e170ce0 1831 flags, arg, arglen, auio, bytes_written);
1832 if (error != 0) {
1833 so->so_state &= ~SS_ISCONNECTING;
1834 }
1835 }
1836 }
1837
0a7de745 1838 return error;
1839}
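
/*
 * Illustrative sketch: auio/bytes_written carry the optional payload of
 * connectx(2), e.g. idempotent data for TCP Fast Open (user-space names
 * per Darwin's <sys/socket.h>; the variables are hypothetical):
 *
 *	sa_endpoints_t sae = {
 *		.sae_dstaddr = addr, .sae_dstaddrlen = addrlen
 *	};
 *	struct iovec iov = { .iov_base = req, .iov_len = reqlen };
 *	size_t sent;
 *	(void) connectx(fd, &sae, SAE_ASSOCID_ANY,
 *	    CONNECT_DATA_IDEMPOTENT, &iov, 1, &sent, NULL);
 */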
1840
1c79356b 1841int
2d21ac55 1842sodisconnectlocked(struct socket *so)
1c79356b 1843{
1c79356b 1844 int error;
1845
1846 if ((so->so_state & SS_ISCONNECTED) == 0) {
1847 error = ENOTCONN;
1848 goto bad;
1849 }
1850 if (so->so_state & SS_ISDISCONNECTING) {
1851 error = EALREADY;
1852 goto bad;
1853 }
2d21ac55 1854
1c79356b 1855 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
0a7de745 1856 if (error == 0) {
91447636 1857 sflt_notify(so, sock_evt_disconnected, NULL);
0a7de745 1858 }
39236c6e 1859
1c79356b 1860bad:
0a7de745 1861 return error;
1c79356b 1862}
1863
1864/* Locking version */
91447636 1865int
2d21ac55 1866sodisconnect(struct socket *so)
91447636 1867{
2d21ac55 1868 int error;
1869
1870 socket_lock(so, 1);
1871 error = sodisconnectlocked(so);
1872 socket_unlock(so, 1);
0a7de745 1873 return error;
91447636 1874}
1c79356b 1875
39236c6e 1876int
3e170ce0 1877sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1878{
1879 int error;
1880
1881 /*
1882 * Call the protocol disconnectx handler; let it handle all
1883 * matters related to the connection state of this session.
1884 */
1885 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1886 if (error == 0) {
1887 /*
1888 * The event applies only for the session, not for
1889 * the disconnection of individual subflows.
1890 */
0a7de745 1891 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
39236c6e 1892 sflt_notify(so, sock_evt_disconnected, NULL);
0a7de745 1893 }
39236c6e 1894 }
0a7de745 1895 return error;
1896}
1897
1898int
3e170ce0 1899sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1900{
1901 int error;
1902
1903 socket_lock(so, 1);
1904 error = sodisconnectxlocked(so, aid, cid);
1905 socket_unlock(so, 1);
0a7de745 1906 return error;
1907}
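
/*
 * Note (sketch): the (aid, cid) pair names a connectx(2) association and
 * connection; passing SAE_ASSOCID_ANY with SAE_CONNID_ANY (or
 * SAE_CONNID_ALL) is the conventional way to tear down every session on
 * the socket.
 */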
1908
0a7de745 1909#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1910
1911/*
1912 * sosendcheck will lock the socket buffer if it isn't locked and
1913 * verify that there is space for the data being inserted.
1914 *
1915 * Returns: 0 Success
1916 * EPIPE
1917 * sblock:EWOULDBLOCK
1918 * sblock:EINTR
1919 * sbwait:EBADF
1920 * sbwait:EINTR
1921 * [so_error]:???
91447636 1922 */
1923int
1924sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1925 int32_t clen, int32_t atomic, int flags, int *sblocked,
1926 struct mbuf *control)
91447636 1927{
0a7de745 1928 int error = 0;
b0d623f7 1929 int32_t space;
0a7de745 1930 int assumelock = 0;
1931
1932restart:
1933 if (*sblocked == 0) {
3a60a9f5 1934 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1935 so->so_send_filt_thread != 0 &&
1936 so->so_send_filt_thread == current_thread()) {
1937 /*
1938 * We're being called recursively from a filter,
1939 * allow this to continue. Radar 4150520.
1940 * Don't set sblocked because we don't want
1941 * to perform an unlock later.
1942 */
1943 assumelock = 1;
2d21ac55 1944 } else {
1945 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1946 if (error) {
0a7de745 1947 if (so->so_flags & SOF_DEFUNCT) {
6d2010ae 1948 goto defunct;
1949 }
1950 return error;
1951 }
1952 *sblocked = 1;
1953 }
91447636 1954 }
1955
1956 /*
1957 * If a send attempt is made on a socket that has been marked
1958 * as inactive (disconnected), reject the request.
2d21ac55 1959 */
1960 if (so->so_flags & SOF_DEFUNCT) {
1961defunct:
1962 error = EPIPE;
1963 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1964 __func__, proc_selfpid(), proc_best_name(current_proc()),
3e170ce0 1965 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1966 SOCK_DOM(so), SOCK_TYPE(so), error);
0a7de745 1967 return error;
6d2010ae 1968 }
2d21ac55 1969
1970 if (so->so_state & SS_CANTSENDMORE) {
1971#if CONTENT_FILTER
1972 /*
 1973	 * Can re-inject data on half-closed connections
1974 */
1975 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1976 so->so_snd.sb_cfil_thread == current_thread() &&
1977 cfil_sock_data_pending(&so->so_snd) != 0) {
fe8ab488 1978 CFIL_LOG(LOG_INFO,
1979 "so %llx ignore SS_CANTSENDMORE",
1980 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1981 } else
fe8ab488 1982#endif /* CONTENT_FILTER */
0a7de745 1983 return EPIPE;
fe8ab488 1984 }
1985 if (so->so_error) {
1986 error = so->so_error;
1987 so->so_error = 0;
0a7de745 1988 return error;
91447636 1989 }
2d21ac55 1990
91447636 1991 if ((so->so_state & SS_ISCONNECTED) == 0) {
2d21ac55 1992 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
fe8ab488 1993 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
3e170ce0 1994 (resid != 0 || clen == 0) &&
1995 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1996 return ENOTCONN;
1997 }
cb323159 1998 } else if (addr == 0) {
1999 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2000 ENOTCONN : EDESTADDRREQ;
2d21ac55 2001 }
91447636 2002 }
3e170ce0 2003
0a7de745 2004 if (so->so_flags & SOF_ENABLE_MSGS) {
39236c6e 2005 space = msgq_sbspace(so, control);
0a7de745 2006 } else {
39236c6e 2007 space = sbspace(&so->so_snd);
0a7de745 2008 }
39236c6e 2009
0a7de745 2010 if (flags & MSG_OOB) {
91447636 2011 space += 1024;
0a7de745 2012 }
91447636 2013 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2014 clen > so->so_snd.sb_hiwat) {
2015 return EMSGSIZE;
2016 }
39236c6e 2017
316670eb 2018 if ((space < resid + clen &&
2019 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2020 space < clen)) ||
316670eb 2021 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2022 /*
2023 * don't block the connectx call when there's more data
2024 * than can be copied.
2025 */
2026 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2027 if (space == 0) {
0a7de745 2028 return EWOULDBLOCK;
2029 }
2030 if (space < (int32_t)so->so_snd.sb_lowat) {
0a7de745 2031 return 0;
2032 }
2033 }
2034 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2035 assumelock) {
0a7de745 2036 return EWOULDBLOCK;
3a60a9f5 2037 }
0a7de745 2038 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
6d2010ae 2039 *sblocked = 0;
2040 error = sbwait(&so->so_snd);
2041 if (error) {
0a7de745 2042 if (so->so_flags & SOF_DEFUNCT) {
6d2010ae 2043 goto defunct;
2044 }
2045 return error;
2046 }
2047 goto restart;
2048 }
0a7de745 2049 return 0;
2050}
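
/*
 * Worked example of the flow-control predicate above (numbers are
 * illustrative): a SOCK_DGRAM socket with sb_hiwat = 9216 and
 * sb_cc = 8000 has space = sbspace(&so->so_snd) ~= 1216. An atomic
 * 2048-byte send then sees space < resid + clen with atomic set, so the
 * thread sleeps in sbwait() -- or gets EWOULDBLOCK right away when
 * SS_NBIO or MSG_NBIO is in effect.
 */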
2051
2052/*
2053 * Send on a socket.
2054 * If send must go all at once and message is larger than
2055 * send buffering, then hard error.
2056 * Lock against other senders.
2057 * If must go all at once and not enough room now, then
2058 * inform user that this would block and do nothing.
2059 * Otherwise, if nonblocking, send as much as possible.
2060 * The data to be sent is described by "uio" if nonzero,
2061 * otherwise by the mbuf chain "top" (which must be null
2062 * if uio is not). Data provided in mbuf chain must be small
2063 * enough to send all at once.
2064 *
2065 * Returns nonzero on error, timeout or signal; callers
2066 * must check for short counts if EINTR/ERESTART are returned.
2067 * Data and control buffers are freed on return.
2068 *
2069 * Returns: 0 Success
2070 * EOPNOTSUPP
2071 * EINVAL
2072 * ENOBUFS
2073 * uiomove:EFAULT
2074 * sosendcheck:EPIPE
2075 * sosendcheck:EWOULDBLOCK
2076 * sosendcheck:EINTR
2077 * sosendcheck:EBADF
2078 * sosendcheck:EINTR
2079 * sosendcheck:??? [value from so_error]
2080 * <pru_send>:ECONNRESET[TCP]
2081 * <pru_send>:EINVAL[TCP]
2082 * <pru_send>:ENOBUFS[TCP]
2083 * <pru_send>:EADDRINUSE[TCP]
2084 * <pru_send>:EADDRNOTAVAIL[TCP]
2085 * <pru_send>:EAFNOSUPPORT[TCP]
2086 * <pru_send>:EACCES[TCP]
2087 * <pru_send>:EAGAIN[TCP]
2088 * <pru_send>:EPERM[TCP]
2089 * <pru_send>:EMSGSIZE[TCP]
2090 * <pru_send>:EHOSTUNREACH[TCP]
2091 * <pru_send>:ENETUNREACH[TCP]
2092 * <pru_send>:ENETDOWN[TCP]
2093 * <pru_send>:ENOMEM[TCP]
2094 * <pru_send>:ENOBUFS[TCP]
2095 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2096 * <pru_send>:EINVAL[AF_UNIX]
2097 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2098 * <pru_send>:EPIPE[AF_UNIX]
2099 * <pru_send>:ENOTCONN[AF_UNIX]
2100 * <pru_send>:EISCONN[AF_UNIX]
2101 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2102 * <sf_data_out>:??? [whatever a filter author chooses]
2103 *
2104 * Notes: Other <pru_send> returns depend on the protocol family; all
2105 * <sf_data_out> returns depend on what the filter author causes
2106 * their filter to return.
2107 */
2108int
2109sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2110 struct mbuf *top, struct mbuf *control, int flags)
2111{
2112 struct mbuf **mp;
39236c6e 2113 struct mbuf *m, *freelist = NULL;
3e170ce0 2114 user_ssize_t space, len, resid, orig_resid;
91447636 2115 int clen = 0, error, dontroute, mlen, sendflags;
1c79356b 2116 int atomic = sosendallatonce(so) || top;
91447636 2117 int sblocked = 0;
1c79356b 2118 struct proc *p = current_proc();
39236c6e 2119 struct mbuf *control_copy = NULL;
2120 uint16_t headroom = 0;
2121 boolean_t en_tracing = FALSE;
1c79356b 2122
0a7de745 2123 if (uio != NULL) {
91447636 2124 resid = uio_resid(uio);
0a7de745 2125 } else {
1c79356b 2126 resid = top->m_pkthdr.len;
0a7de745 2127 }
39236c6e 2128
2129 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2130 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1c79356b 2131
91447636 2132 socket_lock(so, 1);
fe8ab488 2133
2134 /*
 2135	 * trace if tracing & network (vs. unix) sockets &
 2136	 * non-loopback
2137 */
2138 if (ENTR_SHOULDTRACE &&
2139 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2140 struct inpcb *inp = sotoinpcb(so);
2141 if (inp->inp_last_outifp != NULL &&
2142 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2143 en_tracing = TRUE;
2144 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2145 VM_KERNEL_ADDRPERM(so),
2146 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2147 (int64_t)resid);
2148 orig_resid = resid;
2149 }
2150 }
2151
2152 /*
2153 * Re-injection should not affect process accounting
2154 */
2155 if ((flags & MSG_SKIPCFIL) == 0) {
2156 so_update_last_owner_locked(so, p);
2157 so_update_policy(so);
2158
fe8ab488 2159#if NECP
3e170ce0 2160 so_update_necp_policy(so, NULL, addr);
2161#endif /* NECP */
2162 }
3e170ce0 2163
2164 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2165 error = EOPNOTSUPP;
5ba3f43e 2166 goto out_locked;
2d21ac55 2167 }
91447636 2168
2169 /*
2170 * In theory resid should be unsigned.
2171 * However, space must be signed, as it might be less than 0
2172 * if we over-committed, and we must use a signed comparison
2173 * of space and resid. On the other hand, a negative resid
2174 * causes us to loop sending 0-length segments to the protocol.
2175 *
2176 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2177 * But it will be used by sockets doing message delivery.
2178 *
fe8ab488 2179 * Note: We limit resid to be a positive int value as we use
39236c6e 2180 * imin() to set bytes_to_copy -- radr://14558484
1c79356b 2181 */
fe8ab488 2182 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
39236c6e 2183 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1c79356b 2184 error = EINVAL;
5ba3f43e 2185 goto out_locked;
2186 }
2187
2188 dontroute = (flags & MSG_DONTROUTE) &&
2189 (so->so_options & SO_DONTROUTE) == 0 &&
1c79356b 2190 (so->so_proto->pr_flags & PR_ATOMIC);
b0d623f7 2191 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
39236c6e 2192
0a7de745 2193 if (control != NULL) {
1c79356b 2194 clen = control->m_len;
0a7de745 2195 }
1c79356b 2196
0a7de745 2197 if (soreserveheadroom != 0) {
3e170ce0 2198 headroom = so->so_pktheadroom;
0a7de745 2199 }
3e170ce0 2200
1c79356b 2201 do {
2d21ac55 2202 error = sosendcheck(so, addr, resid, clen, atomic, flags,
39236c6e 2203 &sblocked, control);
0a7de745 2204 if (error) {
5ba3f43e 2205 goto out_locked;
0a7de745 2206 }
39236c6e 2207
1c79356b 2208 mp = &top;
0a7de745 2209 if (so->so_flags & SOF_ENABLE_MSGS) {
39236c6e 2210 space = msgq_sbspace(so, control);
0a7de745 2211 } else {
39236c6e 2212 space = sbspace(&so->so_snd) - clen;
0a7de745 2213 }
39236c6e 2214 space += ((flags & MSG_OOB) ? 1024 : 0);
fa4905b1 2215
1c79356b 2216 do {
2d21ac55 2217 if (uio == NULL) {
2218 /*
2219 * Data is prepackaged in "top".
2220 */
2221 resid = 0;
0a7de745 2222 if (flags & MSG_EOR) {
1c79356b 2223 top->m_flags |= M_EOR;
0a7de745 2224 }
91447636 2225 } else {
2226 int chainlength;
2227 int bytes_to_copy;
2228 boolean_t jumbocl;
fe8ab488 2229 boolean_t bigcl;
3e170ce0 2230 int bytes_to_alloc;
2d21ac55 2231
b0d623f7 2232 bytes_to_copy = imin(resid, space);
2d21ac55 2233
3e170ce0 2234 bytes_to_alloc = bytes_to_copy;
0a7de745 2235 if (top == NULL) {
3e170ce0 2236 bytes_to_alloc += headroom;
0a7de745 2237 }
3e170ce0 2238
0a7de745 2239 if (sosendminchain > 0) {
91447636 2240 chainlength = 0;
0a7de745 2241 } else {
91447636 2242 chainlength = sosendmaxchain;
0a7de745 2243 }
2d21ac55 2244
fe8ab488 2245 /*
2246 * Use big 4 KB cluster when the outgoing interface
2247 * does not prefer 2 KB clusters
fe8ab488 2248 */
3e170ce0 2249 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
fe8ab488 2250 sosendbigcl_ignore_capab;
3e170ce0 2251
2252 /*
2253 * Attempt to use larger than system page-size
2254 * clusters for large writes only if there is
2255 * a jumbo cluster pool and if the socket is
2256 * marked accordingly.
2257 */
2258 jumbocl = sosendjcl && njcl > 0 &&
2259 ((so->so_flags & SOF_MULTIPAGES) ||
fe8ab488
A
2260 sosendjcl_ignore_capab) &&
2261 bigcl;
2d21ac55 2262
91447636 2263 socket_unlock(so, 0);
2d21ac55 2264
2265 do {
2266 int num_needed;
39236c6e 2267 int hdrs_needed = (top == NULL) ? 1 : 0;
2d21ac55 2268
91447636 2269 /*
 2270				 * Try to maintain a local cache of mbuf
 2271				 * clusters needed to complete this
 2272				 * write. The list is further limited to
 2273				 * the number that are currently needed
 2274				 * to fill the socket. This mechanism
 2275				 * allows a large number of mbufs/
 2276				 * clusters to be grabbed under a single
 2277				 * mbuf lock... if we can't get any
 2278				 * clusters, then fall back to trying
 2279				 * for mbufs. If we fail early (or
 2280				 * miscalculate the number needed), make
 2281				 * sure to release any clusters we
 2282				 * haven't yet consumed.
91447636 2283 */
2d21ac55 2284 if (freelist == NULL &&
3e170ce0 2285 bytes_to_alloc > MBIGCLBYTES &&
6d2010ae 2286 jumbocl) {
2d21ac55 2287 num_needed =
3e170ce0 2288 bytes_to_alloc / M16KCLBYTES;
2d21ac55 2289
3e170ce0 2290 if ((bytes_to_alloc -
2d21ac55 2291 (num_needed * M16KCLBYTES))
0a7de745 2292 >= MINCLSIZE) {
2d21ac55 2293 num_needed++;
0a7de745 2294 }
91447636 2295
2296 freelist =
2297 m_getpackets_internal(
2298 (unsigned int *)&num_needed,
2299 hdrs_needed, M_WAIT, 0,
2300 M16KCLBYTES);
2301 /*
2302 * Fall back to 4K cluster size
2303 * if allocation failed
2304 */
2305 }
2306
2307 if (freelist == NULL &&
3e170ce0 2308 bytes_to_alloc > MCLBYTES &&
fe8ab488 2309 bigcl) {
2d21ac55 2310 num_needed =
3e170ce0 2311 bytes_to_alloc / MBIGCLBYTES;
2d21ac55 2312
3e170ce0 2313 if ((bytes_to_alloc -
6d2010ae 2314 (num_needed * MBIGCLBYTES)) >=
0a7de745 2315 MINCLSIZE) {
91447636 2316 num_needed++;
0a7de745 2317 }
2318
2319 freelist =
2320 m_getpackets_internal(
2321 (unsigned int *)&num_needed,
2322 hdrs_needed, M_WAIT, 0,
2323 MBIGCLBYTES);
2324 /*
2325 * Fall back to cluster size
2326 * if allocation failed
2327 */
91447636 2328 }
2d21ac55 2329
2330 /*
2331 * Allocate a cluster as we want to
 2332				 * avoid splitting the data in more
 2333				 * than one segment; using MINCLSIZE
2334 * would lead us to allocate two mbufs
2335 */
2336 if (soreserveheadroom != 0 &&
2337 freelist == NULL &&
2338 ((top == NULL &&
2339 bytes_to_alloc > _MHLEN) ||
2340 bytes_to_alloc > _MLEN)) {
2341 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2342 MCLBYTES;
2343 freelist =
2344 m_getpackets_internal(
2345 (unsigned int *)&num_needed,
2346 hdrs_needed, M_WAIT, 0,
2347 MCLBYTES);
2348 /*
2349 * Fall back to a single mbuf
2350 * if allocation failed
2351 */
2352 } else if (freelist == NULL &&
2353 bytes_to_alloc > MINCLSIZE) {
2d21ac55 2354 num_needed =
3e170ce0 2355 bytes_to_alloc / MCLBYTES;
2d21ac55 2356
3e170ce0 2357 if ((bytes_to_alloc -
2d21ac55 2358 (num_needed * MCLBYTES)) >=
0a7de745 2359 MINCLSIZE) {
91447636 2360 num_needed++;
0a7de745 2361 }
2362
2363 freelist =
2364 m_getpackets_internal(
2365 (unsigned int *)&num_needed,
2366 hdrs_needed, M_WAIT, 0,
2367 MCLBYTES);
2368 /*
2369 * Fall back to a single mbuf
2370 * if allocation failed
2371 */
91447636 2372 }
2373 /*
2374 * For datagram protocols, leave
2375 * headroom for protocol headers
2376 * in the first cluster of the chain
2377 */
2378 if (freelist != NULL && atomic &&
2379 top == NULL && headroom > 0) {
2380 freelist->m_data += headroom;
2381 }
39037602 2382
2383 /*
2384 * Fall back to regular mbufs without
2385 * reserving the socket headroom
2386 */
91447636 2387 if (freelist == NULL) {
0a7de745 2388 if (top == NULL) {
2389 MGETHDR(freelist,
2390 M_WAIT, MT_DATA);
0a7de745 2391 } else {
2392 MGET(freelist,
2393 M_WAIT, MT_DATA);
0a7de745 2394 }
2395
2396 if (freelist == NULL) {
2397 error = ENOBUFS;
2398 socket_lock(so, 0);
5ba3f43e 2399 goto out_locked;
2400 }
2401 /*
2402 * For datagram protocols,
2403 * leave room for protocol
2404 * headers in first mbuf.
91447636 2405 */
39236c6e 2406 if (atomic && top == NULL &&
2407 bytes_to_copy < MHLEN) {
2408 MH_ALIGN(freelist,
2409 bytes_to_copy);
2410 }
2411 }
2412 m = freelist;
2413 freelist = m->m_next;
2414 m->m_next = NULL;
2d21ac55 2415
0a7de745 2416 if ((m->m_flags & M_EXT)) {
3e170ce0 2417 mlen = m->m_ext.ext_size -
d9a64523 2418 M_LEADINGSPACE(m);
0a7de745 2419 } else if ((m->m_flags & M_PKTHDR)) {
2d21ac55 2420 mlen =
d9a64523 2421 MHLEN - M_LEADINGSPACE(m);
0a7de745 2422 } else {
d9a64523 2423 mlen = MLEN - M_LEADINGSPACE(m);
0a7de745 2424 }
b0d623f7 2425 len = imin(mlen, bytes_to_copy);
2426
2427 chainlength += len;
2d21ac55 2428
91447636 2429 space -= len;
fa4905b1 2430
2d21ac55 2431 error = uiomove(mtod(m, caddr_t),
b0d623f7 2432 len, uio);
2d21ac55 2433
91447636 2434 resid = uio_resid(uio);
2d21ac55 2435
2436 m->m_len = len;
2437 *mp = m;
2438 top->m_pkthdr.len += len;
0a7de745 2439 if (error) {
91447636 2440 break;
0a7de745 2441 }
2442 mp = &m->m_next;
2443 if (resid <= 0) {
0a7de745 2444 if (flags & MSG_EOR) {
91447636 2445 top->m_flags |= M_EOR;
0a7de745 2446 }
2447 break;
2448 }
2449 bytes_to_copy = min(resid, space);
2450 } while (space > 0 &&
2451 (chainlength < sosendmaxchain || atomic ||
2452 resid < MINCLSIZE));
2453
91447636 2454 socket_lock(so, 0);
2d21ac55 2455
0a7de745 2456 if (error) {
5ba3f43e 2457 goto out_locked;
0a7de745 2458 }
91447636 2459 }
2d21ac55 2460
0a7de745 2461 if (dontroute) {
2d21ac55 2462 so->so_options |= SO_DONTROUTE;
0a7de745 2463 }
2d21ac55 2464
2465 /*
2466 * Compute flags here, for pru_send and NKEs
2467 *
2468 * If the user set MSG_EOF, the protocol
2469 * understands this flag and nothing left to
2470 * send then use PRU_SEND_EOF instead of PRU_SEND.
2471 */
2d21ac55 2472 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2d21ac55 2473 ((flags & MSG_EOF) &&
2474 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2475 (resid <= 0)) ? PRUS_EOF :
2476 /* If there is more to send set PRUS_MORETOCOME */
2477 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2478
2479 if ((flags & MSG_SKIPCFIL) == 0) {
2480 /*
2481 * Socket filter processing
2482 */
2483 error = sflt_data_out(so, addr, &top,
2484 &control, (sendflags & MSG_OOB) ?
2485 sock_data_filt_flag_oob : 0);
2486 if (error) {
2487 if (error == EJUSTRETURN) {
2488 error = 0;
2489 clen = 0;
2490 control = NULL;
2491 top = NULL;
2492 }
5ba3f43e 2493 goto out_locked;
91447636 2494 }
2495#if CONTENT_FILTER
2496 /*
2497 * Content filter processing
2498 */
2499 error = cfil_sock_data_out(so, addr, top,
d9a64523 2500 control, sendflags);
2501 if (error) {
2502 if (error == EJUSTRETURN) {
2503 error = 0;
2504 clen = 0;
2505 control = NULL;
2506 top = NULL;
0a7de745 2507 }
5ba3f43e 2508 goto out_locked;
2509 }
2510#endif /* CONTENT_FILTER */
1c79356b 2511 }
2512 if (so->so_flags & SOF_ENABLE_MSGS) {
2513 /*
2514 * Make a copy of control mbuf,
2515 * so that msg priority can be
2516 * passed to subsequent mbufs.
2517 */
2518 control_copy = m_dup(control, M_NOWAIT);
2519 }
6d2010ae 2520 error = (*so->so_proto->pr_usrreqs->pru_send)
2521 (so, sendflags, top, addr, control, p);
2522
0a7de745 2523 if (dontroute) {
2d21ac55 2524 so->so_options &= ~SO_DONTROUTE;
0a7de745 2525 }
2526
2527 clen = 0;
2528 control = control_copy;
2529 control_copy = NULL;
2530 top = NULL;
2d21ac55 2531 mp = &top;
0a7de745 2532 if (error) {
5ba3f43e 2533 goto out_locked;
0a7de745 2534 }
2535 } while (resid && space > 0);
2536 } while (resid);
2537
5ba3f43e 2538out_locked:
2539 if (sblocked) {
2540 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2541 } else {
3a60a9f5 2542 socket_unlock(so, 1);
2543 }
2544 if (top != NULL) {
1c79356b 2545 m_freem(top);
2546 }
2547 if (control != NULL) {
1c79356b 2548 m_freem(control);
2549 }
2550 if (freelist != NULL) {
2d21ac55 2551 m_freem_list(freelist);
2552 }
2553 if (control_copy != NULL) {
39236c6e 2554 m_freem(control_copy);
0a7de745 2555 }
1c79356b 2556
5ba3f43e 2557 soclearfastopen(so);
2558
2559 if (en_tracing) {
2560 /* resid passed here is the bytes left in uio */
2561 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2562 VM_KERNEL_ADDRPERM(so),
2563 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2564 (int64_t)(orig_resid - resid));
2565 }
2566 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2567 so->so_snd.sb_cc, space, error);
1c79356b 2568
0a7de745 2569 return error;
2570}
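
/*
 * Illustrative sketch (assumes the <sys/kpi_socket.h> KPI): in-kernel
 * clients drive sosend() through sock_send()/sock_sendmbuf(); a minimal
 * non-blocking datagram send looks like:
 */
static errno_t
example_send_datagram(socket_t so, void *buf, size_t len, size_t *sent)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };

	/* MSG_DONTWAIT exercises the EWOULDBLOCK paths in sosendcheck() */
	return sock_send(so, &msg, MSG_DONTWAIT, sent);
}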
2571
2572int
2573sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2574{
cb323159 2575 struct mbuf *m0 = NULL, *control_end = NULL;
2576
2577 socket_lock_assert_owned(so);
2578
2579 /*
 2580	 * top must point to the mbuf chain to be sent.
 2581	 * If control is not NULL, top must be a packet header
2582 */
2583 VERIFY(top != NULL &&
0a7de745 2584 (control == NULL || top->m_flags & M_PKTHDR));
2585
2586 /*
2587 * If control is not passed in, see if we can get it
2588 * from top.
2589 */
2590 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2591 // Locate start of control if present and start of data
2592 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2593 if (m0->m_flags & M_PKTHDR) {
2594 top = m0;
2595 break;
2596 } else if (m0->m_type == MT_CONTROL) {
2597 if (control == NULL) {
2598 // Found start of control
2599 control = m0;
2600 }
2601 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2602 // Found end of control
2603 control_end = m0;
2604 }
2605 }
2606 }
0a7de745 2607 if (control_end != NULL) {
d9a64523 2608 control_end->m_next = NULL;
0a7de745 2609 }
2610 }
2611
2612 int error = (*so->so_proto->pr_usrreqs->pru_send)
0a7de745 2613 (so, sendflags, top, addr, control, current_proc());
2614
2615 return error;
2616}
2617
2618/*
 2619 * Supported only for connected sockets (no address) without
 2620 * ancillary data (control mbuf), for atomic protocols
2621 */
fe8ab488 2622int
3e170ce0 2623sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2624{
2625 struct mbuf *m, *freelist = NULL;
2626 user_ssize_t len, resid;
2627 int error, dontroute, mlen;
2628 int atomic = sosendallatonce(so);
2629 int sblocked = 0;
2630 struct proc *p = current_proc();
2631 u_int uiofirst = 0;
2632 u_int uiolast = 0;
2633 struct mbuf *top = NULL;
2634 uint16_t headroom = 0;
2635 boolean_t bigcl;
2636
2637 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2638 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2639
2640 if (so->so_type != SOCK_DGRAM) {
2641 error = EINVAL;
2642 goto out;
2643 }
2644 if (atomic == 0) {
2645 error = EINVAL;
2646 goto out;
2647 }
2648 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2649 error = EPROTONOSUPPORT;
2650 goto out;
2651 }
2652 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2653 error = EINVAL;
2654 goto out;
2655 }
3e170ce0 2656 resid = uio_array_resid(uioarray, uiocnt);
2657
2658 /*
2659 * In theory resid should be unsigned.
2660 * However, space must be signed, as it might be less than 0
2661 * if we over-committed, and we must use a signed comparison
2662 * of space and resid. On the other hand, a negative resid
2663 * causes us to loop sending 0-length segments to the protocol.
2664 *
2665 * Note: We limit resid to be a positive int value as we use
2666 * imin() to set bytes_to_copy -- radr://14558484
2667 */
2668 if (resid < 0 || resid > INT_MAX) {
2669 error = EINVAL;
2670 goto out;
2671 }
2672
2673 socket_lock(so, 1);
2674 so_update_last_owner_locked(so, p);
2675 so_update_policy(so);
3e170ce0 2676
fe8ab488 2677#if NECP
3e170ce0 2678 so_update_necp_policy(so, NULL, NULL);
fe8ab488 2679#endif /* NECP */
3e170ce0 2680
2681 dontroute = (flags & MSG_DONTROUTE) &&
2682 (so->so_options & SO_DONTROUTE) == 0 &&
2683 (so->so_proto->pr_flags & PR_ATOMIC);
2684 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2685
2686 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2687 &sblocked, NULL);
0a7de745 2688 if (error) {
fe8ab488 2689 goto release;
0a7de745 2690 }
fe8ab488 2691
2692 /*
2693 * Use big 4 KB clusters when the outgoing interface does not prefer
2694 * 2 KB clusters
2695 */
2696 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2697
0a7de745 2698 if (soreserveheadroom != 0) {
3e170ce0 2699 headroom = so->so_pktheadroom;
0a7de745 2700 }
3e170ce0 2701
2702 do {
2703 int i;
2704 int num_needed = 0;
2705 int chainlength;
2706 size_t maxpktlen = 0;
2707 int bytes_to_alloc;
fe8ab488 2708
0a7de745 2709 if (sosendminchain > 0) {
3e170ce0 2710 chainlength = 0;
0a7de745 2711 } else {
3e170ce0 2712 chainlength = sosendmaxchain;
0a7de745 2713 }
fe8ab488 2714
3e170ce0 2715 socket_unlock(so, 0);
fe8ab488 2716
2717 /*
 2718		 * Find a set of uios that fits in a reasonable number
2719 * of mbuf packets
2720 */
2721 for (i = uiofirst; i < uiocnt; i++) {
2722 struct uio *auio = uioarray[i];
fe8ab488 2723
3e170ce0 2724 len = uio_resid(auio);
fe8ab488 2725
3e170ce0 2726 /* Do nothing for empty messages */
0a7de745 2727 if (len == 0) {
3e170ce0 2728 continue;
0a7de745 2729 }
fe8ab488 2730
2731 num_needed += 1;
2732 uiolast += 1;
fe8ab488 2733
0a7de745 2734 if (len > maxpktlen) {
3e170ce0 2735 maxpktlen = len;
0a7de745 2736 }
fe8ab488 2737
3e170ce0 2738 chainlength += len;
0a7de745 2739 if (chainlength > sosendmaxchain) {
fe8ab488 2740 break;
0a7de745 2741 }
2742 }
2743 /*
2744 * Nothing left to send
2745 */
2746 if (num_needed == 0) {
2747 socket_lock(so, 0);
2748 break;
2749 }
2750 /*
2751 * Allocate buffer large enough to include headroom space for
2752 * network and link header
39037602 2753 *
2754 */
2755 bytes_to_alloc = maxpktlen + headroom;
2756
2757 /*
2758 * Allocate a single contiguous buffer of the smallest available
2759 * size when possible
2760 */
2761 if (bytes_to_alloc > MCLBYTES &&
2762 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2763 freelist = m_getpackets_internal(
2764 (unsigned int *)&num_needed,
2765 num_needed, M_WAIT, 1,
2766 MBIGCLBYTES);
2767 } else if (bytes_to_alloc > _MHLEN &&
2768 bytes_to_alloc <= MCLBYTES) {
2769 freelist = m_getpackets_internal(
2770 (unsigned int *)&num_needed,
2771 num_needed, M_WAIT, 1,
2772 MCLBYTES);
3e170ce0 2773 } else {
fe8ab488 2774 freelist = m_allocpacket_internal(
2775 (unsigned int *)&num_needed,
2776 bytes_to_alloc, NULL, M_WAIT, 1, 0);
3e170ce0 2777 }
39037602 2778
2779 if (freelist == NULL) {
2780 socket_lock(so, 0);
2781 error = ENOMEM;
2782 goto release;
2783 }
2784 /*
2785 * Copy each uio of the set into its own mbuf packet
2786 */
2787 for (i = uiofirst, m = freelist;
2788 i < uiolast && m != NULL;
2789 i++) {
2790 int bytes_to_copy;
2791 struct mbuf *n;
2792 struct uio *auio = uioarray[i];
fe8ab488 2793
2794 bytes_to_copy = uio_resid(auio);
2795
2796 /* Do nothing for empty messages */
0a7de745 2797 if (bytes_to_copy == 0) {
3e170ce0 2798 continue;
0a7de745 2799 }
fe8ab488 2800 /*
2801 * Leave headroom for protocol headers
2802 * in the first mbuf of the chain
fe8ab488 2803 */
2804 m->m_data += headroom;
2805
2806 for (n = m; n != NULL; n = n->m_next) {
0a7de745 2807 if ((m->m_flags & M_EXT)) {
3e170ce0 2808 mlen = m->m_ext.ext_size -
d9a64523 2809 M_LEADINGSPACE(m);
0a7de745 2810 } else if ((m->m_flags & M_PKTHDR)) {
3e170ce0 2811 mlen =
d9a64523 2812 MHLEN - M_LEADINGSPACE(m);
0a7de745 2813 } else {
d9a64523 2814 mlen = MLEN - M_LEADINGSPACE(m);
0a7de745 2815 }
3e170ce0 2816 len = imin(mlen, bytes_to_copy);
fe8ab488 2817
2818 /*
2819 * Note: uiomove() decrements the iovec
2820 * length
2821 */
2822 error = uiomove(mtod(n, caddr_t),
2823 len, auio);
0a7de745 2824 if (error != 0) {
fe8ab488 2825 break;
0a7de745 2826 }
2827 n->m_len = len;
2828 m->m_pkthdr.len += len;
fe8ab488 2829
3e170ce0 2830 VERIFY(m->m_pkthdr.len <= maxpktlen);
fe8ab488 2831
2832 bytes_to_copy -= len;
2833 resid -= len;
2834 }
2835 if (m->m_pkthdr.len == 0) {
2836 printf(
2837 "%s:%d so %llx pkt %llx type %u len null\n",
2838 __func__, __LINE__,
2839 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2840 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2841 m->m_type);
3e170ce0 2842 }
0a7de745 2843 if (error != 0) {
3e170ce0 2844 break;
0a7de745 2845 }
3e170ce0 2846 m = m->m_nextpkt;
2847 }
2848
2849 socket_lock(so, 0);
2850
0a7de745 2851 if (error) {
3e170ce0 2852 goto release;
0a7de745 2853 }
2854 top = freelist;
2855 freelist = NULL;
2856
0a7de745 2857 if (dontroute) {
fe8ab488 2858 so->so_options |= SO_DONTROUTE;
0a7de745 2859 }
2860
2861 if ((flags & MSG_SKIPCFIL) == 0) {
2862 struct mbuf **prevnextp = NULL;
3e170ce0 2863
2864 for (i = uiofirst, m = top;
2865 i < uiolast && m != NULL;
2866 i++) {
2867 struct mbuf *nextpkt = m->m_nextpkt;
2868
2869 /*
2870 * Socket filter processing
2871 */
2872 error = sflt_data_out(so, NULL, &m,
2873 NULL, 0);
0a7de745 2874 if (error != 0 && error != EJUSTRETURN) {
fe8ab488 2875 goto release;
0a7de745 2876 }
3e170ce0 2877
2878#if CONTENT_FILTER
2879 if (error == 0) {
2880 /*
2881 * Content filter processing
2882 */
2883 error = cfil_sock_data_out(so, NULL, m,
2884 NULL, 0);
0a7de745 2885 if (error != 0 && error != EJUSTRETURN) {
fe8ab488 2886 goto release;
0a7de745 2887 }
2888 }
2889#endif /* CONTENT_FILTER */
2890 /*
2891 * Remove packet from the list when
2892 * swallowed by a filter
2893 */
2894 if (error == EJUSTRETURN) {
2895 error = 0;
0a7de745 2896 if (prevnextp != NULL) {
fe8ab488 2897 *prevnextp = nextpkt;
0a7de745 2898 } else {
fe8ab488 2899 top = nextpkt;
0a7de745 2900 }
2901 }
2902
fe8ab488 2903 m = nextpkt;
0a7de745 2904 if (m != NULL) {
fe8ab488 2905 prevnextp = &m->m_nextpkt;
0a7de745 2906 }
2907 }
2908 }
0a7de745 2909 if (top != NULL) {
fe8ab488 2910 error = (*so->so_proto->pr_usrreqs->pru_send_list)
3e170ce0 2911 (so, 0, top, NULL, NULL, p);
0a7de745 2912 }
fe8ab488 2913
0a7de745 2914 if (dontroute) {
fe8ab488 2915 so->so_options &= ~SO_DONTROUTE;
0a7de745 2916 }
fe8ab488 2917
2918 top = NULL;
2919 uiofirst = uiolast;
2920 } while (resid > 0 && error == 0);
2921release:
2922 if (sblocked) {
2923 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2924 } else {
fe8ab488 2925 socket_unlock(so, 1);
0a7de745 2926 }
fe8ab488 2927out:
0a7de745 2928 if (top != NULL) {
fe8ab488 2929 m_freem(top);
2930 }
2931 if (freelist != NULL) {
fe8ab488 2932 m_freem_list(freelist);
0a7de745 2933 }
2934
2935 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2936 so->so_snd.sb_cc, 0, error);
2937
0a7de745 2938 return error;
2939}
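
/*
 * Note (assumption): this batched variant serves the sendmsg_x(2)
 * private syscall for connected datagram sockets; each uio in the array
 * becomes one packet on the chain handed to pru_send_list.
 */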
2940
2941/*
2942 * May return ERESTART when packet is dropped by MAC policy check
2943 */
2944static int
2945soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2946 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2947{
2948 int error = 0;
2949 struct mbuf *m = *mp;
2950 struct mbuf *nextrecord = *nextrecordp;
2951
2952 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2953#if CONFIG_MACF_SOCKET_SUBSET
2954 /*
2955 * Call the MAC framework for policy checking if we're in
2956 * the user process context and the socket isn't connected.
2957 */
2958 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2959 struct mbuf *m0 = m;
2960 /*
2961 * Dequeue this record (temporarily) from the receive
2962 * list since we're about to drop the socket's lock
2963 * where a new record may arrive and be appended to
2964 * the list. Upon MAC policy failure, the record
2965 * will be freed. Otherwise, we'll add it back to
2966 * the head of the list. We cannot rely on SB_LOCK
2967 * because append operation uses the socket's lock.
2968 */
2969 do {
2970 m->m_nextpkt = NULL;
2971 sbfree(&so->so_rcv, m);
2972 m = m->m_next;
2973 } while (m != NULL);
2974 m = m0;
2975 so->so_rcv.sb_mb = nextrecord;
2976 SB_EMPTY_FIXUP(&so->so_rcv);
2977 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2978 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2979 socket_unlock(so, 0);
2980
2981 if (mac_socket_check_received(proc_ucred(p), so,
2982 mtod(m, struct sockaddr *)) != 0) {
2983 /*
2984 * MAC policy failure; free this record and
2985 * process the next record (or block until
2986 * one is available). We have adjusted sb_cc
2987 * and sb_mbcnt above so there is no need to
2988 * call sbfree() again.
2989 */
2990 m_freem(m);
2991 /*
2992 * Clear SB_LOCK but don't unlock the socket.
2993 * Process the next record or wait for one.
2994 */
2995 socket_lock(so, 0);
2996 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2997 error = ERESTART;
2998 goto done;
2999 }
3000 socket_lock(so, 0);
3001 /*
3002 * If the socket has been defunct'd, drop it.
3003 */
3004 if (so->so_flags & SOF_DEFUNCT) {
3005 m_freem(m);
3006 error = ENOTCONN;
3007 goto done;
3008 }
3009 /*
3010 * Re-adjust the socket receive list and re-enqueue
3011 * the record in front of any packets which may have
3012 * been appended while we dropped the lock.
3013 */
0a7de745 3014 for (m = m0; m->m_next != NULL; m = m->m_next) {
3e170ce0 3015 sballoc(&so->so_rcv, m);
0a7de745 3016 }
3017 sballoc(&so->so_rcv, m);
3018 if (so->so_rcv.sb_mb == NULL) {
3019 so->so_rcv.sb_lastrecord = m0;
3020 so->so_rcv.sb_mbtail = m;
3021 }
3022 m = m0;
3023 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3024 so->so_rcv.sb_mb = m;
3025 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3026 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3027 }
3028#endif /* CONFIG_MACF_SOCKET_SUBSET */
3029 if (psa != NULL) {
3030 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3031 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3032 error = EWOULDBLOCK;
3033 goto done;
3034 }
3035 }
3036 if (flags & MSG_PEEK) {
3037 m = m->m_next;
3038 } else {
3039 sbfree(&so->so_rcv, m);
3040 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3041 panic("%s: about to create invalid socketbuf",
3042 __func__);
3043 /* NOTREACHED */
3044 }
3045 MFREE(m, so->so_rcv.sb_mb);
3046 m = so->so_rcv.sb_mb;
3047 if (m != NULL) {
3048 m->m_nextpkt = nextrecord;
3049 } else {
3050 so->so_rcv.sb_mb = nextrecord;
3051 SB_EMPTY_FIXUP(&so->so_rcv);
3052 }
3053 }
3054done:
3055 *mp = m;
3056 *nextrecordp = nextrecord;
3057
0a7de745 3058 return error;
3059}
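
/*
 * Illustrative sketch (assumes the <sys/kpi_socket.h> KPI): the
 * per-record MT_SONAME mbuf handled above is what an in-kernel caller of
 * sock_receive() gets back through msg_name:
 */
static errno_t
example_recv_with_addr(socket_t so, void *buf, size_t len, size_t *got)
{
	struct sockaddr_storage ss;
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = {
		.msg_name = &ss, .msg_namelen = sizeof(ss),
		.msg_iov = &iov, .msg_iovlen = 1,
	};

	return sock_receive(so, &msg, MSG_DONTWAIT, got);
}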
3060
3061/*
3062 * Process one or more MT_CONTROL mbufs present before any data mbufs
3063 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3064 * just copy the data; if !MSG_PEEK, we call into the protocol to
3065 * perform externalization.
3066 */
3067static int
3068soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3069 struct mbuf **mp, struct mbuf **nextrecordp)
3070{
3071 int error = 0;
3072 struct mbuf *cm = NULL, *cmn;
3073 struct mbuf **cme = &cm;
3074 struct sockbuf *sb_rcv = &so->so_rcv;
3075 struct mbuf **msgpcm = NULL;
3076 struct mbuf *m = *mp;
3077 struct mbuf *nextrecord = *nextrecordp;
3078 struct protosw *pr = so->so_proto;
3079
3080 /*
3081 * Externalizing the control messages would require us to
3082 * drop the socket's lock below. Once we re-acquire the
3083 * lock, the mbuf chain might change. In order to preserve
3084 * consistency, we unlink all control messages from the
3085 * first mbuf chain in one shot and link them separately
3086 * onto a different chain.
3087 */
3088 do {
3089 if (flags & MSG_PEEK) {
3090 if (controlp != NULL) {
3091 if (*controlp == NULL) {
3092 msgpcm = controlp;
3093 }
3094 *controlp = m_copy(m, 0, m->m_len);
3095
3096 /*
3097 * If we failed to allocate an mbuf,
3098 * release any previously allocated
3099 * mbufs for control data. Return
3100 * an error. Keep the mbufs in the
3101 * socket as this is using
3102 * MSG_PEEK flag.
3103 */
3104 if (*controlp == NULL) {
3105 m_freem(*msgpcm);
3106 error = ENOBUFS;
3107 goto done;
3108 }
3109 controlp = &(*controlp)->m_next;
3110 }
3111 m = m->m_next;
3112 } else {
3113 m->m_nextpkt = NULL;
3114 sbfree(sb_rcv, m);
3115 sb_rcv->sb_mb = m->m_next;
3116 m->m_next = NULL;
3117 *cme = m;
3118 cme = &(*cme)->m_next;
3119 m = sb_rcv->sb_mb;
3120 }
3121 } while (m != NULL && m->m_type == MT_CONTROL);
3122
3123 if (!(flags & MSG_PEEK)) {
3124 if (sb_rcv->sb_mb != NULL) {
3125 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3126 } else {
3127 sb_rcv->sb_mb = nextrecord;
3128 SB_EMPTY_FIXUP(sb_rcv);
3129 }
0a7de745 3130 if (nextrecord == NULL) {
3e170ce0 3131 sb_rcv->sb_lastrecord = m;
0a7de745 3132 }
3133 }
3134
3135 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3136 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3137
3138 while (cm != NULL) {
3139 int cmsg_type;
3140
3141 cmn = cm->m_next;
3142 cm->m_next = NULL;
3143 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3144
3145 /*
3146 * Call the protocol to externalize SCM_RIGHTS message
3147 * and return the modified message to the caller upon
3148 * success. Otherwise, all other control messages are
3149 * returned unmodified to the caller. Note that we
3150 * only get into this loop if MSG_PEEK is not set.
3151 */
3152 if (pr->pr_domain->dom_externalize != NULL &&
3153 cmsg_type == SCM_RIGHTS) {
3154 /*
3155 * Release socket lock: see 3903171. This
3156 * would also allow more records to be appended
3157 * to the socket buffer. We still have SB_LOCK
3158 * set on it, so we can be sure that the head
3159 * of the mbuf chain won't change.
3160 */
3161 socket_unlock(so, 0);
3162 error = (*pr->pr_domain->dom_externalize)(cm);
3163 socket_lock(so, 0);
3164 } else {
3165 error = 0;
3166 }
3167
3168 if (controlp != NULL && error == 0) {
3169 *controlp = cm;
3170 controlp = &(*controlp)->m_next;
3171 } else {
3172 (void) m_free(cm);
3173 }
3174 cm = cmn;
3175 }
3176 /*
3177 * Update the value of nextrecord in case we received new
3178 * records when the socket was unlocked above for
3179 * externalizing SCM_RIGHTS.
3180 */
0a7de745 3181 if (m != NULL) {
3e170ce0 3182 nextrecord = sb_rcv->sb_mb->m_nextpkt;
0a7de745 3183 } else {
3e170ce0 3184 nextrecord = sb_rcv->sb_mb;
0a7de745 3185 }
3186
3187done:
3188 *mp = m;
3189 *nextrecordp = nextrecord;
3190
0a7de745 3191 return error;
3192}
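
/*
 * Usage note (sketch): once dom_externalize has run, an SCM_RIGHTS
 * message surfaces to the receiving process as ordinary cmsghdr records,
 * walked the standard way:
 *
 *	struct cmsghdr *c;
 *	for (c = CMSG_FIRSTHDR(&msg); c != NULL;
 *	    c = CMSG_NXTHDR(&msg, c)) {
 *		if (c->cmsg_level == SOL_SOCKET &&
 *		    c->cmsg_type == SCM_RIGHTS) {
 *			int fd;
 *			memcpy(&fd, CMSG_DATA(c), sizeof(fd));
 *		}
 *	}
 */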
3193
3194/*
3195 * Implement receive operations on a socket.
3196 * We depend on the way that records are added to the sockbuf
3197 * by sbappend*. In particular, each record (mbufs linked through m_next)
3198 * must begin with an address if the protocol so specifies,
3199 * followed by an optional mbuf or mbufs containing ancillary data,
3200 * and then zero or more mbufs of data.
3201 * In order to avoid blocking network interrupts for the entire time here,
3202 * we splx() while doing the actual copy to user space.
3203 * Although the sockbuf is locked, new data may still be appended,
3204 * and thus we must maintain consistency of the sockbuf during that time.
3205 *
3206 * The caller may receive the data as a single mbuf chain by supplying
3207 * an mbuf **mp0 for use in returning the chain. The uio is then used
3208 * only for the count in uio_resid.
3209 *
3210 * Returns: 0 Success
3211 * ENOBUFS
3212 * ENOTCONN
3213 * EWOULDBLOCK
3214 * uiomove:EFAULT
3215 * sblock:EWOULDBLOCK
3216 * sblock:EINTR
3217 * sbwait:EBADF
3218 * sbwait:EINTR
3219 * sodelayed_copy:EFAULT
3220 * <pru_rcvoob>:EINVAL[TCP]
3221 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3222 * <pru_rcvoob>:???
3223 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3224 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3225 * <pr_domain->dom_externalize>:???
3226 *
3227 * Notes: Additional return values from calls through <pru_rcvoob> and
3228 * <pr_domain->dom_externalize> depend on protocols other than
3229 * TCP or AF_UNIX, which are documented above.
3230 */
3231int
3232soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3233 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1c79356b 3234{
3235 struct mbuf *m, **mp, *ml = NULL;
3236 struct mbuf *nextrecord, *free_list;
3237 int flags, error, offset;
3238 user_ssize_t len;
1c79356b 3239 struct protosw *pr = so->so_proto;
3e170ce0 3240 int moff, type = 0;
3241 user_ssize_t orig_resid = uio_resid(uio);
3242 user_ssize_t delayed_copy_len;
3243 int can_delay;
3244 int need_event;
3245 struct proc *p = current_proc();
3e170ce0 3246 boolean_t en_tracing = FALSE;
1c79356b 3247
3248 /*
3249 * Sanity check on the length passed by caller as we are making 'int'
3250 * comparisons
3251 */
3252 if (orig_resid < 0 || orig_resid > INT_MAX) {
3253 return EINVAL;
3254 }
fe8ab488 3255
3256 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3257 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3258 so->so_rcv.sb_hiwat);
3259
91447636 3260 socket_lock(so, 1);
6d2010ae 3261 so_update_last_owner_locked(so, p);
39236c6e 3262 so_update_policy(so);
1c79356b 3263
91447636 3264#ifdef MORE_LOCKING_DEBUG
3265 if (so->so_usecount == 1) {
3266 panic("%s: so=%x no other reference on socket\n", __func__, so);
3267 /* NOTREACHED */
3268 }
91447636 3269#endif
1c79356b 3270 mp = mp0;
0a7de745 3271 if (psa != NULL) {
39236c6e 3272 *psa = NULL;
3273 }
3274 if (controlp != NULL) {
39236c6e 3275 *controlp = NULL;
3276 }
3277 if (flagsp != NULL) {
3278 flags = *flagsp & ~MSG_EOR;
3279 } else {
1c79356b 3280 flags = 0;
0a7de745 3281 }
3282
3283 /*
3284 * If a recv attempt is made on a previously-accepted socket
3285 * that has been marked as inactive (disconnected), reject
3286 * the request.
3287 */
3288 if (so->so_flags & SOF_DEFUNCT) {
3289 struct sockbuf *sb = &so->so_rcv;
3290
6d2010ae 3291 error = ENOTCONN;
3292 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3293 __func__, proc_pid(p), proc_best_name(p),
3294 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3295 SOCK_DOM(so), SOCK_TYPE(so), error);
3296 /*
3297 * This socket should have been disconnected and flushed
3298 * prior to being returned from sodefunct(); there should
3299 * be no data on its receive list, so panic otherwise.
2d21ac55 3300 */
0a7de745 3301 if (so->so_state & SS_DEFUNCT) {
6d2010ae 3302 sb_empty_assert(sb, __func__);
0a7de745 3303 }
2d21ac55 3304 socket_unlock(so, 1);
0a7de745 3305 return error;
3306 }
3307
3308 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3309 pr->pr_usrreqs->pru_preconnect) {
3310 /*
 3311	 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
 3312	 * call write() right after this. *If* the app then calls read(),
 3313	 * we do not want to block that read indefinitely. Thus,
3314 * we trigger a connect so that the session gets initiated.
3315 */
3316 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3317
3318 if (error) {
3319 socket_unlock(so, 1);
0a7de745 3320 return error;
3321 }
3322 }
3323
3324 if (ENTR_SHOULDTRACE &&
3325 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3326 /*
3327 * enable energy tracing for inet sockets that go over
3328 * non-loopback interfaces only.
3329 */
3330 struct inpcb *inp = sotoinpcb(so);
3331 if (inp->inp_last_outifp != NULL &&
3332 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3333 en_tracing = TRUE;
3334 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3335 VM_KERNEL_ADDRPERM(so),
3336 ((so->so_state & SS_NBIO) ?
3337 kEnTrFlagNonBlocking : 0),
3338 (int64_t)orig_resid);
3339 }
3340 }
3341
3342 /*
3343 * When SO_WANTOOBFLAG is set we try to get out-of-band data
 3344	 * regardless of the flags argument. Here is the case where
3345 * out-of-band data is not inline.
3346 */
3347 if ((flags & MSG_OOB) ||
3348 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3349 (so->so_options & SO_OOBINLINE) == 0 &&
3350 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1c79356b 3351 m = m_get(M_WAIT, MT_DATA);
55e303ae 3352 if (m == NULL) {
91447636 3353 socket_unlock(so, 1);
3354 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3355 ENOBUFS, 0, 0, 0, 0);
0a7de745 3356 return ENOBUFS;
55e303ae 3357 }
1c79356b 3358 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
0a7de745 3359 if (error) {
1c79356b 3360 goto bad;
0a7de745 3361 }
91447636 3362 socket_unlock(so, 0);
3363 do {
3364 error = uiomove(mtod(m, caddr_t),
b0d623f7 3365 imin(uio_resid(uio), m->m_len), uio);
1c79356b 3366 m = m_free(m);
39236c6e 3367 } while (uio_resid(uio) && error == 0 && m != NULL);
91447636 3368 socket_lock(so, 0);
1c79356b 3369bad:
0a7de745 3370 if (m != NULL) {
1c79356b 3371 m_freem(m);
0a7de745 3372 }
39236c6e 3373
3374 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3375 if (error == EWOULDBLOCK || error == EINVAL) {
2d21ac55 3376 /*
9bccf70c 3377 * Let's try to get normal data:
3378 * EWOULDBLOCK: out-of-band data not
 3379			 * received yet. EINVAL: out-of-band data
3380 * already read.
3381 */
3382 error = 0;
3383 goto nooob;
39236c6e 3384 } else if (error == 0 && flagsp != NULL) {
9bccf70c 3385 *flagsp |= MSG_OOB;
3386 }
3387 }
91447636 3388 socket_unlock(so, 1);
3389 if (en_tracing) {
3390 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3391 VM_KERNEL_ADDRPERM(so), 0,
3392 (int64_t)(orig_resid - uio_resid(uio)));
3393 }
3394 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3395 0, 0, 0, 0);
39236c6e 3396
0a7de745 3397 return error;
3398 }
3399nooob:
0a7de745 3400 if (mp != NULL) {
39236c6e 3401 *mp = NULL;
0a7de745 3402 }
3403
3404 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
1c79356b 3405 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
fe8ab488 3406 }
1c79356b 3407
39236c6e 3408 free_list = NULL;
55e303ae 3409 delayed_copy_len = 0;
1c79356b 3410restart:
91447636 3411#ifdef MORE_LOCKING_DEBUG
0a7de745 3412 if (so->so_usecount <= 1) {
fe8ab488 3413 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3e170ce0 3414 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
0a7de745 3415 }
91447636 3416#endif
3417 /*
3418 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3419 * and if so just return to the caller. This could happen when
3420 * soreceive() is called by a socket upcall function during the
3421 * time the socket is freed. The socket buffer would have been
3422 * locked across the upcall, therefore we cannot put this thread
3423 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3424 * we may livelock), because the lock on the socket buffer will
3425 * only be released when the upcall routine returns to its caller.
3426 * Because the socket has been officially closed, there can be
3427 * no further read on it.
3428 *
3429 * A multipath subflow socket would have its SS_NOFDREF set by
3430 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3431 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3432 */
3433 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
39236c6e 3434 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
6601e61a 3435 socket_unlock(so, 1);
0a7de745 3436 return 0;
3437 }
3438
3439 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3440 if (error) {
91447636 3441 socket_unlock(so, 1);
3442 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3443 0, 0, 0, 0);
3444 if (en_tracing) {
3445 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3446 VM_KERNEL_ADDRPERM(so), 0,
3447 (int64_t)(orig_resid - uio_resid(uio)));
3448 }
0a7de745 3449 return error;
1c79356b 3450 }
3451
3452 m = so->so_rcv.sb_mb;
3453 /*
3454 * If we have less data than requested, block awaiting more
3455 * (subject to any timeout) if:
3456 * 1. the current count is less than the low water mark, or
3457 * 2. MSG_WAITALL is set, and it is possible to do the entire
3458 * receive operation at once if we block (resid <= hiwat), and
3459 * 3. MSG_DONTWAIT is not set.
3460 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3461 * we have to do the receive in sections, and thus risk returning
3462 * a short count if a timeout or signal occurs after we start.
3463 */
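/*
 * Example of the contract described above (added commentary): a
 * userland
 *
 *	n = recv(fd, buf, sizeof(buf), MSG_WAITALL);
 *
 * normally blocks until the full count can be returned, but may still
 * come back with a short count if a signal or timeout interrupts the
 * wait, or if the request exceeds the receive buffer size.
 */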
39236c6e 3464 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
91447636 3465 so->so_rcv.sb_cc < uio_resid(uio)) &&
2d21ac55 3466 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
91447636 3467 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
39236c6e 3468 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3469 /*
3470 * Panic if we notice inconsistencies in the socket's
3471 * receive list; both sb_mb and sb_cc should correctly
3472 * reflect the contents of the list, otherwise we may
3473 * end up with false positives during select() or poll()
3474 * which could put the application in a bad state.
3475 */
316670eb 3476 SB_MB_CHECK(&so->so_rcv);
55e303ae 3477
1c79356b 3478 if (so->so_error) {
0a7de745 3479 if (m != NULL) {
1c79356b 3480 goto dontblock;
0a7de745 3481 }
1c79356b 3482 error = so->so_error;
0a7de745 3483 if ((flags & MSG_PEEK) == 0) {
1c79356b 3484 so->so_error = 0;
0a7de745 3485 }
3486 goto release;
3487 }
3488 if (so->so_state & SS_CANTRCVMORE) {
3489#if CONTENT_FILTER
3490 /*
3491 * Deal with half closed connections
3492 */
3493 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
0a7de745 3494 cfil_sock_data_pending(&so->so_rcv) != 0) {
fe8ab488 3495 CFIL_LOG(LOG_INFO,
3496 "so %llx ignore SS_CANTRCVMORE",
3497 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3498 } else
fe8ab488 3499#endif /* CONTENT_FILTER */
0a7de745 3500 if (m != NULL) {
1c79356b 3501 goto dontblock;
0a7de745 3502 } else {
1c79356b 3503 goto release;
0a7de745 3504 }
1c79356b 3505 }
0a7de745 3506 for (; m != NULL; m = m->m_next) {
2d21ac55 3507 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3508 m = so->so_rcv.sb_mb;
3509 goto dontblock;
3510 }
3511 }
3512 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3513 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3514 error = ENOTCONN;
3515 goto release;
3516 }
0a7de745 3517 if (uio_resid(uio) == 0) {
1c79356b 3518 goto release;
0a7de745 3519 }
3e170ce0 3520
2d21ac55 3521 if ((so->so_state & SS_NBIO) ||
0a7de745 3522 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3523 error = EWOULDBLOCK;
3524 goto release;
3525 }
3526 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3527 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
0a7de745 3528 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
2d21ac55 3529#if EVEN_MORE_LOCKING_DEBUG
0a7de745 3530 if (socket_debug) {
2d21ac55 3531 printf("Waiting for socket data\n");
0a7de745 3532 }
91447636 3533#endif
55e303ae 3534
1c79356b 3535 error = sbwait(&so->so_rcv);
2d21ac55 3536#if EVEN_MORE_LOCKING_DEBUG
0a7de745 3537 if (socket_debug) {
2d21ac55 3538 printf("SORECEIVE - sbwait returned %d\n", error);
0a7de745 3539 }
91447636 3540#endif
3541 if (so->so_usecount < 1) {
3542 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3543 __func__, so, so->so_usecount);
3544 /* NOTREACHED */
3545 }
9bccf70c 3546 if (error) {
91447636 3547 socket_unlock(so, 1);
3548 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3549 0, 0, 0, 0);
3550 if (en_tracing) {
3551 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3552 VM_KERNEL_ADDRPERM(so), 0,
3553 (int64_t)(orig_resid - uio_resid(uio)));
3554 }
0a7de745 3555 return error;
3556 }
3557 goto restart;
3558 }
3559dontblock:
b0d623f7 3560 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3561 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3562 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
1c79356b 3563 nextrecord = m->m_nextpkt;
fe8ab488 3564
3565 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3566 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3567 mp0 == NULL);
0a7de745 3568 if (error == ERESTART) {
3e170ce0 3569 goto restart;
0a7de745 3570 } else if (error != 0) {
3e170ce0 3571 goto release;
0a7de745 3572 }
1c79356b 3573 orig_resid = 0;
1c79356b 3574 }
3575
3576 /*
3577 * Process one or more MT_CONTROL mbufs present before any data mbufs
3578 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3579 * just copy the data; if !MSG_PEEK, we call into the protocol to
3580 * perform externalization.
3581 */
3582 if (m != NULL && m->m_type == MT_CONTROL) {
3e170ce0 3583 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
0a7de745 3584 if (error != 0) {
3e170ce0 3585 goto release;
0a7de745 3586 }
316670eb 3587 orig_resid = 0;
1c79356b 3588 }
2d21ac55 3589
3590 /*
3591 * If the socket is a TCP socket with message delivery
3592 * enabled, then create a control msg to deliver the
3593 * relative TCP sequence number for this data. Waiting
3594 * until this point will protect against failures to
3595 * allocate an mbuf for control msgs.
3596 */
3597 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3598 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3599 struct mbuf *seq_cm;
3600
3601 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
0a7de745 3602 sizeof(uint32_t), SCM_SEQNUM, SOL_SOCKET);
3603 if (seq_cm == NULL) {
3604 /* unable to allocate a control mbuf */
3605 error = ENOBUFS;
3606 goto release;
3607 }
3608 *controlp = seq_cm;
3609 controlp = &seq_cm->m_next;
3610 }
3611
3612 if (m != NULL) {
3613 if (!(flags & MSG_PEEK)) {
3614 /*
3615 * We get here because m points to an mbuf following
3616 * any MT_SONAME or MT_CONTROL mbufs which have been
3617 * processed above. In any case, m should be pointing
3618 * to the head of the mbuf chain, and the nextrecord
3619 * should be either NULL or equal to m->m_nextpkt.
3620 * See comments above about SB_LOCK.
3621 */
3622 if (m != so->so_rcv.sb_mb ||
3623 m->m_nextpkt != nextrecord) {
3624 panic("%s: post-control !sync so=%p m=%p "
3625 "nextrecord=%p\n", __func__, so, m,
3626 nextrecord);
3627 /* NOTREACHED */
3628 }
0a7de745 3629 if (nextrecord == NULL) {
2d21ac55 3630 so->so_rcv.sb_lastrecord = m;
0a7de745 3631 }
2d21ac55 3632 }
1c79356b 3633 type = m->m_type;
0a7de745 3634 if (type == MT_OOBDATA) {
1c79356b 3635 flags |= MSG_OOB;
0a7de745 3636 }
3637 } else {
3638 if (!(flags & MSG_PEEK)) {
3639 SB_EMPTY_FIXUP(&so->so_rcv);
3640 }
1c79356b 3641 }
3642 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3643 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3644
3645 moff = 0;
3646 offset = 0;
fa4905b1 3647
0a7de745 3648 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
2d21ac55 3649 can_delay = 1;
0a7de745 3650 } else {
2d21ac55 3651 can_delay = 0;
0a7de745 3652 }
3653
3654 need_event = 0;
fa4905b1 3655
3656 while (m != NULL &&
3657 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
1c79356b 3658 if (m->m_type == MT_OOBDATA) {
0a7de745 3659 if (type != MT_OOBDATA) {
1c79356b 3660 break;
0a7de745 3661 }
2d21ac55 3662 } else if (type == MT_OOBDATA) {
1c79356b 3663 break;
2d21ac55 3664 }
9bccf70c 3665 /*
2d21ac55 3666 * Make sure to always set MSG_OOB event when getting
3667 * out-of-band data inline.
3668 */
1c79356b 3669 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3670 (so->so_options & SO_OOBINLINE) != 0 &&
3671 (so->so_state & SS_RCVATMARK) != 0) {
3672 flags |= MSG_OOB;
3673 }
1c79356b 3674 so->so_state &= ~SS_RCVATMARK;
91447636 3675 len = uio_resid(uio) - delayed_copy_len;
0a7de745 3676 if (so->so_oobmark && len > so->so_oobmark - offset) {
1c79356b 3677 len = so->so_oobmark - offset;
3678 }
3679 if (len > m->m_len - moff) {
1c79356b 3680 len = m->m_len - moff;
0a7de745 3681 }
3682 /*
3683 * If mp is set, just pass back the mbufs.
3684 * Otherwise copy them out via the uio, then free.
3685 * Sockbuf must be consistent here (points to current mbuf,
3686 * it points to next record) when we drop priority;
3687 * we must note any additions to the sockbuf when we
3688 * block interrupts again.
3689 */
39236c6e 3690 if (mp == NULL) {
3691 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3692 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
55e303ae 3693 if (can_delay && len == m->m_len) {
2d21ac55 3694 /*
3695 * only delay the copy if we're consuming the
3696 * mbuf and we're NOT in MSG_PEEK mode
3697 * and we have enough data to make it worthwhile
3698 * to drop and retake the lock... can_delay
3699 * reflects the state of the two latter
3700 * constraints; moff should always be zero
3701 * in these cases
55e303ae 3702 */
2d21ac55 3703 delayed_copy_len += len;
55e303ae 3704 } else {
3705 if (delayed_copy_len) {
3706 error = sodelayed_copy(so, uio,
3707 &free_list, &delayed_copy_len);
3708
3709 if (error) {
55e303ae
A
3710 goto release;
3711 }
3712 /*
3713 * can only get here if MSG_PEEK is not
3714 * set; therefore, m should point at the
3715 * head of the rcv queue; if it doesn't,
3716 * it means something drastically
3717 * changed while we were out from behind
3718 * the lock in sodelayed_copy. perhaps
3719 * a RST on the stream. in any event,
3720 * the stream has been interrupted. it's
3721 * probably best just to return whatever
3722 * data we've moved and let the caller
3723 * sort it out...
3724 */
55e303ae 3725 if (m != so->so_rcv.sb_mb) {
2d21ac55 3726 break;
3727 }
3728 }
91447636 3729 socket_unlock(so, 0);
3730 error = uiomove(mtod(m, caddr_t) + moff,
3731 (int)len, uio);
91447636 3732 socket_lock(so, 0);
55e303ae 3733
0a7de745 3734 if (error) {
2d21ac55 3735 goto release;
0a7de745 3736 }
55e303ae 3737 }
2d21ac55 3738 } else {
91447636 3739 uio_setresid(uio, (uio_resid(uio) - len));
2d21ac55 3740 }
1c79356b 3741 if (len == m->m_len - moff) {
0a7de745 3742 if (m->m_flags & M_EOR) {
1c79356b 3743 flags |= MSG_EOR;
0a7de745 3744 }
3745 if (flags & MSG_PEEK) {
3746 m = m->m_next;
3747 moff = 0;
3748 } else {
3749 nextrecord = m->m_nextpkt;
3750 sbfree(&so->so_rcv, m);
91447636 3751 m->m_nextpkt = NULL;
55e303ae 3752
3753 /*
3754 * If this packet is an unordered packet
3755 * (indicated by M_UNORDERED_DATA flag), remove
3756 * the additional bytes added to the
3757 * receive socket buffer size.
3758 */
3759 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3760 m->m_len &&
3761 (m->m_flags & M_UNORDERED_DATA) &&
3762 sbreserve(&so->so_rcv,
3763 so->so_rcv.sb_hiwat - m->m_len)) {
3764 if (so->so_msg_state->msg_uno_bytes >
3765 m->m_len) {
3766 so->so_msg_state->
0a7de745 3767 msg_uno_bytes -= m->m_len;
3768 } else {
3769 so->so_msg_state->
0a7de745 3770 msg_uno_bytes = 0;
3771 }
3772 m->m_flags &= ~M_UNORDERED_DATA;
3773 }
3774
3775 if (mp != NULL) {
3776 *mp = m;
3777 mp = &m->m_next;
3778 so->so_rcv.sb_mb = m = m->m_next;
39236c6e 3779 *mp = NULL;
1c79356b 3780 } else {
0a7de745 3781 if (free_list == NULL) {
2d21ac55 3782 free_list = m;
0a7de745 3783 } else {
2d21ac55 3784 ml->m_next = m;
0a7de745 3785 }
2d21ac55 3786 ml = m;
14353aa8 3787 so->so_rcv.sb_mb = m = m->m_next;
39236c6e 3788 ml->m_next = NULL;
1c79356b 3789 }
2d21ac55 3790 if (m != NULL) {
1c79356b 3791 m->m_nextpkt = nextrecord;
0a7de745 3792 if (nextrecord == NULL) {
2d21ac55 3793 so->so_rcv.sb_lastrecord = m;
0a7de745 3794 }
3795 } else {
3796 so->so_rcv.sb_mb = nextrecord;
3797 SB_EMPTY_FIXUP(&so->so_rcv);
3798 }
3799 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3800 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3801 }
3802 } else {
2d21ac55 3803 if (flags & MSG_PEEK) {
1c79356b 3804 moff += len;
2d21ac55 3805 } else {
3806 if (mp != NULL) {
3807 int copy_flag;
3808
0a7de745 3809 if (flags & MSG_DONTWAIT) {
6d2010ae 3810 copy_flag = M_DONTWAIT;
0a7de745 3811 } else {
6d2010ae 3812 copy_flag = M_WAIT;
0a7de745 3813 }
6d2010ae 3814 *mp = m_copym(m, 0, len, copy_flag);
3815 /*
3816 * Failed to allocate an mbuf?
3817 * Adjust uio_resid back, it was
3818 * adjusted down by len bytes which
3819 * we didn't copy over.
3820 */
6d2010ae 3821 if (*mp == NULL) {
3822 uio_setresid(uio,
3823 (uio_resid(uio) + len));
3824 break;
3825 }
3826 }
3827 m->m_data += len;
3828 m->m_len -= len;
3829 so->so_rcv.sb_cc -= len;
3830 }
3831 }
3832 if (so->so_oobmark) {
3833 if ((flags & MSG_PEEK) == 0) {
3834 so->so_oobmark -= len;
3835 if (so->so_oobmark == 0) {
3836 so->so_state |= SS_RCVATMARK;
3837 /*
3838 * delay posting the actual event until
3839 * after any delayed copy processing
3840 * has finished
3841 */
3842 need_event = 1;
3843 break;
3844 }
3845 } else {
3846 offset += len;
0a7de745 3847 if (offset == so->so_oobmark) {
1c79356b 3848 break;
0a7de745 3849 }
3850 }
3851 }
0a7de745 3852 if (flags & MSG_EOR) {
1c79356b 3853 break;
0a7de745 3854 }
1c79356b 3855 /*
3856 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3857 * (for non-atomic socket), we must not quit until
3858 * "uio->uio_resid == 0" or an error termination.
3859 * If a signal/timeout occurs, return with a short
3860 * count but without error. Keep sockbuf locked
3861 * against other readers.
1c79356b 3862 */
0a7de745 3863 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
2d21ac55 3864 (uio_resid(uio) - delayed_copy_len) > 0 &&
1c79356b 3865 !sosendallatonce(so) && !nextrecord) {
3866 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3867#if CONTENT_FILTER
3868 && cfil_sock_data_pending(&so->so_rcv) == 0
3869#endif /* CONTENT_FILTER */
0a7de745 3870 )) {
2d21ac55 3871 goto release;
0a7de745 3872 }
fa4905b1 3873
3874 /*
3875 * Depending on the protocol (e.g. TCP), the following
3876 * might cause the socket lock to be dropped and later
3877 * be reacquired, and more data could have arrived and
3878 * have been appended to the receive socket buffer by
3879 * the time it returns. Therefore, we only sleep in
3880 * sbwait() below if and only if the socket buffer is
3881 * empty, in order to avoid a false sleep.
3882 */
3883 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3884 (((struct inpcb *)so->so_pcb)->inp_state !=
0a7de745 3885 INPCB_STATE_DEAD)) {
2d21ac55 3886 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
0a7de745 3887 }
3888
3889 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3890 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3891
3892 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3893 error = 0;
55e303ae 3894 goto release;
fa4905b1 3895 }
55e303ae 3896 /*
3897 * have to wait until after we get back from the sbwait
3898 * to do the copy because we will drop the lock if we
3899 * have enough data that has been delayed... by dropping
3900 * the lock we open up a window allowing the netisr
3901 * thread to process the incoming packets and to change
3902 * the state of this socket... we're issuing the sbwait
3903 * because the socket is empty and we're expecting the
3904 * netisr thread to wake us up when more packets arrive;
3905 * if we allow that processing to happen and then sbwait
3906 * we could stall forever with packets sitting in the
3907 * socket if no further packets arrive from the remote
3908 * side.
55e303ae 3909 *
3910 * we want to copy before we've collected all the data
3911 * to satisfy this request to allow the copy to overlap
3912 * the incoming packet processing on an MP system
55e303ae 3913 */
3914 if (delayed_copy_len > sorecvmincopy &&
3915 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3916 error = sodelayed_copy(so, uio,
3917 &free_list, &delayed_copy_len);
55e303ae 3918
0a7de745 3919 if (error) {
2d21ac55 3920 goto release;
0a7de745 3921 }
3922 }
3923 m = so->so_rcv.sb_mb;
39236c6e 3924 if (m != NULL) {
1c79356b 3925 nextrecord = m->m_nextpkt;
fa4905b1 3926 }
316670eb 3927 SB_MB_CHECK(&so->so_rcv);
3928 }
3929 }
91447636 3930#ifdef MORE_LOCKING_DEBUG
3931 if (so->so_usecount <= 1) {
3932 panic("%s: after big while so=%p ref=%d on socket\n",
3933 __func__, so, so->so_usecount);
3934 /* NOTREACHED */
3935 }
91447636 3936#endif
1c79356b 3937
39236c6e 3938 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2d21ac55 3939 if (so->so_options & SO_DONTTRUNC) {
1c79356b 3940 flags |= MSG_RCVMORE;
2d21ac55 3941 } else {
9bccf70c 3942 flags |= MSG_TRUNC;
0a7de745 3943 if ((flags & MSG_PEEK) == 0) {
1c79356b 3944 (void) sbdroprecord(&so->so_rcv);
0a7de745 3945 }
3946 }
3947 }
3948
3949 /*
3950 * pru_rcvd below (for TCP) may cause more data to be received
3951 * if the socket lock is dropped prior to sending the ACK; some
3952 * legacy OpenTransport applications don't handle this well
3953 * (if it receives less data than requested while MSG_HAVEMORE
3954 * is set), and so we set the flag now based on what we know
3955 * prior to calling pru_rcvd.
3956 */
0a7de745 3957 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
2d21ac55 3958 flags |= MSG_HAVEMORE;
0a7de745 3959 }
2d21ac55 3960
1c79356b 3961 if ((flags & MSG_PEEK) == 0) {
39236c6e 3962 if (m == NULL) {
1c79356b 3963 so->so_rcv.sb_mb = nextrecord;
3964 /*
3965 * First part is an inline SB_EMPTY_FIXUP(). Second
3966 * part makes sure sb_lastrecord is up-to-date if
3967 * there is still data in the socket buffer.
3968 */
3969 if (so->so_rcv.sb_mb == NULL) {
3970 so->so_rcv.sb_mbtail = NULL;
3971 so->so_rcv.sb_lastrecord = NULL;
3972 } else if (nextrecord->m_nextpkt == NULL) {
3973 so->so_rcv.sb_lastrecord = nextrecord;
3974 }
316670eb 3975 SB_MB_CHECK(&so->so_rcv);
3976 }
3977 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3978 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
0a7de745 3979 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
1c79356b 3980 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
0a7de745 3981 }
1c79356b 3982 }
39236c6e 3983
55e303ae 3984 if (delayed_copy_len) {
91447636 3985 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
0a7de745 3986 if (error) {
2d21ac55 3987 goto release;
0a7de745 3988 }
55e303ae 3989 }
3990 if (free_list != NULL) {
3991 m_freem_list(free_list);
3992 free_list = NULL;
55e303ae 3993 }
0a7de745 3994 if (need_event) {
2d21ac55 3995 postevent(so, 0, EV_OOB);
0a7de745 3996 }
39236c6e 3997
91447636 3998 if (orig_resid == uio_resid(uio) && orig_resid &&
1c79356b 3999 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
0a7de745 4000 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4001 goto restart;
4002 }
4003
0a7de745 4004 if (flagsp != NULL) {
1c79356b 4005 *flagsp |= flags;
0a7de745 4006 }
1c79356b 4007release:
91447636 4008#ifdef MORE_LOCKING_DEBUG
4009 if (so->so_usecount <= 1) {
4010 panic("%s: release so=%p ref=%d on socket\n", __func__,
2d21ac55 4011 so, so->so_usecount);
4012 /* NOTREACHED */
4013 }
91447636 4014#endif
0a7de745 4015 if (delayed_copy_len) {
2d21ac55 4016 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
0a7de745 4017 }
1c79356b 4018
0a7de745 4019 if (free_list != NULL) {
39236c6e 4020 m_freem_list(free_list);
0a7de745 4021 }
39236c6e 4022
0a7de745 4023 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
39236c6e 4024
4025 if (en_tracing) {
4026 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4027 VM_KERNEL_ADDRPERM(so),
4028 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4029 (int64_t)(orig_resid - uio_resid(uio)));
4030 }
4031 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4032 so->so_rcv.sb_cc, 0, error);
1c79356b 4033
0a7de745 4034 return error;
4035}
4036
4037/*
4038 * Returns: 0 Success
4039 * uiomove:EFAULT
4040 */
4041static int
4042sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
39236c6e 4043 user_ssize_t *resid)
55e303ae 4044{
2d21ac55 4045 int error = 0;
4046 struct mbuf *m;
4047
4048 m = *free_list;
4049
91447636 4050 socket_unlock(so, 0);
55e303ae 4051
39236c6e 4052 while (m != NULL && error == 0) {
2d21ac55 4053 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4054 m = m->m_next;
4055 }
4056 m_freem_list(*free_list);
4057
39236c6e 4058 *free_list = NULL;
4059 *resid = 0;
4060
4061 socket_lock(so, 0);
55e303ae 4062
0a7de745 4063 return error;
4064}
4065
4066static int
4067sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4068 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4069{
4070#pragma unused(so)
4071 int error = 0;
4072 struct mbuf *ml, *m;
4073 int i = 0;
4074 struct uio *auio;
4075
4076 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4077 ml = ml->m_nextpkt, i++) {
4078 auio = msgarray[i].uio;
4079 for (m = ml; m != NULL; m = m->m_next) {
4080 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
0a7de745 4081 if (error != 0) {
3e170ce0 4082 goto out;
0a7de745 4083 }
4084 }
4085 }
4086out:
4087 m_freem_list(*free_list);
4088
4089 *free_list = NULL;
4090 *resid = 0;
4091
0a7de745 4092 return error;
4093}
4094
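/*
 * Background note (an assumption, not stated in this file): this
 * batched variant is the kernel side of the Darwin-private
 * recvmsg_x(2) interface, which lets a single call drain several
 * datagrams, each described by a recv_msg_elem, from a SOCK_DGRAM
 * socket.
 */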
2d21ac55 4095int
4096soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4097 int *flagsp)
2d21ac55 4098{
3e170ce0 4099 struct mbuf *m;
fe8ab488 4100 struct mbuf *nextrecord;
4101 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4102 int error;
4103 user_ssize_t len, pktlen, delayed_copy_len = 0;
fe8ab488 4104 struct protosw *pr = so->so_proto;
3e170ce0 4105 user_ssize_t resid;
4106 struct proc *p = current_proc();
4107 struct uio *auio = NULL;
3e170ce0 4108 int npkts = 0;
fe8ab488 4109 int sblocked = 0;
4110 struct sockaddr **psa = NULL;
4111 struct mbuf **controlp = NULL;
4112 int can_delay;
4113 int flags;
4114 struct mbuf *free_others = NULL;
55e303ae 4115
4116 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4117 so, uiocnt,
4118 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4119
4120 /*
4121 * Sanity checks:
4122 * - Only supports the "don't wait" flags
4123 * - Only supports datagram sockets (could be extended to raw)
4124 * - Must be atomic
4125 * - Protocol must support packet chains
4126 * - The uio array is NULL (should we panic?)
4127 */
0a7de745 4128 if (flagsp != NULL) {
3e170ce0 4129 flags = *flagsp;
0a7de745 4130 } else {
3e170ce0 4131 flags = 0;
0a7de745 4132 }
4133 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4134 MSG_NBIO)) {
4135 printf("%s invalid flags 0x%x\n", __func__, flags);
4136 error = EINVAL;
4137 goto out;
4138 }
4139 if (so->so_type != SOCK_DGRAM) {
4140 error = EINVAL;
4141 goto out;
4142 }
4143 if (sosendallatonce(so) == 0) {
4144 error = EINVAL;
4145 goto out;
4146 }
4147 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4148 error = EPROTONOSUPPORT;
4149 goto out;
4150 }
3e170ce0 4151 if (msgarray == NULL) {
4152 printf("%s uioarray is NULL\n", __func__);
4153 error = EINVAL;
4154 goto out;
4155 }
4156 if (uiocnt == 0) {
4157 printf("%s uiocnt is 0\n", __func__);
4158 error = EINVAL;
4159 goto out;
4160 }
4161 /*
4162 * Sanity check on the length passed by caller as we are making 'int'
4163 * comparisons
4164 */
4165 resid = recv_msg_array_resid(msgarray, uiocnt);
4166 if (resid < 0 || resid > INT_MAX) {
4167 error = EINVAL;
4168 goto out;
4169 }
4170
0a7de745 4171 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
3e170ce0 4172 can_delay = 1;
0a7de745 4173 } else {
3e170ce0 4174 can_delay = 0;
0a7de745 4175 }
3e170ce0 4176
4177 socket_lock(so, 1);
4178 so_update_last_owner_locked(so, p);
4179 so_update_policy(so);
4180
4181#if NECP
4182 so_update_necp_policy(so, NULL, NULL);
4183#endif /* NECP */
3e170ce0 4184
4185 /*
4186 * If a recv attempt is made on a previously-accepted socket
4187 * that has been marked as inactive (disconnected), reject
4188 * the request.
4189 */
4190 if (so->so_flags & SOF_DEFUNCT) {
4191 struct sockbuf *sb = &so->so_rcv;
4192
4193 error = ENOTCONN;
4194 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4195 __func__, proc_pid(p), proc_best_name(p),
4196 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4197 SOCK_DOM(so), SOCK_TYPE(so), error);
4198 /*
4199 * This socket should have been disconnected and flushed
4200 * prior to being returned from sodefunct(); there should
4201 * be no data on its receive list, so panic otherwise.
4202 */
0a7de745 4203 if (so->so_state & SS_DEFUNCT) {
fe8ab488 4204 sb_empty_assert(sb, __func__);
0a7de745 4205 }
4206 goto release;
4207 }
4208
4209next:
4210 /*
4211 * The uio may be empty
4212 */
4213 if (npkts >= uiocnt) {
4214 error = 0;
4215 goto release;
4216 }
4217restart:
4218 /*
4219 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4220 * and if so just return to the caller. This could happen when
4221 * soreceive() is called by a socket upcall function during the
4222 * time the socket is freed. The socket buffer would have been
4223 * locked across the upcall, therefore we cannot put this thread
4224 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4225 * we may livelock), because the lock on the socket buffer will
4226 * only be released when the upcall routine returns to its caller.
4227 * Because the socket has been officially closed, there can be
4228 * no further read on it.
4229 */
4230 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4231 (SS_NOFDREF | SS_CANTRCVMORE)) {
4232 error = 0;
4233 goto release;
4234 }
4235
4236 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4237 if (error) {
4238 goto release;
4239 }
4240 sblocked = 1;
4241
4242 m = so->so_rcv.sb_mb;
4243 /*
4244 * Block awaiting more datagram if needed
4245 */
4246 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4247 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4248 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4249 /*
4250 * Panic if we notice inconsistencies in the socket's
4251 * receive list; both sb_mb and sb_cc should correctly
4252 * reflect the contents of the list, otherwise we may
4253 * end up with false positives during select() or poll()
4254 * which could put the application in a bad state.
4255 */
4256 SB_MB_CHECK(&so->so_rcv);
4257
4258 if (so->so_error) {
4259 error = so->so_error;
0a7de745 4260 if ((flags & MSG_PEEK) == 0) {
3e170ce0 4261 so->so_error = 0;
0a7de745 4262 }
4263 goto release;
4264 }
4265 if (so->so_state & SS_CANTRCVMORE) {
4266 goto release;
4267 }
0a7de745 4268 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4269 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4270 error = ENOTCONN;
4271 goto release;
4272 }
4273 if ((so->so_state & SS_NBIO) ||
0a7de745 4274 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4275 error = EWOULDBLOCK;
4276 goto release;
4277 }
4278 /*
4279 * Do not block if we got some data
fe8ab488 4280 */
3e170ce0 4281 if (free_list != NULL) {
4282 error = 0;
4283 goto release;
4284 }
3e170ce0 4285
4286 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4287 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4288
0a7de745 4289 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4290 sblocked = 0;
4291
4292 error = sbwait(&so->so_rcv);
4293 if (error) {
4294 goto release;
4295 }
4296 goto restart;
4297 }
4298
4299 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4300 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4301 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4302
4303 /*
4304 * Consume the current uio index as we have a datagram
4305 */
4306 auio = msgarray[npkts].uio;
4307 resid = uio_resid(auio);
4308 msgarray[npkts].which |= SOCK_MSG_DATA;
4309 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4310 &msgarray[npkts].psa : NULL;
4311 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4312 &msgarray[npkts].controlp : NULL;
4313 npkts += 1;
4314 nextrecord = m->m_nextpkt;
4315
fe8ab488 4316 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3e170ce0 4317 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
0a7de745 4318 if (error == ERESTART) {
3e170ce0 4319 goto restart;
0a7de745 4320 } else if (error != 0) {
3e170ce0 4321 goto release;
0a7de745 4322 }
fe8ab488 4323 }
fe8ab488 4324
fe8ab488 4325 if (m != NULL && m->m_type == MT_CONTROL) {
3e170ce0 4326 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
0a7de745 4327 if (error != 0) {
3e170ce0 4328 goto release;
0a7de745 4329 }
fe8ab488 4330 }
fe8ab488 4331
4332 if (m->m_pkthdr.len == 0) {
4333 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4334 __func__, __LINE__,
4335 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4336 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4337 m->m_type);
4338 }
4339
4340 /*
4341 * Loop to copy the mbufs of the current record
4342 * Support zero length packets
fe8ab488 4343 */
4344 ml = NULL;
4345 pktlen = 0;
4346 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
0a7de745 4347 if (m->m_len == 0) {
3e170ce0 4348 panic("%p m_len zero", m);
4349 }
4350 if (m->m_type == 0) {
3e170ce0 4351 panic("%p m_type zero", m);
0a7de745 4352 }
4353 /*
4354 * Clip to the residual length
4355 */
0a7de745 4356 if (len > m->m_len) {
fe8ab488 4357 len = m->m_len;
0a7de745 4358 }
3e170ce0 4359 pktlen += len;
fe8ab488 4360 /*
3e170ce0 4361 * Copy the mbufs via the uio or delay the copy
4362 * Sockbuf must be consistent here (points to current mbuf,
4363 * it points to next record) when we drop priority;
4364 * we must note any additions to the sockbuf when we
4365 * block interrupts again.
4366 */
3e170ce0 4367 if (len > 0 && can_delay == 0) {
4368 socket_unlock(so, 0);
4369 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4370 socket_lock(so, 0);
0a7de745 4371 if (error) {
fe8ab488 4372 goto release;
0a7de745 4373 }
4374 } else {
4375 delayed_copy_len += len;
fe8ab488 4376 }
3e170ce0 4377
4378 if (len == m->m_len) {
4379 /*
3e170ce0 4380 * m was entirely copied
fe8ab488 4381 */
fe8ab488 4382 sbfree(&so->so_rcv, m);
3e170ce0 4383 nextrecord = m->m_nextpkt;
4384 m->m_nextpkt = NULL;
4385
4386 /*
3e170ce0 4387 * Set the first packet to the head of the free list
fe8ab488 4388 */
0a7de745 4389 if (free_list == NULL) {
3e170ce0 4390 free_list = m;
0a7de745 4391 }
4392 /*
4393 * Link current packet to tail of free list
4394 */
4395 if (ml == NULL) {
0a7de745 4396 if (free_tail != NULL) {
3e170ce0 4397 free_tail->m_nextpkt = m;
0a7de745 4398 }
3e170ce0 4399 free_tail = m;
fe8ab488 4400 }
4401 /*
4402 * Link current mbuf to last mbuf of current packet
4403 */
0a7de745 4404 if (ml != NULL) {
3e170ce0 4405 ml->m_next = m;
0a7de745 4406 }
4407 ml = m;
4408
4409 /*
4410 * Move next buf to head of socket buffer
4411 */
4412 so->so_rcv.sb_mb = m = ml->m_next;
4413 ml->m_next = NULL;
4414
4415 if (m != NULL) {
4416 m->m_nextpkt = nextrecord;
0a7de745 4417 if (nextrecord == NULL) {
fe8ab488 4418 so->so_rcv.sb_lastrecord = m;
0a7de745 4419 }
4420 } else {
4421 so->so_rcv.sb_mb = nextrecord;
4422 SB_EMPTY_FIXUP(&so->so_rcv);
4423 }
4424 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4425 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4426 } else {
4427 /*
4428 * Stop the loop on partial copy
4429 */
4430 break;
4431 }
4432 }
4433#ifdef MORE_LOCKING_DEBUG
4434 if (so->so_usecount <= 1) {
4435 panic("%s: after big while so=%llx ref=%d on socket\n",
4436 __func__,
3e170ce0 4437 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4438 /* NOTREACHED */
4439 }
4440#endif
4441 /*
4442 * Tell the caller we made a partial copy
4443 */
4444 if (m != NULL) {
4445 if (so->so_options & SO_DONTTRUNC) {
4446 /*
4447 * Copy out the free list first, then the partial mbuf
4448 */
4449 socket_unlock(so, 0);
0a7de745 4450 if (delayed_copy_len) {
4451 error = sodelayed_copy_list(so, msgarray,
4452 uiocnt, &free_list, &delayed_copy_len);
0a7de745 4453 }
4454
4455 if (error == 0) {
4456 error = uiomove(mtod(m, caddr_t), (int)len,
4457 auio);
4458 }
4459 socket_lock(so, 0);
0a7de745 4460 if (error) {
3e170ce0 4461 goto release;
0a7de745 4462 }
3e170ce0 4463
4464 m->m_data += len;
4465 m->m_len -= len;
4466 so->so_rcv.sb_cc -= len;
4467 flags |= MSG_RCVMORE;
4468 } else {
4469 (void) sbdroprecord(&so->so_rcv);
4470 nextrecord = so->so_rcv.sb_mb;
4471 m = NULL;
4472 flags |= MSG_TRUNC;
4473 }
4474 }
4475
4476 if (m == NULL) {
4477 so->so_rcv.sb_mb = nextrecord;
4478 /*
4479 * First part is an inline SB_EMPTY_FIXUP(). Second
4480 * part makes sure sb_lastrecord is up-to-date if
4481 * there is still data in the socket buffer.
4482 */
4483 if (so->so_rcv.sb_mb == NULL) {
4484 so->so_rcv.sb_mbtail = NULL;
4485 so->so_rcv.sb_lastrecord = NULL;
4486 } else if (nextrecord->m_nextpkt == NULL) {
4487 so->so_rcv.sb_lastrecord = nextrecord;
4488 }
4489 SB_MB_CHECK(&so->so_rcv);
4490 }
4491 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4492 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4493
4494 /*
4495 * We can continue to the next packet as long as:
4496 * - We haven't exhausted the uio array
4497 * - There was no error
4498 * - A packet was not truncated
4499 * - We can still receive more data
4500 */
4501 if (npkts < uiocnt && error == 0 &&
4502 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4503 (so->so_state & SS_CANTRCVMORE) == 0) {
0a7de745 4504 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4505 sblocked = 0;
4506
3e170ce0 4507 goto next;
fe8ab488 4508 }
0a7de745 4509 if (flagsp != NULL) {
3e170ce0 4510 *flagsp |= flags;
0a7de745 4511 }
4512
4513release:
4514 /*
4515 * pru_rcvd may cause more data to be received if the socket lock
4516 * is dropped so we set MSG_HAVEMORE now based on what we know.
4517 * That way the caller won't be surprised if it receives less data
4518 * than requested.
fe8ab488 4519 */
0a7de745 4520 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
fe8ab488 4521 flags |= MSG_HAVEMORE;
0a7de745 4522 }
fe8ab488 4523
0a7de745 4524 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
fe8ab488 4525 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
0a7de745 4526 }
fe8ab488 4527
4528 if (sblocked) {
4529 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4530 } else {
fe8ab488 4531 socket_unlock(so, 1);
0a7de745 4532 }
3e170ce0 4533
0a7de745 4534 if (delayed_copy_len) {
4535 error = sodelayed_copy_list(so, msgarray, uiocnt,
4536 &free_list, &delayed_copy_len);
0a7de745 4537 }
4538out:
4539 /*
3e170ce0 4540 * Amortize the cost of freeing the mbufs
fe8ab488 4541 */
0a7de745 4542 if (free_list != NULL) {
fe8ab488 4543 m_freem_list(free_list);
4544 }
4545 if (free_others != NULL) {
3e170ce0 4546 m_freem_list(free_others);
0a7de745 4547 }
4548
4549 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4550 0, 0, 0, 0);
0a7de745 4551 return error;
4552}
4553
4554static int
4555so_statistics_event_to_nstat_event(int64_t *input_options,
4556 uint64_t *nstat_event)
4557{
4558 int error = 0;
4559 switch (*input_options) {
4560 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4561 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4562 break;
4563 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4564 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4565 break;
4566#if (DEBUG || DEVELOPMENT)
4567 case SO_STATISTICS_EVENT_RESERVED_1:
4568 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4569 break;
4570 case SO_STATISTICS_EVENT_RESERVED_2:
4571 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4572 break;
4573#endif /* (DEBUG || DEVELOPMENT) */
4574 default:
4575 error = EINVAL;
4576 break;
4577 }
4578 return error;
4579}
4580
4581/*
4582 * Returns: 0 Success
4583 * EINVAL
4584 * ENOTCONN
4585 * <pru_shutdown>:EINVAL
4586 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4587 * <pru_shutdown>:ENOBUFS[TCP]
4588 * <pru_shutdown>:EMSGSIZE[TCP]
4589 * <pru_shutdown>:EHOSTUNREACH[TCP]
4590 * <pru_shutdown>:ENETUNREACH[TCP]
4591 * <pru_shutdown>:ENETDOWN[TCP]
4592 * <pru_shutdown>:ENOMEM[TCP]
4593 * <pru_shutdown>:EACCES[TCP]
4594 * <pru_shutdown>:EMSGSIZE[TCP]
4595 * <pru_shutdown>:ENOBUFS[TCP]
4596 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4597 * <pru_shutdown>:??? [other protocol families]
4598 */
4599int
4600soshutdown(struct socket *so, int how)
4601{
4602 int error;
4603
4604 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4605
4606 switch (how) {
4607 case SHUT_RD:
4608 case SHUT_WR:
4609 case SHUT_RDWR:
4610 socket_lock(so, 1);
4611 if ((so->so_state &
0a7de745 4612 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
fe8ab488 4613 error = ENOTCONN;
4614 } else {
4615 error = soshutdownlock(so, how);
4616 }
4617 socket_unlock(so, 1);
4618 break;
4619 default:
4620 error = EINVAL;
4621 break;
55e303ae 4622 }
55e303ae 4623
4624 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4625
0a7de745 4626 return error;
4627}
4628
1c79356b 4629int
fe8ab488 4630soshutdownlock_final(struct socket *so, int how)
1c79356b 4631{
4632 struct protosw *pr = so->so_proto;
4633 int error = 0;
1c79356b 4634
91447636 4635 sflt_notify(so, sock_evt_shutdown, &how);
1c79356b 4636
9bccf70c 4637 if (how != SHUT_WR) {
4638 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4639 /* read already shut down */
4640 error = ENOTCONN;
4641 goto done;
4642 }
4643 sorflush(so);
4644 postevent(so, 0, EV_RCLOSED);
4645 }
9bccf70c 4646 if (how != SHUT_RD) {
4647 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4648 /* write already shut down */
4649 error = ENOTCONN;
4650 goto done;
4651 }
4652 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4653 postevent(so, 0, EV_WCLOSED);
1c79356b 4654 }
2d21ac55 4655done:
fe8ab488 4656 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
0a7de745 4657 return error;
4658}
4659
4660int
4661soshutdownlock(struct socket *so, int how)
4662{
4663 int error = 0;
4664
4665#if CONTENT_FILTER
4666 /*
4667 * A content filter may delay the actual shutdown until it
4668 * has processed the pending data
4669 */
4670 if (so->so_flags & SOF_CONTENT_FILTER) {
4671 error = cfil_sock_shutdown(so, &how);
4672 if (error == EJUSTRETURN) {
4673 error = 0;
4674 goto done;
4675 } else if (error != 0) {
4676 goto done;
4677 }
4678 }
4679#endif /* CONTENT_FILTER */
3e170ce0 4680
4681 error = soshutdownlock_final(so, how);
4682
4683done:
0a7de745 4684 return error;
4685}
4686
4687void
4688sowflush(struct socket *so)
4689{
4690 struct sockbuf *sb = &so->so_snd;
4691
4692 /*
4693 * Obtain lock on the socket buffer (SB_LOCK). This is required
4694 * to prevent the socket buffer from being unexpectedly altered
4695 * while it is used by another thread in socket send/receive.
4696 *
4697 * sblock() must not fail here, hence the assertion.
4698 */
4699 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4700 VERIFY(sb->sb_flags & SB_LOCK);
4701
4702 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4703 sb->sb_flags |= SB_DROP;
4704 sb->sb_upcall = NULL;
4705 sb->sb_upcallarg = NULL;
39236c6e 4706
0a7de745 4707 sbunlock(sb, TRUE); /* keep socket locked */
4708
4709 selthreadclear(&sb->sb_sel);
4710 sbrelease(sb);
4711}
4712
1c79356b 4713void
2d21ac55 4714sorflush(struct socket *so)
1c79356b 4715{
4716 struct sockbuf *sb = &so->so_rcv;
4717 struct protosw *pr = so->so_proto;
1c79356b 4718 struct sockbuf asb;
39236c6e 4719#ifdef notyet
2d21ac55 4720 lck_mtx_t *mutex_held;
4721 /*
4722 * XXX: This code is currently commented out, because we may get here
4723 * as part of sofreelastref(), and at that time, pr_getlock() may no
4724 * longer be able to return us the lock; this will be fixed in future.
4725 */
0a7de745 4726 if (so->so_proto->pr_getlock != NULL) {
91447636 4727 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
0a7de745 4728 } else {
91447636 4729 mutex_held = so->so_proto->pr_domain->dom_mtx;
0a7de745 4730 }
39236c6e 4731
5ba3f43e 4732 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
39236c6e 4733#endif /* notyet */
4734
4735 sflt_notify(so, sock_evt_flush_read, NULL);
1c79356b 4736
1c79356b 4737 socantrcvmore(so);
4738
4739 /*
4740 * Obtain lock on the socket buffer (SB_LOCK). This is required
4741 * to prevent the socket buffer from being unexpectedly altered
4742 * while it is used by another thread in socket send/receive.
4743 *
4744 * sblock() must not fail here, hence the assertion.
4745 */
4746 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4747 VERIFY(sb->sb_flags & SB_LOCK);
4748
4749 /*
4750 * Copy only the relevant fields from "sb" to "asb" which we
4751 * need for sbrelease() to function. In particular, skip
4752 * sb_sel as it contains the wait queue linkage, which would
4753 * wreak havoc if we were to issue selthreadclear() on "asb".
4754 * Make sure to not carry over SB_LOCK in "asb", as we need
4755 * to acquire it later as part of sbrelease().
4756 */
4757 bzero(&asb, sizeof(asb));
4758 asb.sb_cc = sb->sb_cc;
4759 asb.sb_hiwat = sb->sb_hiwat;
4760 asb.sb_mbcnt = sb->sb_mbcnt;
4761 asb.sb_mbmax = sb->sb_mbmax;
4762 asb.sb_ctl = sb->sb_ctl;
4763 asb.sb_lowat = sb->sb_lowat;
4764 asb.sb_mb = sb->sb_mb;
4765 asb.sb_mbtail = sb->sb_mbtail;
4766 asb.sb_lastrecord = sb->sb_lastrecord;
4767 asb.sb_so = sb->sb_so;
4768 asb.sb_flags = sb->sb_flags;
4769 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4770 asb.sb_flags |= SB_DROP;
4771
4772 /*
4773 * Ideally we'd bzero() these and preserve the ones we need;
4774 * but to do that we'd need to shuffle things around in the
4775 * sockbuf, and we can't do it now because there are KEXTS
4776 * that are directly referring to the socket structure.
4777 *
4778 * Setting SB_DROP acts as a barrier to prevent further appends.
4779 * Clearing SB_SEL is done for selthreadclear() below.
4780 */
4781 sb->sb_cc = 0;
4782 sb->sb_hiwat = 0;
4783 sb->sb_mbcnt = 0;
4784 sb->sb_mbmax = 0;
4785 sb->sb_ctl = 0;
4786 sb->sb_lowat = 0;
4787 sb->sb_mb = NULL;
4788 sb->sb_mbtail = NULL;
4789 sb->sb_lastrecord = NULL;
4790 sb->sb_timeo.tv_sec = 0;
4791 sb->sb_timeo.tv_usec = 0;
4792 sb->sb_upcall = NULL;
4793 sb->sb_upcallarg = NULL;
4794 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4795 sb->sb_flags |= SB_DROP;
4796
4797 sbunlock(sb, TRUE); /* keep socket locked */
4798
4799 /*
4800 * Note that selthreadclear() is called on the original "sb" and
4801 * not the local "asb" because of the way wait queue linkage is
4802 * implemented. Given that selwakeup() may be triggered, SB_SEL
4803 * should no longer be set (cleared above.)
4804 */
0b4e3aa0 4805 selthreadclear(&sb->sb_sel);
39236c6e 4806
0a7de745 4807 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
1c79356b 4808 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
0a7de745 4809 }
39236c6e 4810
4811 sbrelease(&asb);
4812}
4813
4814/*
4815 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4816 * an additional variant to handle the case where the option value needs
4817 * to be some kind of integer, but not a specific size.
4818 * In addition to their use here, these functions are also called by the
4819 * protocol-level pr_ctloutput() routines.
4820 *
4821 * Returns: 0 Success
4822 * EINVAL
4823 * copyin:EFAULT
4824 */
4825int
2d21ac55 4826sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
1c79356b 4827{
0a7de745 4828 size_t valsize;
4829
4830 /*
4831 * If the user gives us more than we wanted, we ignore it,
4832 * but if we don't get the minimum length the caller
4833 * wants, we return EINVAL. On success, sopt->sopt_valsize
4834 * is set to however much we actually retrieved.
4835 */
4836 if ((valsize = sopt->sopt_valsize) < minlen) {
4837 return EINVAL;
4838 }
4839 if (valsize > len) {
1c79356b 4840 sopt->sopt_valsize = valsize = len;
0a7de745 4841 }
1c79356b 4842
4843 if (sopt->sopt_p != kernproc) {
4844 return copyin(sopt->sopt_val, buf, valsize);
4845 }
1c79356b 4846
91447636 4847 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
0a7de745 4848 return 0;
4849}
4850
4851/*
4852 * sooptcopyin_timeval
4853 * Copy in a timeval value into tv_p, and take into account whether
4854 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4855 * code here so that we can verify the 64-bit tv_sec value before we lose
4856 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4857 */
4858static int
39236c6e 4859sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
2d21ac55 4860{
0a7de745 4861 int error;
b0d623f7 4862
2d21ac55 4863 if (proc_is64bit(sopt->sopt_p)) {
0a7de745 4864 struct user64_timeval tv64;
2d21ac55 4865
4866 if (sopt->sopt_valsize < sizeof(tv64)) {
4867 return EINVAL;
4868 }
39236c6e 4869
0a7de745 4870 sopt->sopt_valsize = sizeof(tv64);
b0d623f7 4871 if (sopt->sopt_p != kernproc) {
4872 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4873 if (error != 0) {
4874 return error;
4875 }
4876 } else {
4877 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
0a7de745 4878 sizeof(tv64));
2d21ac55 4879 }
39236c6e 4880 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4881 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4882 return EDOM;
4883 }
39236c6e 4884
4885 tv_p->tv_sec = tv64.tv_sec;
4886 tv_p->tv_usec = tv64.tv_usec;
4887 } else {
0a7de745 4888 struct user32_timeval tv32;
b0d623f7 4889
4890 if (sopt->sopt_valsize < sizeof(tv32)) {
4891 return EINVAL;
4892 }
39236c6e 4893
0a7de745 4894 sopt->sopt_valsize = sizeof(tv32);
b0d623f7 4895 if (sopt->sopt_p != kernproc) {
0a7de745 4896 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
2d21ac55 4897 if (error != 0) {
0a7de745 4898 return error;
4899 }
4900 } else {
b0d623f7 4901 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
0a7de745 4902 sizeof(tv32));
2d21ac55 4903 }
4904#ifndef __LP64__
4905 /*
4906 * K64todo "comparison is always false due to
4907 * limited range of data type"
4908 */
4909 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4910 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4911 return EDOM;
4912 }
4913#endif
4914 tv_p->tv_sec = tv32.tv_sec;
4915 tv_p->tv_usec = tv32.tv_usec;
2d21ac55 4916 }
0a7de745 4917 return 0;
4918}
4919
5ba3f43e 4920int
4921soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4922 boolean_t ignore_delegate)
4923{
4924 kauth_cred_t cred = NULL;
4925 proc_t ep = PROC_NULL;
4926 uid_t uid;
4927 int error = 0;
39037602 4928
cb323159 4929 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
39037602 4930 ep = proc_find(so->e_pid);
0a7de745 4931 if (ep) {
39037602 4932 cred = kauth_cred_proc_ref(ep);
0a7de745 4933 }
39037602 4934 }
4935
4936 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4937
4938 /* uid is 0 for root */
0a7de745 4939 if (uid != 0 || !allow_root) {
5ba3f43e 4940 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4941 }
4942 if (cred) {
39037602 4943 kauth_cred_unref(&cred);
4944 }
4945 if (ep != PROC_NULL) {
39037602 4946 proc_rele(ep);
0a7de745 4947 }
39037602 4948
0a7de745 4949 return error;
4950}
4951
4952/*
4953 * Returns: 0 Success
4954 * EINVAL
4955 * ENOPROTOOPT
4956 * ENOBUFS
4957 * EDOM
4958 * sooptcopyin:EINVAL
4959 * sooptcopyin:EFAULT
4960 * sooptcopyin_timeval:EINVAL
4961 * sooptcopyin_timeval:EFAULT
4962 * sooptcopyin_timeval:EDOM
4963 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4964 * <pr_ctloutput>:???w
4965 * sflt_attach_private:??? [whatever a filter author chooses]
4966 * <sf_setoption>:??? [whatever a filter author chooses]
4967 *
4968 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4969 * <sf_setoption> returns depend on what the filter author causes
4970 * their filter to return.
4971 */
1c79356b 4972int
39236c6e 4973sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
1c79356b 4974{
0a7de745 4975 int error, optval;
cb323159 4976 int64_t long_optval;
4977 struct linger l;
4978 struct timeval tv;
2d21ac55
A
4979#if CONFIG_MACF_SOCKET
4980 struct mac extmac;
4981#endif /* MAC_SOCKET */
91447636 4982
0a7de745 4983 if (sopt->sopt_dir != SOPT_SET) {
39236c6e 4984 sopt->sopt_dir = SOPT_SET;
0a7de745 4985 }
39236c6e 4986
0a7de745 4987 if (dolock) {
39236c6e 4988 socket_lock(so, 1);
0a7de745 4989 }
4990
4991 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4992 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
b0d623f7 4993 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4994 /* the socket has been shutdown, no more sockopt's */
4995 error = EINVAL;
39236c6e 4996 goto out;
4997 }
4998
6d2010ae 4999 error = sflt_setsockopt(so, sopt);
39236c6e 5000 if (error != 0) {
0a7de745 5001 if (error == EJUSTRETURN) {
6d2010ae 5002 error = 0;
0a7de745 5003 }
39236c6e 5004 goto out;
5005 }
5006
1c79356b 5007 if (sopt->sopt_level != SOL_SOCKET) {
5008 if (so->so_proto != NULL &&
5009 so->so_proto->pr_ctloutput != NULL) {
2d21ac55 5010 error = (*so->so_proto->pr_ctloutput)(so, sopt);
39236c6e 5011 goto out;
91447636 5012 }
5013 error = ENOPROTOOPT;
5014 } else {
5015 /*
5016 * Allow socket-level (SOL_SOCKET) options to be filtered by
5017 * the protocol layer, if needed. A zero value returned from
5018 * the handler means use default socket-level processing as
5019 * done by the rest of this routine. Otherwise, any other
5020 * return value indicates that the option is unsupported.
5021 */
5022 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
0a7de745 5023 pru_socheckopt(so, sopt)) != 0) {
39236c6e 5024 goto out;
0a7de745 5025 }
5026
5027 error = 0;
5028 switch (sopt->sopt_name) {
5029 case SO_LINGER:
91447636 5030 case SO_LINGER_SEC:
5031 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5032 if (error != 0) {
39236c6e 5033 goto out;
0a7de745 5034 }
1c79356b 5035
5036 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5037 l.l_linger : l.l_linger * hz;
0a7de745 5038 if (l.l_onoff != 0) {
1c79356b 5039 so->so_options |= SO_LINGER;
0a7de745 5040 } else {
1c79356b 5041 so->so_options &= ~SO_LINGER;
0a7de745 5042 }
5043 break;
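/*
 * Userland sketch (added commentary): both flavors arrive here as,
 * e.g.,
 *
 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &l, sizeof(l));
 *
 * SO_LINGER_SEC differs from SO_LINGER only in that l_linger is
 * converted from seconds to scheduler ticks (* hz) before being
 * stored in so_linger.
 */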
5044
5045 case SO_DEBUG:
5046 case SO_KEEPALIVE:
5047 case SO_DONTROUTE:
5048 case SO_USELOOPBACK:
5049 case SO_BROADCAST:
5050 case SO_REUSEADDR:
5051 case SO_REUSEPORT:
5052 case SO_OOBINLINE:
5053 case SO_TIMESTAMP:
6d2010ae 5054 case SO_TIMESTAMP_MONOTONIC:
d9a64523 5055 case SO_TIMESTAMP_CONTINUOUS:
5056 case SO_DONTTRUNC:
5057 case SO_WANTMORE:
9bccf70c 5058 case SO_WANTOOBFLAG:
fe8ab488 5059 case SO_NOWAKEFROMSLEEP:
39037602 5060 case SO_NOAPNFALLBK:
5061 error = sooptcopyin(sopt, &optval, sizeof(optval),
5062 sizeof(optval));
5063 if (error != 0) {
39236c6e 5064 goto out;
5065 }
5066 if (optval) {
1c79356b 5067 so->so_options |= sopt->sopt_name;
0a7de745 5068 } else {
1c79356b 5069 so->so_options &= ~sopt->sopt_name;
0a7de745 5070 }
5071 break;
5072
5073 case SO_SNDBUF:
5074 case SO_RCVBUF:
5075 case SO_SNDLOWAT:
5076 case SO_RCVLOWAT:
5077 error = sooptcopyin(sopt, &optval, sizeof(optval),
5078 sizeof(optval));
5079 if (error != 0) {
39236c6e 5080 goto out;
0a7de745 5081 }
5082
5083 /*
5084 * Values < 1 make no sense for any of these
5085 * options, so disallow them.
5086 */
5087 if (optval < 1) {
5088 error = EINVAL;
39236c6e 5089 goto out;
5090 }
5091
5092 switch (sopt->sopt_name) {
5093 case SO_SNDBUF:
5094 case SO_RCVBUF: {
5095 struct sockbuf *sb =
5096 (sopt->sopt_name == SO_SNDBUF) ?
5097 &so->so_snd : &so->so_rcv;
5098 if (sbreserve(sb, (u_int32_t)optval) == 0) {
1c79356b 5099 error = ENOBUFS;
39236c6e 5100 goto out;
1c79356b 5101 }
5102 sb->sb_flags |= SB_USRSIZE;
5103 sb->sb_flags &= ~SB_AUTOSIZE;
5104 sb->sb_idealsize = (u_int32_t)optval;
1c79356b 5105 break;
316670eb 5106 }
5107 /*
5108 * Make sure the low-water is never greater than
5109 * the high-water.
5110 */
5111 case SO_SNDLOWAT: {
5112 int space = sbspace(&so->so_snd);
5113 u_int32_t hiwat = so->so_snd.sb_hiwat;
5114
5115 if (so->so_snd.sb_flags & SB_UNIX) {
5116 struct unpcb *unp =
5117 (struct unpcb *)(so->so_pcb);
5118 if (unp != NULL &&
5119 unp->unp_conn != NULL) {
5120 hiwat += unp->unp_conn->unp_cc;
5121 }
5122 }
5123
1c79356b 5124 so->so_snd.sb_lowat =
5125 (optval > hiwat) ?
5126 hiwat : optval;
5127
5128 if (space >= so->so_snd.sb_lowat) {
5129 sowwakeup(so);
5130 }
1c79356b 5131 break;
3e170ce0 5132 }
5133 case SO_RCVLOWAT: {
5134 int64_t data_len;
5135 so->so_rcv.sb_lowat =
5136 (optval > so->so_rcv.sb_hiwat) ?
5137 so->so_rcv.sb_hiwat : optval;
3e170ce0 5138 data_len = so->so_rcv.sb_cc
fe8ab488 5139 - so->so_rcv.sb_ctl;
5140 if (data_len >= so->so_rcv.sb_lowat) {
5141 sorwakeup(so);
5142 }
5143 break;
5144 }
fe8ab488 5145 }
5146 break;
5147
5148 case SO_SNDTIMEO:
5149 case SO_RCVTIMEO:
2d21ac55 5150 error = sooptcopyin_timeval(sopt, &tv);
0a7de745 5151 if (error != 0) {
39236c6e 5152 goto out;
0a7de745 5153 }
1c79356b 5154
5155 switch (sopt->sopt_name) {
5156 case SO_SNDTIMEO:
91447636 5157 so->so_snd.sb_timeo = tv;
5158 break;
5159 case SO_RCVTIMEO:
91447636 5160 so->so_rcv.sb_timeo = tv;
5161 break;
5162 }
5163 break;
5164
39236c6e 5165 case SO_NKE: {
9bccf70c 5166 struct so_nke nke;
1c79356b 5167
5168 error = sooptcopyin(sopt, &nke, sizeof(nke),
5169 sizeof(nke));
5170 if (error != 0) {
39236c6e 5171 goto out;
0a7de745 5172 }
1c79356b 5173
6d2010ae 5174 error = sflt_attach_internal(so, nke.nke_handle);
5175 break;
5176 }
5177
9bccf70c 5178 case SO_NOSIGPIPE:
5179 error = sooptcopyin(sopt, &optval, sizeof(optval),
5180 sizeof(optval));
5181 if (error != 0) {
39236c6e 5182 goto out;
5183 }
5184 if (optval != 0) {
2d21ac55 5185 so->so_flags |= SOF_NOSIGPIPE;
0a7de745 5186 } else {
2d21ac55 5187 so->so_flags &= ~SOF_NOSIGPIPE;
0a7de745 5188 }
5189 break;
5190
55e303ae 5191 case SO_NOADDRERR:
5192 error = sooptcopyin(sopt, &optval, sizeof(optval),
5193 sizeof(optval));
5194 if (error != 0) {
39236c6e 5195 goto out;
0a7de745
A
5196 }
5197 if (optval != 0) {
2d21ac55 5198 so->so_flags |= SOF_NOADDRAVAIL;
0a7de745 5199 } else {
2d21ac55 5200 so->so_flags &= ~SOF_NOADDRAVAIL;
0a7de745 5201 }
5202 break;
5203
5204 case SO_REUSESHAREUID:
5205 error = sooptcopyin(sopt, &optval, sizeof(optval),
5206 sizeof(optval));
5207 if (error != 0) {
39236c6e 5208 goto out;
0a7de745
A
5209 }
5210 if (optval != 0) {
2d21ac55 5211 so->so_flags |= SOF_REUSESHAREUID;
0a7de745 5212 } else {
2d21ac55 5213 so->so_flags &= ~SOF_REUSESHAREUID;
0a7de745 5214 }
2d21ac55 5215 break;
39236c6e 5216
2d21ac55
A
5217 case SO_NOTIFYCONFLICT:
5218 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5219 error = EPERM;
39236c6e 5220 goto out;
2d21ac55 5221 }
0a7de745
A
5222 error = sooptcopyin(sopt, &optval, sizeof(optval),
5223 sizeof(optval));
5224 if (error != 0) {
39236c6e 5225 goto out;
0a7de745
A
5226 }
5227 if (optval != 0) {
2d21ac55 5228 so->so_flags |= SOF_NOTIFYCONFLICT;
0a7de745 5229 } else {
2d21ac55 5230 so->so_flags &= ~SOF_NOTIFYCONFLICT;
0a7de745 5231 }
2d21ac55 5232 break;
39236c6e 5233
2d21ac55 5234 case SO_RESTRICTIONS:
0a7de745
A
5235 error = sooptcopyin(sopt, &optval, sizeof(optval),
5236 sizeof(optval));
5237 if (error != 0) {
39236c6e 5238 goto out;
0a7de745 5239 }
39236c6e
A
5240
5241 error = so_set_restrictions(so, optval);
2d21ac55
A
5242 break;
5243
fe8ab488
A
5244 case SO_AWDL_UNRESTRICTED:
5245 if (SOCK_DOM(so) != PF_INET &&
5246 SOCK_DOM(so) != PF_INET6) {
5247 error = EOPNOTSUPP;
5248 goto out;
5249 }
5250 error = sooptcopyin(sopt, &optval, sizeof(optval),
5251 sizeof(optval));
0a7de745 5252 if (error != 0) {
fe8ab488 5253 goto out;
0a7de745 5254 }
fe8ab488 5255 if (optval != 0) {
39037602 5256 error = soopt_cred_check(so,
cb323159 5257 PRIV_NET_RESTRICTED_AWDL, false, false);
0a7de745 5258 if (error == 0) {
fe8ab488 5259 inp_set_awdl_unrestricted(
0a7de745
A
5260 sotoinpcb(so));
5261 }
5262 } else {
fe8ab488 5263 inp_clear_awdl_unrestricted(sotoinpcb(so));
0a7de745 5264 }
fe8ab488 5265 break;
39037602
A
5266 case SO_INTCOPROC_ALLOW:
5267 if (SOCK_DOM(so) != PF_INET6) {
5268 error = EOPNOTSUPP;
5269 goto out;
5270 }
5271 error = sooptcopyin(sopt, &optval, sizeof(optval),
5272 sizeof(optval));
0a7de745 5273 if (error != 0) {
39037602 5274 goto out;
0a7de745 5275 }
743345f9 5276 if (optval != 0 &&
0a7de745 5277 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
39037602 5278 error = soopt_cred_check(so,
cb323159 5279 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
0a7de745 5280 if (error == 0) {
39037602 5281 inp_set_intcoproc_allowed(
0a7de745
A
5282 sotoinpcb(so));
5283 }
5284 } else if (optval == 0) {
39037602 5285 inp_clear_intcoproc_allowed(sotoinpcb(so));
0a7de745 5286 }
39037602 5287 break;
fe8ab488 5288
2d21ac55
A
5289 case SO_LABEL:
5290#if CONFIG_MACF_SOCKET
0a7de745
A
5291 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5292 sizeof(extmac))) != 0) {
39236c6e 5293 goto out;
0a7de745 5294 }
2d21ac55
A
5295
5296 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5297 so, &extmac);
5298#else
5299 error = EOPNOTSUPP;
5300#endif /* MAC_SOCKET */
55e303ae
A
5301 break;
5302
4a3eedf9 5303 case SO_UPCALLCLOSEWAIT:
0a7de745
A
5304 error = sooptcopyin(sopt, &optval, sizeof(optval),
5305 sizeof(optval));
5306 if (error != 0) {
39236c6e 5307 goto out;
0a7de745
A
5308 }
5309 if (optval != 0) {
4a3eedf9 5310 so->so_flags |= SOF_UPCALLCLOSEWAIT;
0a7de745 5311 } else {
4a3eedf9 5312 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
0a7de745 5313 }
4a3eedf9 5314 break;
4a3eedf9 5315
b0d623f7 5316 case SO_RANDOMPORT:
0a7de745
A
5317 error = sooptcopyin(sopt, &optval, sizeof(optval),
5318 sizeof(optval));
5319 if (error != 0) {
39236c6e 5320 goto out;
0a7de745
A
5321 }
5322 if (optval != 0) {
b0d623f7 5323 so->so_flags |= SOF_BINDRANDOMPORT;
0a7de745 5324 } else {
b0d623f7 5325 so->so_flags &= ~SOF_BINDRANDOMPORT;
0a7de745 5326 }
b0d623f7
A
5327 break;
5328
5329 case SO_NP_EXTENSIONS: {
5330 struct so_np_extensions sonpx;
5331
0a7de745
A
5332 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5333 sizeof(sonpx));
5334 if (error != 0) {
39236c6e 5335 goto out;
0a7de745 5336 }
b0d623f7
A
5337 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5338 error = EINVAL;
39236c6e 5339 goto out;
b0d623f7
A
5340 }
5341 /*
5342 * Only one bit defined for now
5343 */
5344 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
0a7de745 5345 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
b0d623f7 5346 so->so_flags |= SOF_NPX_SETOPTSHUT;
0a7de745 5347 } else {
b0d623f7 5348 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
0a7de745 5349 }
b0d623f7
A
5350 }
5351 break;
5352 }
5353
d41d1dae 5354 case SO_TRAFFIC_CLASS: {
0a7de745
A
5355 error = sooptcopyin(sopt, &optval, sizeof(optval),
5356 sizeof(optval));
5357 if (error != 0) {
39236c6e 5358 goto out;
0a7de745 5359 }
39037602
A
5360 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5361 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5362 error = so_set_net_service_type(so, netsvc);
5363 goto out;
5364 }
6d2010ae 5365 error = so_set_traffic_class(so, optval);
0a7de745 5366 if (error != 0) {
39236c6e 5367 goto out;
0a7de745 5368 }
39037602
A
5369 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5370 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
6d2010ae 5371 break;
d41d1dae 5372 }
6d2010ae
A
5373
5374 case SO_RECV_TRAFFIC_CLASS: {
0a7de745
A
5375 error = sooptcopyin(sopt, &optval, sizeof(optval),
5376 sizeof(optval));
5377 if (error != 0) {
39236c6e 5378 goto out;
0a7de745
A
5379 }
5380 if (optval == 0) {
6d2010ae 5381 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
0a7de745 5382 } else {
6d2010ae 5383 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
0a7de745 5384 }
6d2010ae
A
5385 break;
5386 }
316670eb 5387
39037602 5388#if (DEVELOPMENT || DEBUG)
6d2010ae
A
5389 case SO_TRAFFIC_CLASS_DBG: {
5390 struct so_tcdbg so_tcdbg;
316670eb
A
5391
5392 error = sooptcopyin(sopt, &so_tcdbg,
0a7de745
A
5393 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5394 if (error != 0) {
39236c6e 5395 goto out;
0a7de745 5396 }
6d2010ae 5397 error = so_set_tcdbg(so, &so_tcdbg);
0a7de745 5398 if (error != 0) {
39236c6e 5399 goto out;
0a7de745 5400 }
6d2010ae
A
5401 break;
5402 }
39037602 5403#endif /* (DEVELOPMENT || DEBUG) */
316670eb
A
5404
5405 case SO_PRIVILEGED_TRAFFIC_CLASS:
5406 error = priv_check_cred(kauth_cred_get(),
5407 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
0a7de745 5408 if (error != 0) {
39236c6e 5409 goto out;
0a7de745
A
5410 }
5411 error = sooptcopyin(sopt, &optval, sizeof(optval),
5412 sizeof(optval));
5413 if (error != 0) {
39236c6e 5414 goto out;
0a7de745
A
5415 }
5416 if (optval == 0) {
316670eb 5417 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
0a7de745 5418 } else {
316670eb 5419 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
0a7de745 5420 }
316670eb
A
5421 break;
5422
a39ff7e2
A
5423#if (DEVELOPMENT || DEBUG)
5424 case SO_DEFUNCTIT:
5425 error = sosetdefunct(current_proc(), so, 0, FALSE);
0a7de745 5426 if (error == 0) {
a39ff7e2 5427 error = sodefunct(current_proc(), so, 0);
0a7de745 5428 }
a39ff7e2
A
5429
5430 break;
5431#endif /* (DEVELOPMENT || DEBUG) */
5432
6d2010ae 5433 case SO_DEFUNCTOK:
0a7de745
A
5434 error = sooptcopyin(sopt, &optval, sizeof(optval),
5435 sizeof(optval));
6d2010ae 5436 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
0a7de745 5437 if (error == 0) {
6d2010ae 5438 error = EBADF;
0a7de745 5439 }
39236c6e 5440 goto out;
6d2010ae
A
5441 }
5442 /*
5443 * Any process can set SO_DEFUNCTOK (clear
5444 * SOF_NODEFUNCT), but only root can clear
5445 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5446 */
5447 if (optval == 0 &&
5448 kauth_cred_issuser(kauth_cred_get()) == 0) {
5449 error = EPERM;
39236c6e 5450 goto out;
6d2010ae 5451 }
0a7de745 5452 if (optval) {
6d2010ae 5453 so->so_flags &= ~SOF_NODEFUNCT;
0a7de745 5454 } else {
6d2010ae 5455 so->so_flags |= SOF_NODEFUNCT;
0a7de745 5456 }
6d2010ae 5457
39236c6e
A
5458 if (SOCK_DOM(so) == PF_INET ||
5459 SOCK_DOM(so) == PF_INET6) {
5460 char s[MAX_IPv6_STR_LEN];
5461 char d[MAX_IPv6_STR_LEN];
5462 struct inpcb *inp = sotoinpcb(so);
5463
39037602
A
5464 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5465 "[%s %s:%d -> %s:%d] is now marked "
5466 "as %seligible for "
39236c6e 5467 "defunct\n", __func__, proc_selfpid(),
39037602 5468 proc_best_name(current_proc()),
3e170ce0 5469 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39236c6e
A
5470 (SOCK_TYPE(so) == SOCK_STREAM) ?
5471 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5472 ((SOCK_DOM(so) == PF_INET) ?
5473 (void *)&inp->inp_laddr.s_addr :
0a7de745 5474 (void *)&inp->in6p_laddr), s, sizeof(s)),
39236c6e
A
5475 ntohs(inp->in6p_lport),
5476 inet_ntop(SOCK_DOM(so),
5477 (SOCK_DOM(so) == PF_INET) ?
5478 (void *)&inp->inp_faddr.s_addr :
0a7de745 5479 (void *)&inp->in6p_faddr, d, sizeof(d)),
39236c6e
A
5480 ntohs(inp->in6p_fport),
5481 (so->so_flags & SOF_NODEFUNCT) ?
39037602 5482 "not " : "");
39236c6e 5483 } else {
39037602
A
5484 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5485 "is now marked as %seligible for "
5486 "defunct\n",
39236c6e 5487 __func__, proc_selfpid(),
39037602 5488 proc_best_name(current_proc()),
3e170ce0 5489 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39236c6e
A
5490 SOCK_DOM(so), SOCK_TYPE(so),
5491 (so->so_flags & SOF_NODEFUNCT) ?
39037602 5492 "not " : "");
39236c6e 5493 }
6d2010ae
A
5494 break;
5495
5496 case SO_ISDEFUNCT:
5497 /* This option is not settable */
5498 error = EINVAL;
5499 break;
d41d1dae 5500
316670eb 5501 case SO_OPPORTUNISTIC:
0a7de745
A
5502 error = sooptcopyin(sopt, &optval, sizeof(optval),
5503 sizeof(optval));
5504 if (error == 0) {
316670eb 5505 error = so_set_opportunistic(so, optval);
0a7de745 5506 }
316670eb
A
5507 break;
5508
5509 case SO_FLUSH:
5510 /* This option is handled by lower layer(s) */
5511 error = 0;
5512 break;
5513
5514 case SO_RECV_ANYIF:
0a7de745
A
5515 error = sooptcopyin(sopt, &optval, sizeof(optval),
5516 sizeof(optval));
5517 if (error == 0) {
316670eb 5518 error = so_set_recv_anyif(so, optval);
0a7de745 5519 }
316670eb
A
5520 break;
5521
39236c6e
A
5522 case SO_TRAFFIC_MGT_BACKGROUND: {
5523 /* This option is handled by lower layer(s) */
5524 error = 0;
5525 break;
5526 }
5527
5528#if FLOW_DIVERT
5529 case SO_FLOW_DIVERT_TOKEN:
5530 error = flow_divert_token_set(so, sopt);
5531 break;
0a7de745 5532#endif /* FLOW_DIVERT */
39236c6e
A
5533
5534
5535 case SO_DELEGATED:
0a7de745
A
5536 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5537 sizeof(optval))) != 0) {
39236c6e 5538 break;
0a7de745 5539 }
39236c6e 5540
cb323159 5541 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
39236c6e
A
5542 break;
5543
5544 case SO_DELEGATED_UUID: {
5545 uuid_t euuid;
5546
0a7de745
A
5547 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5548 sizeof(euuid))) != 0) {
39236c6e 5549 break;
0a7de745 5550 }
39236c6e 5551
cb323159 5552 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
39236c6e
A
5553 break;
5554 }
3e170ce0 5555
fe8ab488
A
5556#if NECP
5557 case SO_NECP_ATTRIBUTES:
5558 error = necp_set_socket_attributes(so, sopt);
5559 break;
fe8ab488 5560
cb323159 5561 case SO_NECP_CLIENTUUID: {
5ba3f43e
A
5562 if (SOCK_DOM(so) == PF_MULTIPATH) {
5563 /* Handled by MPTCP itself */
fe8ab488
A
5564 break;
5565 }
5566
5ba3f43e
A
5567 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5568 error = EINVAL;
fe8ab488 5569 goto out;
5ba3f43e
A
5570 }
5571
5572 struct inpcb *inp = sotoinpcb(so);
5573 if (!uuid_is_null(inp->necp_client_uuid)) {
5574 // Clear out the old client UUID if present
5575 necp_inpcb_remove_cb(inp);
5576 }
5577
5578 error = sooptcopyin(sopt, &inp->necp_client_uuid,
0a7de745 5579 sizeof(uuid_t), sizeof(uuid_t));
5ba3f43e
A
5580 if (error != 0) {
5581 goto out;
5582 }
5583
5584 if (uuid_is_null(inp->necp_client_uuid)) {
5585 error = EINVAL;
5586 goto out;
5587 }
5588
cb323159
A
5589 pid_t current_pid = proc_pid(current_proc());
5590 error = necp_client_register_socket_flow(current_pid,
5ba3f43e
A
5591 inp->necp_client_uuid, inp);
5592 if (error != 0) {
5593 uuid_clear(inp->necp_client_uuid);
5594 goto out;
5595 }
5596
5597 if (inp->inp_lport != 0) {
cb323159 5598 // There is a bound local port, so this is not
5ba3f43e 5599 // a fresh socket. Assign to the client.
cb323159 5600 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5ba3f43e
A
5601 }
5602
fe8ab488 5603 break;
cb323159
A
5604 }
5605 case SO_NECP_LISTENUUID: {
5606 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5607 error = EINVAL;
5608 goto out;
5609 }
5610
5611 struct inpcb *inp = sotoinpcb(so);
5612 if (!uuid_is_null(inp->necp_client_uuid)) {
5613 error = EINVAL;
5614 goto out;
5615 }
5616
5617 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5618 sizeof(uuid_t), sizeof(uuid_t));
5619 if (error != 0) {
5620 goto out;
5621 }
5622
5623 if (uuid_is_null(inp->necp_client_uuid)) {
5624 error = EINVAL;
5625 goto out;
5626 }
5627
5628 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5629 inp->necp_client_uuid, inp);
5630 if (error != 0) {
5631 uuid_clear(inp->necp_client_uuid);
5632 goto out;
5633 }
5634
5635 // Mark that the port registration is held by NECP
5636 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5637
5638 break;
5639 }
5ba3f43e 5640#endif /* NECP */
39236c6e 5641
3e170ce0 5642 case SO_EXTENDED_BK_IDLE:
0a7de745
A
5643 error = sooptcopyin(sopt, &optval, sizeof(optval),
5644 sizeof(optval));
5645 if (error == 0) {
3e170ce0 5646 error = so_set_extended_bk_idle(so, optval);
0a7de745 5647 }
3e170ce0
A
5648 break;
5649
490019cf
A
5650 case SO_MARK_CELLFALLBACK:
5651 error = sooptcopyin(sopt, &optval, sizeof(optval),
5652 sizeof(optval));
0a7de745 5653 if (error != 0) {
490019cf 5654 goto out;
0a7de745 5655 }
490019cf
A
5656 if (optval < 0) {
5657 error = EINVAL;
5658 goto out;
5659 }
0a7de745 5660 if (optval == 0) {
490019cf 5661 so->so_flags1 &= ~SOF1_CELLFALLBACK;
0a7de745 5662 } else {
490019cf 5663 so->so_flags1 |= SOF1_CELLFALLBACK;
0a7de745 5664 }
490019cf 5665 break;
39037602 5666
cb323159
A
5667 case SO_STATISTICS_EVENT:
5668 error = sooptcopyin(sopt, &long_optval,
5669 sizeof(long_optval), sizeof(long_optval));
5670 if (error != 0) {
5671 goto out;
5672 }
5673 u_int64_t nstat_event = 0;
5674 error = so_statistics_event_to_nstat_event(
5675 &long_optval, &nstat_event);
5676 if (error != 0) {
5677 goto out;
5678 }
5679 nstat_pcb_event(sotoinpcb(so), nstat_event);
5680 break;
5681
39037602
A
5682 case SO_NET_SERVICE_TYPE: {
5683 error = sooptcopyin(sopt, &optval, sizeof(optval),
5684 sizeof(optval));
0a7de745 5685 if (error != 0) {
39037602 5686 goto out;
0a7de745 5687 }
39037602
A
5688 error = so_set_net_service_type(so, optval);
5689 break;
5690 }
5691
5692 case SO_QOSMARKING_POLICY_OVERRIDE:
5693 error = priv_check_cred(kauth_cred_get(),
5694 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
0a7de745 5695 if (error != 0) {
39037602 5696 goto out;
0a7de745 5697 }
39037602
A
5698 error = sooptcopyin(sopt, &optval, sizeof(optval),
5699 sizeof(optval));
0a7de745 5700 if (error != 0) {
39037602 5701 goto out;
0a7de745
A
5702 }
5703 if (optval == 0) {
39037602 5704 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
0a7de745 5705 } else {
39037602 5706 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
0a7de745 5707 }
39037602
A
5708 break;
5709
cb323159
A
5710 case SO_MPKL_SEND_INFO: {
5711 struct so_mpkl_send_info so_mpkl_send_info;
5712
5713 error = sooptcopyin(sopt, &so_mpkl_send_info,
5714 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5715 if (error != 0) {
5716 goto out;
5717 }
5718 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5719 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5720
5721 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5722 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5723 } else {
5724 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5725 }
5726 break;
5727 }
1c79356b
A
5728 default:
5729 error = ENOPROTOOPT;
5730 break;
5731 }
39236c6e
A
5732 if (error == 0 && so->so_proto != NULL &&
5733 so->so_proto->pr_ctloutput != NULL) {
5734 (void) so->so_proto->pr_ctloutput(so, sopt);
1c79356b
A
5735 }
5736 }
39236c6e 5737out:
0a7de745 5738 if (dolock) {
39236c6e 5739 socket_unlock(so, 1);
0a7de745
A
5740 }
5741 return error;
1c79356b
A
5742}

/* Helper routines for getsockopt */
int
sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
{
    int error;
    size_t valsize;

    error = 0;

    /*
     * Documented get behavior is that we always return a value,
     * possibly truncated to fit in the user's buffer.
     * Traditional behavior is that we always tell the user
     * precisely how much we copied, rather than something useful
     * like the total amount we had available for her.
     * Note that this interface is not idempotent; the entire answer
     * must be generated ahead of time.
     */
    valsize = min(len, sopt->sopt_valsize);
    sopt->sopt_valsize = valsize;
    if (sopt->sopt_val != USER_ADDR_NULL) {
        if (sopt->sopt_p != kernproc) {
            error = copyout(buf, sopt->sopt_val, valsize);
        } else {
            bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
        }
    }
    return error;
}

static int
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
{
    int error;
    size_t len;
    struct user64_timeval tv64 = {};
    struct user32_timeval tv32 = {};
    const void * val;
    size_t valsize;

    error = 0;
    if (proc_is64bit(sopt->sopt_p)) {
        len = sizeof(tv64);
        tv64.tv_sec = tv_p->tv_sec;
        tv64.tv_usec = tv_p->tv_usec;
        val = &tv64;
    } else {
        len = sizeof(tv32);
        tv32.tv_sec = tv_p->tv_sec;
        tv32.tv_usec = tv_p->tv_usec;
        val = &tv32;
    }
    valsize = min(len, sopt->sopt_valsize);
    sopt->sopt_valsize = valsize;
    if (sopt->sopt_val != USER_ADDR_NULL) {
        if (sopt->sopt_p != kernproc) {
            error = copyout(val, sopt->sopt_val, valsize);
        } else {
            bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
        }
    }
    return error;
}

/*
 * Returns:     0                       Success
 *              ENOPROTOOPT
 *      <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *      <pr_ctloutput>:???
 *      <sf_getoption>:???
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
    int error, optval;
    struct linger l;
    struct timeval tv;
#if CONFIG_MACF_SOCKET
    struct mac extmac;
#endif /* MAC_SOCKET */

    if (sopt->sopt_dir != SOPT_GET) {
        sopt->sopt_dir = SOPT_GET;
    }

    if (dolock) {
        socket_lock(so, 1);
    }

    error = sflt_getsockopt(so, sopt);
    if (error != 0) {
        if (error == EJUSTRETURN) {
            error = 0;
        }
        goto out;
    }

    if (sopt->sopt_level != SOL_SOCKET) {
        if (so->so_proto != NULL &&
            so->so_proto->pr_ctloutput != NULL) {
            error = (*so->so_proto->pr_ctloutput)(so, sopt);
            goto out;
        }
        error = ENOPROTOOPT;
    } else {
        /*
         * Allow socket-level (SOL_SOCKET) options to be filtered by
         * the protocol layer, if needed.  A zero value returned from
         * the handler means use default socket-level processing as
         * done by the rest of this routine.  Otherwise, any other
         * return value indicates that the option is unsupported.
         */
        if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
            pru_socheckopt(so, sopt)) != 0) {
            goto out;
        }

        error = 0;
        switch (sopt->sopt_name) {
        case SO_LINGER:
        case SO_LINGER_SEC:
            l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
            l.l_linger = (sopt->sopt_name == SO_LINGER) ?
                so->so_linger : so->so_linger / hz;
            error = sooptcopyout(sopt, &l, sizeof(l));
            break;

        case SO_USELOOPBACK:
        case SO_DONTROUTE:
        case SO_DEBUG:
        case SO_KEEPALIVE:
        case SO_REUSEADDR:
        case SO_REUSEPORT:
        case SO_BROADCAST:
        case SO_OOBINLINE:
        case SO_TIMESTAMP:
        case SO_TIMESTAMP_MONOTONIC:
        case SO_TIMESTAMP_CONTINUOUS:
        case SO_DONTTRUNC:
        case SO_WANTMORE:
        case SO_WANTOOBFLAG:
        case SO_NOWAKEFROMSLEEP:
        case SO_NOAPNFALLBK:
            optval = so->so_options & sopt->sopt_name;
integer:
            error = sooptcopyout(sopt, &optval, sizeof(optval));
            break;

        case SO_TYPE:
            optval = so->so_type;
            goto integer;

        case SO_NREAD:
            if (so->so_proto->pr_flags & PR_ATOMIC) {
                int pkt_total;
                struct mbuf *m1;

                pkt_total = 0;
                m1 = so->so_rcv.sb_mb;
                while (m1 != NULL) {
                    if (m1->m_type == MT_DATA ||
                        m1->m_type == MT_HEADER ||
                        m1->m_type == MT_OOBDATA) {
                        pkt_total += m1->m_len;
                    }
                    m1 = m1->m_next;
                }
                optval = pkt_total;
            } else {
                optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
            }
            goto integer;

        case SO_NUMRCVPKT:
            if (so->so_proto->pr_flags & PR_ATOMIC) {
                int cnt = 0;
                struct mbuf *m1;

                m1 = so->so_rcv.sb_mb;
                while (m1 != NULL) {
                    cnt += 1;
                    m1 = m1->m_nextpkt;
                }
                optval = cnt;
                goto integer;
            } else {
                error = ENOPROTOOPT;
                break;
            }

        case SO_NWRITE:
            optval = so->so_snd.sb_cc;
            goto integer;

        case SO_ERROR:
            optval = so->so_error;
            so->so_error = 0;
            goto integer;

        case SO_SNDBUF: {
            u_int32_t hiwat = so->so_snd.sb_hiwat;

            if (so->so_snd.sb_flags & SB_UNIX) {
                struct unpcb *unp =
                    (struct unpcb *)(so->so_pcb);
                if (unp != NULL && unp->unp_conn != NULL) {
                    hiwat += unp->unp_conn->unp_cc;
                }
            }

            optval = hiwat;
            goto integer;
        }
        case SO_RCVBUF:
            optval = so->so_rcv.sb_hiwat;
            goto integer;

        case SO_SNDLOWAT:
            optval = so->so_snd.sb_lowat;
            goto integer;

        case SO_RCVLOWAT:
            optval = so->so_rcv.sb_lowat;
            goto integer;

        case SO_SNDTIMEO:
        case SO_RCVTIMEO:
            tv = (sopt->sopt_name == SO_SNDTIMEO ?
                so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

            error = sooptcopyout_timeval(sopt, &tv);
            break;

        case SO_NOSIGPIPE:
            optval = (so->so_flags & SOF_NOSIGPIPE);
            goto integer;

        case SO_NOADDRERR:
            optval = (so->so_flags & SOF_NOADDRAVAIL);
            goto integer;

        case SO_REUSESHAREUID:
            optval = (so->so_flags & SOF_REUSESHAREUID);
            goto integer;

        case SO_NOTIFYCONFLICT:
            optval = (so->so_flags & SOF_NOTIFYCONFLICT);
            goto integer;

        case SO_RESTRICTIONS:
            optval = so_get_restrictions(so);
            goto integer;

        case SO_AWDL_UNRESTRICTED:
            if (SOCK_DOM(so) == PF_INET ||
                SOCK_DOM(so) == PF_INET6) {
                optval = inp_get_awdl_unrestricted(
                    sotoinpcb(so));
                goto integer;
            } else {
                error = EOPNOTSUPP;
            }
            break;

        case SO_INTCOPROC_ALLOW:
            if (SOCK_DOM(so) == PF_INET6) {
                optval = inp_get_intcoproc_allowed(
                    sotoinpcb(so));
                goto integer;
            } else {
                error = EOPNOTSUPP;
            }
            break;

        case SO_LABEL:
#if CONFIG_MACF_SOCKET
            if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
                sizeof(extmac))) != 0 ||
                (error = mac_socket_label_get(proc_ucred(
                sopt->sopt_p), so, &extmac)) != 0) {
                break;
            }

            error = sooptcopyout(sopt, &extmac, sizeof(extmac));
#else
            error = EOPNOTSUPP;
#endif /* MAC_SOCKET */
            break;

        case SO_PEERLABEL:
#if CONFIG_MACF_SOCKET
            if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
                sizeof(extmac))) != 0 ||
                (error = mac_socketpeer_label_get(proc_ucred(
                sopt->sopt_p), so, &extmac)) != 0) {
                break;
            }

            error = sooptcopyout(sopt, &extmac, sizeof(extmac));
#else
            error = EOPNOTSUPP;
#endif /* MAC_SOCKET */
            break;

#ifdef __APPLE_API_PRIVATE
        case SO_UPCALLCLOSEWAIT:
            optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
            goto integer;
#endif
        case SO_RANDOMPORT:
            optval = (so->so_flags & SOF_BINDRANDOMPORT);
            goto integer;

        case SO_NP_EXTENSIONS: {
            struct so_np_extensions sonpx = {};

            sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
                SONPX_SETOPTSHUT : 0;
            sonpx.npx_mask = SONPX_MASK_VALID;

            error = sooptcopyout(sopt, &sonpx,
                sizeof(struct so_np_extensions));
            break;
        }

        case SO_TRAFFIC_CLASS:
            optval = so->so_traffic_class;
            goto integer;

        case SO_RECV_TRAFFIC_CLASS:
            optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
            goto integer;

        case SO_TRAFFIC_CLASS_STATS:
            error = sooptcopyout(sopt, &so->so_tc_stats,
                sizeof(so->so_tc_stats));
            break;

#if (DEVELOPMENT || DEBUG)
        case SO_TRAFFIC_CLASS_DBG:
            error = sogetopt_tcdbg(so, sopt);
            break;
#endif /* (DEVELOPMENT || DEBUG) */

        case SO_PRIVILEGED_TRAFFIC_CLASS:
            optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
            goto integer;

        case SO_DEFUNCTOK:
            optval = !(so->so_flags & SOF_NODEFUNCT);
            goto integer;

        case SO_ISDEFUNCT:
            optval = (so->so_flags & SOF_DEFUNCT);
            goto integer;

        case SO_OPPORTUNISTIC:
            optval = so_get_opportunistic(so);
            goto integer;

        case SO_FLUSH:
            /* This option is not gettable */
            error = EINVAL;
            break;

        case SO_RECV_ANYIF:
            optval = so_get_recv_anyif(so);
            goto integer;

        case SO_TRAFFIC_MGT_BACKGROUND:
            /* This option is handled by lower layer(s) */
            if (so->so_proto != NULL &&
                so->so_proto->pr_ctloutput != NULL) {
                (void) so->so_proto->pr_ctloutput(so, sopt);
            }
            break;

#if FLOW_DIVERT
        case SO_FLOW_DIVERT_TOKEN:
            error = flow_divert_token_get(so, sopt);
            break;
#endif /* FLOW_DIVERT */

#if NECP
        case SO_NECP_ATTRIBUTES:
            error = necp_get_socket_attributes(so, sopt);
            break;

        case SO_NECP_CLIENTUUID: {
            uuid_t *ncu;

            if (SOCK_DOM(so) == PF_MULTIPATH) {
                ncu = &mpsotomppcb(so)->necp_client_uuid;
            } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
                ncu = &sotoinpcb(so)->necp_client_uuid;
            } else {
                error = EINVAL;
                goto out;
            }

            error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
            break;
        }

        case SO_NECP_LISTENUUID: {
            uuid_t *nlu;

            if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
                if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
                    nlu = &sotoinpcb(so)->necp_client_uuid;
                } else {
                    error = ENOENT;
                    goto out;
                }
            } else {
                error = EINVAL;
                goto out;
            }

            error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
            break;
        }
#endif /* NECP */

#if CONTENT_FILTER
        case SO_CFIL_SOCK_ID: {
            cfil_sock_id_t sock_id;

            sock_id = cfil_sock_id_from_socket(so);

            error = sooptcopyout(sopt, &sock_id,
                sizeof(cfil_sock_id_t));
            break;
        }
#endif /* CONTENT_FILTER */

        case SO_EXTENDED_BK_IDLE:
            optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
            goto integer;
        case SO_MARK_CELLFALLBACK:
            optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
                ? 1 : 0;
            goto integer;
        case SO_NET_SERVICE_TYPE: {
            if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
                optval = so->so_netsvctype;
            } else {
                optval = NET_SERVICE_TYPE_BE;
            }
            goto integer;
        }
        case SO_NETSVC_MARKING_LEVEL:
            optval = so_get_netsvc_marking_level(so);
            goto integer;

        case SO_MPKL_SEND_INFO: {
            struct so_mpkl_send_info so_mpkl_send_info;

            uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
            so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
            error = sooptcopyout(sopt, &so_mpkl_send_info,
                sizeof(struct so_mpkl_send_info));
            break;
        }
        default:
            error = ENOPROTOOPT;
            break;
        }
    }
out:
    if (dolock) {
        socket_unlock(so, 1);
    }
    return error;
}
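
/*
 * Illustrative sketch only: the SO_NREAD case above counts pending
 * protocol data (excluding control bytes), so userland can size a
 * read buffer before calling recv(2):
 */
#if 0
#include <sys/socket.h>

static int
example_pending_bytes(int fd)
{
    int nread = 0;
    socklen_t len = sizeof(nread);

    if (getsockopt(fd, SOL_SOCKET, SO_NREAD, &nread, &len) == -1) {
        return -1;
    }
    return nread;
}
#endif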

/*
 * The size limits on our soopt_getm are different from those on
 * FreeBSD.  We limit the size of options to MCLBYTES.  This will have
 * to change if we need to define options that need more space than
 * MCLBYTES.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
    struct mbuf *m, *m_prev;
    int sopt_size = sopt->sopt_valsize;
    int how;

    if (sopt_size <= 0 || sopt_size > MCLBYTES) {
        return EMSGSIZE;
    }

    how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
    MGET(m, how, MT_DATA);
    if (m == NULL) {
        return ENOBUFS;
    }
    if (sopt_size > MLEN) {
        MCLGET(m, how);
        if ((m->m_flags & M_EXT) == 0) {
            m_free(m);
            return ENOBUFS;
        }
        m->m_len = min(MCLBYTES, sopt_size);
    } else {
        m->m_len = min(MLEN, sopt_size);
    }
    sopt_size -= m->m_len;
    *mp = m;
    m_prev = m;

    while (sopt_size > 0) {
        MGET(m, how, MT_DATA);
        if (m == NULL) {
            m_freem(*mp);
            return ENOBUFS;
        }
        if (sopt_size > MLEN) {
            MCLGET(m, how);
            if ((m->m_flags & M_EXT) == 0) {
                m_freem(*mp);
                m_freem(m);
                return ENOBUFS;
            }
            m->m_len = min(MCLBYTES, sopt_size);
        } else {
            m->m_len = min(MLEN, sopt_size);
        }
        sopt_size -= m->m_len;
        m_prev->m_next = m;
        m_prev = m;
    }
    return 0;
}

/* copyin sopt data into mbuf chain */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
    struct mbuf *m0 = m;

    if (sopt->sopt_val == USER_ADDR_NULL) {
        return 0;
    }
    while (m != NULL && sopt->sopt_valsize >= m->m_len) {
        if (sopt->sopt_p != kernproc) {
            int error;

            error = copyin(sopt->sopt_val, mtod(m, char *),
                m->m_len);
            if (error != 0) {
                m_freem(m0);
                return error;
            }
        } else {
            bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
                mtod(m, char *), m->m_len);
        }
        sopt->sopt_valsize -= m->m_len;
        sopt->sopt_val += m->m_len;
        m = m->m_next;
    }
    /* a large enough chain should have been allocated by ip6_sooptmcopyin() */
    if (m != NULL) {
        panic("soopt_mcopyin");
        /* NOTREACHED */
    }
    return 0;
}

/* copyout mbuf chain data into soopt */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
    struct mbuf *m0 = m;
    size_t valsize = 0;

    if (sopt->sopt_val == USER_ADDR_NULL) {
        return 0;
    }
    while (m != NULL && sopt->sopt_valsize >= m->m_len) {
        if (sopt->sopt_p != kernproc) {
            int error;

            error = copyout(mtod(m, char *), sopt->sopt_val,
                m->m_len);
            if (error != 0) {
                m_freem(m0);
                return error;
            }
        } else {
            bcopy(mtod(m, char *),
                CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
        }
        sopt->sopt_valsize -= m->m_len;
        sopt->sopt_val += m->m_len;
        valsize += m->m_len;
        m = m->m_next;
    }
    if (m != NULL) {
        /* user-land should have supplied a large enough buffer */
        m_freem(m0);
        return EINVAL;
    }
    sopt->sopt_valsize = valsize;
    return 0;
}

void
sohasoutofband(struct socket *so)
{
    if (so->so_pgid < 0) {
        gsignal(-so->so_pgid, SIGURG);
    } else if (so->so_pgid > 0) {
        proc_signal(so->so_pgid, SIGURG);
    }
    selwakeup(&so->so_rcv.sb_sel);
    if (so->so_rcv.sb_flags & SB_KNOTE) {
        KNOTE(&so->so_rcv.sb_sel.si_note,
            (NOTE_OOB | SO_FILT_HINT_LOCKED));
    }
}

int
sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
{
#pragma unused(cred)
    struct proc *p = current_proc();
    int revents = 0;

    socket_lock(so, 1);
    so_update_last_owner_locked(so, PROC_NULL);
    so_update_policy(so);

    if (events & (POLLIN | POLLRDNORM)) {
        if (soreadable(so)) {
            revents |= events & (POLLIN | POLLRDNORM);
        }
    }

    if (events & (POLLOUT | POLLWRNORM)) {
        if (sowriteable(so)) {
            revents |= events & (POLLOUT | POLLWRNORM);
        }
    }

    if (events & (POLLPRI | POLLRDBAND)) {
        if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
            revents |= events & (POLLPRI | POLLRDBAND);
        }
    }

    if (revents == 0) {
        if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
            /*
             * Darwin sets the flag first,
             * BSD calls selrecord first
             */
            so->so_rcv.sb_flags |= SB_SEL;
            selrecord(p, &so->so_rcv.sb_sel, wql);
        }

        if (events & (POLLOUT | POLLWRNORM)) {
            /*
             * Darwin sets the flag first,
             * BSD calls selrecord first
             */
            so->so_snd.sb_flags |= SB_SEL;
            selrecord(p, &so->so_snd.sb_sel, wql);
        }
    }

    socket_unlock(so, 1);
    return revents;
}
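
/*
 * Illustrative sketch only: sopoll() backs poll(2) on sockets; the
 * POLLPRI branch above corresponds to TCP urgent data (so_oobmark /
 * SS_RCVATMARK).  From userland:
 */
#if 0
#include <poll.h>

static int
example_wait_readable(int fd, int timeout_ms)
{
    struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };

    return poll(&pfd, 1, timeout_ms);   /* > 0 when readable or OOB pending */
}
#endif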

int
soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
{
    struct socket *so = (struct socket *)fp->f_fglob->fg_data;
    int result;

    socket_lock(so, 1);
    so_update_last_owner_locked(so, PROC_NULL);
    so_update_policy(so);

#if CONFIG_MACF_SOCKET
    proc_t p = knote_get_kq(kn)->kq_p;
    if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
        socket_unlock(so, 1);
        knote_set_error(kn, EPERM);
        return 0;
    }
#endif /* MAC_SOCKET */

    switch (kn->kn_filter) {
    case EVFILT_READ:
        kn->kn_filtid = EVFILTID_SOREAD;
        break;
    case EVFILT_WRITE:
        kn->kn_filtid = EVFILTID_SOWRITE;
        break;
    case EVFILT_SOCK:
        kn->kn_filtid = EVFILTID_SCK;
        break;
    case EVFILT_EXCEPT:
        kn->kn_filtid = EVFILTID_SOEXCEPT;
        break;
    default:
        socket_unlock(so, 1);
        knote_set_error(kn, EINVAL);
        return 0;
    }

    /*
     * call the appropriate sub-filter attach
     * with the socket still locked
     */
    result = knote_fops(kn)->f_attach(kn, kev);

    socket_unlock(so, 1);

    return result;
}

static int
filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
    int retval = 0;
    int64_t data = 0;

    if (so->so_options & SO_ACCEPTCONN) {
        /*
         * Radar 6615193 handle the listen case dynamically
         * for kqueue read filter. This allows listen() to be
         * called after registering the kqueue EVFILT_READ.
         */

        retval = !TAILQ_EMPTY(&so->so_comp);
        data = so->so_qlen;
        goto out;
    }

    /* socket isn't a listener */
    /*
     * NOTE_LOWAT specifies new low water mark in data, i.e.
     * the bytes of protocol data. We therefore exclude any
     * control bytes.
     */
    data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

    if (kn->kn_sfflags & NOTE_OOB) {
        if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
            kn->kn_fflags |= NOTE_OOB;
            data -= so->so_oobmark;
            retval = 1;
            goto out;
        }
    }

    if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
        && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
        ) {
        kn->kn_flags |= EV_EOF;
        kn->kn_fflags = so->so_error;
        retval = 1;
        goto out;
    }

    if (so->so_error) {     /* temporary udp error */
        retval = 1;
        goto out;
    }

    int64_t lowwat = so->so_rcv.sb_lowat;
    /*
     * Ensure that when NOTE_LOWAT is used, the derived
     * low water mark is bounded by socket's rcv buf's
     * high and low water mark values.
     */
    if (kn->kn_sfflags & NOTE_LOWAT) {
        if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
            lowwat = so->so_rcv.sb_hiwat;
        } else if (kn->kn_sdata > lowwat) {
            lowwat = kn->kn_sdata;
        }
    }

    retval = (data >= lowwat);

out:
    if (retval && kev) {
        knote_fill_kevent(kn, kev, data);
    }
    return retval;
}

static int
filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

    /* socket locked */

    /*
     * If the caller explicitly asked for OOB results (e.g. poll())
     * from EVFILT_READ, then save that off in the hookid field
     * and reserve the kn_flags EV_OOBAND bit for output only.
     */
    if (kn->kn_filter == EVFILT_READ &&
        kn->kn_flags & EV_OOBAND) {
        kn->kn_flags &= ~EV_OOBAND;
        kn->kn_hook32 = EV_OOBAND;
    } else {
        kn->kn_hook32 = 0;
    }
    if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
        so->so_rcv.sb_flags |= SB_KNOTE;
    }

    /* indicate if event is already fired */
    return filt_soread_common(kn, NULL, so);
}

static void
filt_sordetach(struct knote *kn)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

    socket_lock(so, 1);
    if (so->so_rcv.sb_flags & SB_KNOTE) {
        if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
            so->so_rcv.sb_flags &= ~SB_KNOTE;
        }
    }
    socket_unlock(so, 1);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    int retval;

    if ((hint & SO_FILT_HINT_LOCKED) == 0) {
        socket_lock(so, 1);
    }

    retval = filt_soread_common(kn, NULL, so);

    if ((hint & SO_FILT_HINT_LOCKED) == 0) {
        socket_unlock(so, 1);
    }

    return retval;
}

static int
filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    int retval;

    socket_lock(so, 1);

    /* save off the new input fflags and data */
    kn->kn_sfflags = kev->fflags;
    kn->kn_sdata = kev->data;

    /* determine if changes result in fired events */
    retval = filt_soread_common(kn, NULL, so);

    socket_unlock(so, 1);

    return retval;
}

static int
filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    int retval;

    socket_lock(so, 1);
    retval = filt_soread_common(kn, kev, so);
    socket_unlock(so, 1);

    return retval;
}

int
so_wait_for_if_feedback(struct socket *so)
{
    if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
        (so->so_state & SS_ISCONNECTED)) {
        struct inpcb *inp = sotoinpcb(so);
        if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
            return 1;
        }
    }
    return 0;
}

static int
filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
{
    int ret = 0;
    int64_t data = sbspace(&so->so_snd);

    if (so->so_state & SS_CANTSENDMORE) {
        kn->kn_flags |= EV_EOF;
        kn->kn_fflags = so->so_error;
        ret = 1;
        goto out;
    }

    if (so->so_error) {     /* temporary udp error */
        ret = 1;
        goto out;
    }

    if (!socanwrite(so)) {
        ret = 0;
        goto out;
    }

    if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
        ret = 1;
        goto out;
    }

    int64_t lowwat = so->so_snd.sb_lowat;

    if (kn->kn_sfflags & NOTE_LOWAT) {
        if (kn->kn_sdata > so->so_snd.sb_hiwat) {
            lowwat = so->so_snd.sb_hiwat;
        } else if (kn->kn_sdata > lowwat) {
            lowwat = kn->kn_sdata;
        }
    }

    if (data >= lowwat) {
        if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
            && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
            ) {
            if ((SOCK_DOM(so) == PF_INET ||
                SOCK_DOM(so) == PF_INET6) &&
                so->so_type == SOCK_STREAM) {
                ret = tcp_notsent_lowat_check(so);
            }
#if MPTCP
            else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
                (SOCK_PROTO(so) == IPPROTO_TCP)) {
                ret = mptcp_notsent_lowat_check(so);
            }
#endif
            else {
                ret = 1;
                goto out;
            }
        } else {
            ret = 1;
        }
    }
    if (so_wait_for_if_feedback(so)) {
        ret = 0;
    }

out:
    if (ret && kev) {
        knote_fill_kevent(kn, kev, data);
    }
    return ret;
}

static int
filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

    /* socket locked */
    if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
        so->so_snd.sb_flags |= SB_KNOTE;
    }

    /* determine if it's already fired */
    return filt_sowrite_common(kn, NULL, so);
}

static void
filt_sowdetach(struct knote *kn)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    socket_lock(so, 1);

    if (so->so_snd.sb_flags & SB_KNOTE) {
        if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
            so->so_snd.sb_flags &= ~SB_KNOTE;
        }
    }
    socket_unlock(so, 1);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    int ret;

    if ((hint & SO_FILT_HINT_LOCKED) == 0) {
        socket_lock(so, 1);
    }

    ret = filt_sowrite_common(kn, NULL, so);

    if ((hint & SO_FILT_HINT_LOCKED) == 0) {
        socket_unlock(so, 1);
    }

    return ret;
}

static int
filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    int ret;

    socket_lock(so, 1);

    /* save off the new input fflags and data */
    kn->kn_sfflags = kev->fflags;
    kn->kn_sdata = kev->data;

    /* determine if these changes result in a triggered event */
    ret = filt_sowrite_common(kn, NULL, so);

    socket_unlock(so, 1);

    return ret;
}

static int
filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    int ret;

    socket_lock(so, 1);
    ret = filt_sowrite_common(kn, kev, so);
    socket_unlock(so, 1);

    return ret;
}

static int
filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
    struct socket *so, long ev_hint)
{
    int ret = 0;
    int64_t data = 0;
    uint32_t level_trigger = 0;

    if (ev_hint & SO_FILT_HINT_CONNRESET) {
        kn->kn_fflags |= NOTE_CONNRESET;
    }
    if (ev_hint & SO_FILT_HINT_TIMEOUT) {
        kn->kn_fflags |= NOTE_TIMEOUT;
    }
    if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
        kn->kn_fflags |= NOTE_NOSRCADDR;
    }
    if (ev_hint & SO_FILT_HINT_IFDENIED) {
        kn->kn_fflags |= NOTE_IFDENIED;
    }
    if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
        kn->kn_fflags |= NOTE_KEEPALIVE;
    }
    if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
        kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
    }
    if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
        kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
    }
    if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
        (so->so_state & SS_ISCONNECTED)) {
        kn->kn_fflags |= NOTE_CONNECTED;
        level_trigger |= NOTE_CONNECTED;
    }
    if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
        (so->so_state & SS_ISDISCONNECTED)) {
        kn->kn_fflags |= NOTE_DISCONNECTED;
        level_trigger |= NOTE_DISCONNECTED;
    }
    if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
        if (so->so_proto != NULL &&
            (so->so_proto->pr_flags & PR_EVCONNINFO)) {
            kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
        }
    }

    if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
        tcp_notify_ack_active(so)) {
        kn->kn_fflags |= NOTE_NOTIFY_ACK;
    }

    if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
        && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
        ) {
        kn->kn_fflags |= NOTE_READCLOSED;
        level_trigger |= NOTE_READCLOSED;
    }

    if (so->so_state & SS_CANTSENDMORE) {
        kn->kn_fflags |= NOTE_WRITECLOSED;
        level_trigger |= NOTE_WRITECLOSED;
    }

    if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
        (so->so_flags & SOF_SUSPENDED)) {
        kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

        /* If resume event was delivered before, reset it */
        kn->kn_hook32 &= ~NOTE_RESUME;

        kn->kn_fflags |= NOTE_SUSPEND;
        level_trigger |= NOTE_SUSPEND;
    }

    if ((ev_hint & SO_FILT_HINT_RESUME) ||
        (so->so_flags & SOF_SUSPENDED) == 0) {
        kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

        /* If suspend event was delivered before, reset it */
        kn->kn_hook32 &= ~NOTE_SUSPEND;

        kn->kn_fflags |= NOTE_RESUME;
        level_trigger |= NOTE_RESUME;
    }

    if (so->so_error != 0) {
        ret = 1;
        data = so->so_error;
        kn->kn_flags |= EV_EOF;
    } else {
        u_int32_t data32 = 0;   /* must start zeroed; see get_sockev_state() */
        get_sockev_state(so, &data32);
        data = data32;
    }

    /* Reset any events that are not requested on this knote */
    kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
    level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

    /* Find the level-triggered events that are already delivered */
    level_trigger &= kn->kn_hook32;
    level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

    /* Do not deliver level-triggered events more than once */
    if ((kn->kn_fflags & ~level_trigger) != 0) {
        ret = 1;
    }

    if (ret && kev) {
        /*
         * Store the state of the events being delivered. This
         * state can be used to deliver level-triggered events
         * at least once and still avoid waking up the application
         * multiple times as long as the event is active.
         */
        if (kn->kn_fflags != 0) {
            kn->kn_hook32 |= (kn->kn_fflags &
                EVFILT_SOCK_LEVEL_TRIGGER_MASK);
        }

        /*
         * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
         * only one of them, and remember which one was delivered
         * last.
         */
        if (kn->kn_fflags & NOTE_SUSPEND) {
            kn->kn_hook32 &= ~NOTE_RESUME;
        }
        if (kn->kn_fflags & NOTE_RESUME) {
            kn->kn_hook32 &= ~NOTE_SUSPEND;
        }

        knote_fill_kevent(kn, kev, data);
    }
    return ret;
}

static int
filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

    /* socket locked */
    kn->kn_hook32 = 0;
    if (KNOTE_ATTACH(&so->so_klist, kn)) {
        so->so_flags |= SOF_KNOTE;
    }

    /* determine if event already fired */
    return filt_sockev_common(kn, NULL, so, 0);
}

static void
filt_sockdetach(struct knote *kn)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

    socket_lock(so, 1);

    if ((so->so_flags & SOF_KNOTE) != 0) {
        if (KNOTE_DETACH(&so->so_klist, kn)) {
            so->so_flags &= ~SOF_KNOTE;
        }
    }
    socket_unlock(so, 1);
}

static int
filt_sockev(struct knote *kn, long hint)
{
    int ret = 0, locked = 0;
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    long ev_hint = (hint & SO_FILT_HINT_EV);

    if ((hint & SO_FILT_HINT_LOCKED) == 0) {
        socket_lock(so, 1);
        locked = 1;
    }

    ret = filt_sockev_common(kn, NULL, so, ev_hint);

    if (locked) {
        socket_unlock(so, 1);
    }

    return ret;
}

/*
 * filt_socktouch - update event state
 */
static int
filt_socktouch(
    struct knote *kn,
    struct kevent_qos_s *kev)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    uint32_t changed_flags;
    int ret;

    socket_lock(so, 1);

    /* save off the [result] data and fflags */
    changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);

    /* save off the new input fflags and data */
    kn->kn_sfflags = kev->fflags;
    kn->kn_sdata = kev->data;

    /* restrict the current results to the (smaller?) set of new interest */
    /*
     * For compatibility with previous implementations, we leave kn_fflags
     * as they were before.
     */
    //kn->kn_fflags &= kev->fflags;

    /*
     * Since we keep track of events that are already
     * delivered, if any of those events are not requested
     * anymore the state related to them can be reset
     */
    kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);

    /* determine if we have events to deliver */
    ret = filt_sockev_common(kn, NULL, so, 0);

    socket_unlock(so, 1);

    return ret;
}

/*
 * filt_sockprocess - query event fired state and return data
 */
static int
filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
{
    struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
    int ret = 0;

    socket_lock(so, 1);

    ret = filt_sockev_common(kn, kev, so, 0);

    socket_unlock(so, 1);

    return ret;
}

void
get_sockev_state(struct socket *so, u_int32_t *statep)
{
    u_int32_t state = *(statep);

    /*
     * If the state variable is already in use by a previous
     * event, leave it as-is.
     */
    if (state != 0) {
        return;
    }

    if (so->so_state & SS_ISCONNECTED) {
        state |= SOCKEV_CONNECTED;
    } else {
        state &= ~(SOCKEV_CONNECTED);
    }
    state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
    *(statep) = state;
}

#define SO_LOCK_HISTORY_STR_LEN \
    (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
    size_t n = 0;
    int i;
    static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

    bzero(lock_history_str, sizeof(lock_history_str));
    for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
        n += scnprintf(lock_history_str + n,
            SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
            so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
            so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
    }
    return lock_history_str;
}

lck_mtx_t *
socket_getlock(struct socket *so, int flags)
{
    if (so->so_proto->pr_getlock != NULL) {
        return (*so->so_proto->pr_getlock)(so, flags);
    } else {
        return so->so_proto->pr_domain->dom_mtx;
    }
}

void
socket_lock(struct socket *so, int refcount)
{
    void *lr_saved;

    lr_saved = __builtin_return_address(0);

    if (so->so_proto->pr_lock) {
        (*so->so_proto->pr_lock)(so, refcount, lr_saved);
    } else {
#ifdef MORE_LOCKING_DEBUG
        LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
            LCK_MTX_ASSERT_NOTOWNED);
#endif
        lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
        if (refcount) {
            so->so_usecount++;
        }
        so->lock_lr[so->next_lock_lr] = lr_saved;
        so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
    }
}
91447636 7132
5ba3f43e
A
7133void
7134socket_lock_assert_owned(struct socket *so)
7135{
7136 lck_mtx_t *mutex_held;
7137
0a7de745 7138 if (so->so_proto->pr_getlock != NULL) {
5ba3f43e 7139 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
0a7de745 7140 } else {
5ba3f43e 7141 mutex_held = so->so_proto->pr_domain->dom_mtx;
0a7de745 7142 }
5ba3f43e
A
7143
7144 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
91447636
A
7145}
7146
7147int
5ba3f43e
A
7148socket_try_lock(struct socket *so)
7149{
7150 lck_mtx_t *mtx;
7151
0a7de745 7152 if (so->so_proto->pr_getlock != NULL) {
5ba3f43e 7153 mtx = (*so->so_proto->pr_getlock)(so, 0);
0a7de745 7154 } else {
5ba3f43e 7155 mtx = so->so_proto->pr_domain->dom_mtx;
0a7de745 7156 }
5ba3f43e 7157
0a7de745 7158 return lck_mtx_try_lock(mtx);
5ba3f43e
A
7159}
7160
7161void
2d21ac55 7162socket_unlock(struct socket *so, int refcount)
91447636 7163{
b0d623f7 7164 void *lr_saved;
2d21ac55 7165 lck_mtx_t *mutex_held;
91447636 7166
b0d623f7 7167 lr_saved = __builtin_return_address(0);
91447636 7168
cb323159 7169 if (so == NULL || so->so_proto == NULL) {
39236c6e
A
7170 panic("%s: null so_proto so=%p\n", __func__, so);
7171 /* NOTREACHED */
7172 }
91447636 7173
cb323159 7174 if (so->so_proto->pr_unlock) {
5ba3f43e 7175 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
2d21ac55 7176 } else {
91447636
A
7177 mutex_held = so->so_proto->pr_domain->dom_mtx;
7178#ifdef MORE_LOCKING_DEBUG
5ba3f43e 7179 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
91447636 7180#endif
b0d623f7 7181 so->unlock_lr[so->next_unlock_lr] = lr_saved;
0a7de745 7182 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
0c530ab8 7183
91447636 7184 if (refcount) {
39236c6e
A
7185 if (so->so_usecount <= 0) {
7186 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7187 "lrh=%s", __func__, so->so_usecount, so,
7188 SOCK_DOM(so), so->so_type,
7189 SOCK_PROTO(so), solockhistory_nr(so));
7190 /* NOTREACHED */
7191 }
7192
91447636 7193 so->so_usecount--;
0a7de745 7194 if (so->so_usecount == 0) {
91447636 7195 sofreelastref(so, 1);
0a7de745 7196 }
91447636
A
7197 }
7198 lck_mtx_unlock(mutex_held);
7199 }
91447636 7200}
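
socket_getlock(), socket_lock() and socket_unlock() above all follow one dispatch shape: use the protocol's own lock callback when it exists, otherwise fall back to the shared per-domain mutex and maintain the use-count there. A self-contained sketch of that shape, with pthreads standing in for lck_mtx_t and placeholder types throughout:

#include <pthread.h>
#include <stddef.h>

struct sock;

struct proto {
	void (*pr_lock)(struct sock *, int);    /* optional per-protocol hook */
	void (*pr_unlock)(struct sock *, int);
	pthread_mutex_t *dom_mtx;               /* fallback: domain-wide mutex */
};

struct sock {
	struct proto *proto;
	int usecount;
};

static void
sock_lock(struct sock *so, int refcount)
{
	if (so->proto->pr_lock != NULL) {
		(*so->proto->pr_lock)(so, refcount);
	} else {
		pthread_mutex_lock(so->proto->dom_mtx);
		if (refcount) {
			so->usecount++;         /* protected by dom_mtx */
		}
	}
}

static void
sock_unlock(struct sock *so, int refcount)
{
	if (so->proto->pr_unlock != NULL) {
		(*so->proto->pr_unlock)(so, refcount);
	} else {
		if (refcount) {
			so->usecount--;
		}
		pthread_mutex_unlock(so->proto->dom_mtx);
	}
}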
2d21ac55
A
7201
7202/* Called with socket locked, will unlock socket */
91447636 7203void
2d21ac55 7204sofree(struct socket *so)
91447636 7205{
2d21ac55 7206 lck_mtx_t *mutex_held;
39236c6e 7207
0a7de745 7208 if (so->so_proto->pr_getlock != NULL) {
91447636 7209 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
0a7de745 7210 } else {
91447636 7211 mutex_held = so->so_proto->pr_domain->dom_mtx;
0a7de745 7212 }
5ba3f43e 7213 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
2d21ac55 7214
91447636
A
7215 sofreelastref(so, 0);
7216}
7217
7218void
2d21ac55 7219soreference(struct socket *so)
91447636 7220{
0a7de745
A
7221	socket_lock(so, 1);     /* lock and take one reference on socket */
7222 socket_unlock(so, 0); /* unlock only */
91447636
A
7223}
7224
7225void
2d21ac55 7226sodereference(struct socket *so)
91447636
A
7227{
7228 socket_lock(so, 0);
7229 socket_unlock(so, 1);
7230}
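
Built on the refcount flag of socket_lock()/socket_unlock(), the pair above pins a socket without leaving its lock held. A typical caller (illustrative) looks like:

	soreference(so);        /* take a use-count; the lock is not held on return */
	/* ... use `so` while holding no socket lock; the reference alone
	 *     keeps it from being freed ... */
	sodereference(so);      /* drop the use-count; may free the last reference */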
2d21ac55
A
7231
7232/*
7233 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7234	 * possibility of using jumbo clusters. The caller must hold
7235 * the socket lock.
7236 */
7237void
7238somultipages(struct socket *so, boolean_t set)
7239{
0a7de745 7240 if (set) {
2d21ac55 7241 so->so_flags |= SOF_MULTIPAGES;
0a7de745 7242 } else {
2d21ac55 7243 so->so_flags &= ~SOF_MULTIPAGES;
0a7de745 7244 }
2d21ac55 7245}
b0d623f7 7246
fe8ab488
A
7247void
7248soif2kcl(struct socket *so, boolean_t set)
7249{
0a7de745 7250 if (set) {
fe8ab488 7251 so->so_flags1 |= SOF1_IF_2KCL;
0a7de745 7252 } else {
fe8ab488 7253 so->so_flags1 &= ~SOF1_IF_2KCL;
0a7de745 7254 }
fe8ab488
A
7255}
7256
b0d623f7 7257int
0a7de745
A
7258so_isdstlocal(struct socket *so)
7259{
b0d623f7
A
7260 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7261
0a7de745
A
7262 if (SOCK_DOM(so) == PF_INET) {
7263 return inaddr_local(inp->inp_faddr);
7264 } else if (SOCK_DOM(so) == PF_INET6) {
7265 return in6addr_local(&inp->in6p_faddr);
7266 }
39236c6e 7267
0a7de745 7268 return 0;
b0d623f7 7269}
6d2010ae
A
7270
7271int
7272sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7273{
39236c6e 7274 struct sockbuf *rcv, *snd;
6d2010ae
A
7275 int err = 0, defunct;
7276
39236c6e
A
7277 rcv = &so->so_rcv;
7278 snd = &so->so_snd;
7279
6d2010ae
A
7280 defunct = (so->so_flags & SOF_DEFUNCT);
7281 if (defunct) {
39236c6e 7282 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6d2010ae 7283 panic("%s: SB_DROP not set", __func__);
39236c6e
A
7284 /* NOTREACHED */
7285 }
6d2010ae
A
7286 goto done;
7287 }
7288
7289 if (so->so_flags & SOF_NODEFUNCT) {
7290 if (noforce) {
7291 err = EOPNOTSUPP;
d9a64523
A
7292 if (p != PROC_NULL) {
7293 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7294 "name %s level %d) so 0x%llx [%d,%d] "
7295 "is not eligible for defunct "
7296 "(%d)\n", __func__, proc_selfpid(),
7297 proc_best_name(current_proc()), proc_pid(p),
7298 proc_best_name(p), level,
7299 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7300 SOCK_DOM(so), SOCK_TYPE(so), err);
7301 }
0a7de745 7302 return err;
d9a64523
A
7303 }
7304 so->so_flags &= ~SOF_NODEFUNCT;
7305 if (p != PROC_NULL) {
39037602
A
7306 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7307 "name %s level %d) so 0x%llx [%d,%d] "
d9a64523 7308 "defunct by force "
39037602
A
7309 "(%d)\n", __func__, proc_selfpid(),
7310 proc_best_name(current_proc()), proc_pid(p),
7311 proc_best_name(p), level,
7312 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7313 SOCK_DOM(so), SOCK_TYPE(so), err);
6d2010ae 7314 }
3e170ce0
A
7315 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7316 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7317 struct ifnet *ifp = inp->inp_last_outifp;
7318
7319 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7320 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7321 } else if (so->so_flags & SOF_DELEGATED) {
7322 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7323 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7324 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
d9a64523 7325 } else if (noforce && p != PROC_NULL) {
3e170ce0 7326 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
39037602 7327
3e170ce0
A
7328 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7329 so->so_extended_bk_start = net_uptime();
7330 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
39037602 7331
3e170ce0 7332 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
39037602 7333
3e170ce0 7334 err = EOPNOTSUPP;
d9a64523
A
7335 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7336 "name %s level %d) so 0x%llx [%d,%d] "
7337 "extend bk idle "
7338 "(%d)\n", __func__, proc_selfpid(),
39037602
A
7339 proc_best_name(current_proc()), proc_pid(p),
7340 proc_best_name(p), level,
7341 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
d9a64523 7342 SOCK_DOM(so), SOCK_TYPE(so), err);
0a7de745 7343 return err;
3e170ce0
A
7344 } else {
7345 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7346 }
6d2010ae
A
7347 }
7348
7349 so->so_flags |= SOF_DEFUNCT;
39236c6e 7350
6d2010ae 7351 /* Prevent further data from being appended to the socket buffers */
39236c6e
A
7352 snd->sb_flags |= SB_DROP;
7353 rcv->sb_flags |= SB_DROP;
7354
7355 /* Flush any existing data in the socket buffers */
7356 if (rcv->sb_cc != 0) {
7357 rcv->sb_flags &= ~SB_SEL;
7358 selthreadclear(&rcv->sb_sel);
7359 sbrelease(rcv);
7360 }
7361 if (snd->sb_cc != 0) {
7362 snd->sb_flags &= ~SB_SEL;
7363 selthreadclear(&snd->sb_sel);
7364 sbrelease(snd);
7365 }
6d2010ae
A
7366
7367done:
d9a64523
A
7368 if (p != PROC_NULL) {
7369 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7370 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
7371 proc_selfpid(), proc_best_name(current_proc()),
7372 proc_pid(p), proc_best_name(p), level,
7373 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7374 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7375 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7376 " extbkidle" : "");
7377 }
0a7de745 7378 return err;
6d2010ae
A
7379}
7380
7381int
7382sodefunct(struct proc *p, struct socket *so, int level)
7383{
7384 struct sockbuf *rcv, *snd;
7385
39236c6e 7386 if (!(so->so_flags & SOF_DEFUNCT)) {
6d2010ae 7387 panic("%s improperly called", __func__);
39236c6e
A
7388 /* NOTREACHED */
7389 }
0a7de745 7390 if (so->so_state & SS_DEFUNCT) {
6d2010ae 7391 goto done;
0a7de745 7392 }
6d2010ae
A
7393
7394 rcv = &so->so_rcv;
7395 snd = &so->so_snd;
7396
39236c6e
A
7397 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7398 char s[MAX_IPv6_STR_LEN];
7399 char d[MAX_IPv6_STR_LEN];
7400 struct inpcb *inp = sotoinpcb(so);
7401
d9a64523
A
7402 if (p != PROC_NULL) {
7403 SODEFUNCTLOG(
0a7de745
A
7404 "%s[%d, %s]: (target pid %d name %s level %d) "
7405 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
7406 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7407 " snd_fl 0x%x]\n", __func__,
7408 proc_selfpid(), proc_best_name(current_proc()),
7409 proc_pid(p), proc_best_name(p), level,
7410 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7411 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7412 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7413 (void *)&inp->inp_laddr.s_addr :
7414 (void *)&inp->in6p_laddr),
7415 s, sizeof(s)), ntohs(inp->in6p_lport),
7416 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7417 (void *)&inp->inp_faddr.s_addr :
7418 (void *)&inp->in6p_faddr,
7419 d, sizeof(d)), ntohs(inp->in6p_fport),
7420 (uint32_t)rcv->sb_sel.si_flags,
7421 (uint32_t)snd->sb_sel.si_flags,
7422 rcv->sb_flags, snd->sb_flags);
7423 }
7424 } else if (p != PROC_NULL) {
39037602
A
7425 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7426 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7427 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7428 proc_selfpid(), proc_best_name(current_proc()),
7429 proc_pid(p), proc_best_name(p), level,
7430 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7431 SOCK_DOM(so), SOCK_TYPE(so),
7432 (uint32_t)rcv->sb_sel.si_flags,
39236c6e 7433 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
39037602 7434 snd->sb_flags);
39236c6e 7435 }
6d2010ae
A
7436
7437 /*
7438 * Unwedge threads blocked on sbwait() and sb_lock().
7439 */
7440 sbwakeup(rcv);
7441 sbwakeup(snd);
7442
fe8ab488 7443 so->so_flags1 |= SOF1_DEFUNCTINPROG;
0a7de745
A
7444 if (rcv->sb_flags & SB_LOCK) {
7445 sbunlock(rcv, TRUE); /* keep socket locked */
7446 }
7447 if (snd->sb_flags & SB_LOCK) {
7448 sbunlock(snd, TRUE); /* keep socket locked */
7449 }
6d2010ae
A
7450 /*
7451 * Flush the buffers and disconnect. We explicitly call shutdown
7452 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7453 * states are set for the socket. This would also flush out data
7454 * hanging off the receive list of this socket.
7455 */
fe8ab488
A
7456 (void) soshutdownlock_final(so, SHUT_RD);
7457 (void) soshutdownlock_final(so, SHUT_WR);
6d2010ae
A
7458 (void) sodisconnectlocked(so);
7459
7460 /*
7461 * Explicitly handle connectionless-protocol disconnection
7462 * and release any remaining data in the socket buffers.
7463 */
0a7de745 7464 if (!(so->so_state & SS_ISDISCONNECTED)) {
6d2010ae 7465 (void) soisdisconnected(so);
0a7de745 7466 }
6d2010ae 7467
0a7de745 7468 if (so->so_error == 0) {
6d2010ae 7469 so->so_error = EBADF;
0a7de745 7470 }
6d2010ae 7471
39236c6e
A
7472 if (rcv->sb_cc != 0) {
7473 rcv->sb_flags &= ~SB_SEL;
7474 selthreadclear(&rcv->sb_sel);
6d2010ae 7475 sbrelease(rcv);
39236c6e
A
7476 }
7477 if (snd->sb_cc != 0) {
7478 snd->sb_flags &= ~SB_SEL;
7479 selthreadclear(&snd->sb_sel);
6d2010ae 7480 sbrelease(snd);
39236c6e 7481 }
6d2010ae 7482 so->so_state |= SS_DEFUNCT;
39037602 7483 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
6d2010ae
A
7484
7485done:
0a7de745 7486 return 0;
6d2010ae 7487}
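
sosetdefunct() and sodefunct() form a two-phase operation: first try to mark the socket (which may be refused via SOF_NODEFUNCT or deferred for extended background idle), then tear it down if the mark stuck. A hedged sketch of the caller pattern, mirroring so_stop_extended_bk_idle() further down:

	int err = sosetdefunct(p, so, level, TRUE);     /* phase 1: try to mark */
	if (err == 0 && (so->so_flags & SOF_DEFUNCT)) {
		err = sodefunct(p, so, level);          /* phase 2: flush and disconnect */
	}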
316670eb 7488
3e170ce0
A
7489int
7490soresume(struct proc *p, struct socket *so, int locked)
7491{
0a7de745 7492 if (locked == 0) {
3e170ce0 7493 socket_lock(so, 1);
0a7de745 7494 }
3e170ce0
A
7495
7496 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
39037602
A
7497 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7498 "[%d,%d] resumed from bk idle\n",
7499 __func__, proc_selfpid(), proc_best_name(current_proc()),
7500 proc_pid(p), proc_best_name(p),
3e170ce0 7501 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 7502 SOCK_DOM(so), SOCK_TYPE(so));
3e170ce0
A
7503
7504 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7505 so->so_extended_bk_start = 0;
7506 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7507
7508 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7509 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7510 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7511 }
0a7de745 7512 if (locked == 0) {
3e170ce0 7513 socket_unlock(so, 1);
0a7de745 7514 }
3e170ce0 7515
0a7de745 7516 return 0;
3e170ce0
A
7517}
7518
7519/*
7520 * Does not attempt to account for sockets that are delegated from
7521 * the current process
7522 */
7523int
7524so_set_extended_bk_idle(struct socket *so, int optval)
7525{
7526 int error = 0;
7527
7528 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7529 SOCK_PROTO(so) != IPPROTO_TCP) {
7530		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7531 error = EOPNOTSUPP;
7532 } else if (optval == 0) {
7533 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7534
7535 soresume(current_proc(), so, 1);
7536 } else {
7537 struct proc *p = current_proc();
7538 int i;
7539 struct filedesc *fdp;
7540 int count = 0;
7541
5ba3f43e
A
7542 /*
7543 * Unlock socket to avoid lock ordering issue with
7544 * the proc fd table lock
0a7de745 7545 */
5ba3f43e
A
7546 socket_unlock(so, 0);
7547
3e170ce0
A
7548 proc_fdlock(p);
7549
7550 fdp = p->p_fd;
7551 for (i = 0; i < fdp->fd_nfiles; i++) {
7552 struct fileproc *fp = fdp->fd_ofiles[i];
7553 struct socket *so2;
7554
7555 if (fp == NULL ||
7556 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
0a7de745 7557 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
3e170ce0 7558 continue;
0a7de745 7559 }
3e170ce0
A
7560
7561 so2 = (struct socket *)fp->f_fglob->fg_data;
7562 if (so != so2 &&
0a7de745 7563 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
3e170ce0 7564 count++;
0a7de745
A
7565 }
7566 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
3e170ce0 7567 break;
0a7de745 7568 }
3e170ce0 7569 }
5ba3f43e
A
7570 proc_fdunlock(p);
7571
7572 socket_lock(so, 0);
7573
3e170ce0
A
7574 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7575 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7576 error = EBUSY;
7577 } else if (so->so_flags & SOF_DELEGATED) {
7578 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7579 error = EBUSY;
7580 } else {
7581 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7582 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7583 }
39037602 7584 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
3e170ce0 7585 "%s marked for extended bk idle\n",
39037602 7586 __func__, proc_selfpid(), proc_best_name(current_proc()),
3e170ce0
A
7587 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7588 SOCK_DOM(so), SOCK_TYPE(so),
7589 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
39037602 7590 "is" : "not");
3e170ce0
A
7591 }
7592
0a7de745 7593 return error;
3e170ce0
A
7594}
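
The socket_unlock()/proc_fdlock() sequence above exists because the proc fd-table lock sits before the socket lock in the lock order, so a socket-lock holder must drop its lock, take the fd-table lock, and only then retake the socket lock. The same dance in a generic, standalone form (pthreads, illustrative names):

#include <pthread.h>

static pthread_mutex_t fd_table_lock = PTHREAD_MUTEX_INITIALIZER;  /* outer */
static pthread_mutex_t sock_mtx      = PTHREAD_MUTEX_INITIALIZER;  /* inner */

/* Entered with sock_mtx held; must scan the fd table without
 * inverting the outer-before-inner lock order. */
static void
scan_fd_table(void)
{
	pthread_mutex_unlock(&sock_mtx);        /* drop the inner lock first */
	pthread_mutex_lock(&fd_table_lock);
	/* ... walk the fd table ... */
	pthread_mutex_unlock(&fd_table_lock);
	pthread_mutex_lock(&sock_mtx);          /* retake; revalidate state after */
}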
7595
7596static void
7597so_stop_extended_bk_idle(struct socket *so)
7598{
7599 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7600 so->so_extended_bk_start = 0;
7601
7602 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7603 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7604 /*
7605 * Force defunct
7606 */
7607 sosetdefunct(current_proc(), so,
7608 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7609 if (so->so_flags & SOF_DEFUNCT) {
7610 sodefunct(current_proc(), so,
7611 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7612 }
7613}
7614
7615void
7616so_drain_extended_bk_idle(struct socket *so)
7617{
7618 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7619 /*
7620 * Only penalize sockets that have outstanding data
7621 */
7622 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7623 so_stop_extended_bk_idle(so);
7624
7625 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7626 }
7627 }
7628}
7629
7630/*
7631 * Return value tells whether the socket is still in extended background idle
7632 */
7633int
7634so_check_extended_bk_idle_time(struct socket *so)
7635{
7636 int ret = 1;
7637
7638 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
39037602
A
7639 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7640 __func__, proc_selfpid(), proc_best_name(current_proc()),
3e170ce0 7641 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 7642 SOCK_DOM(so), SOCK_TYPE(so));
3e170ce0
A
7643 if (net_uptime() - so->so_extended_bk_start >
7644 soextbkidlestat.so_xbkidle_time) {
7645 so_stop_extended_bk_idle(so);
7646
7647 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7648
7649 ret = 0;
7650 } else {
7651 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7652
7653 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7654 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7655 }
7656 }
39037602 7657
0a7de745 7658 return ret;
3e170ce0
A
7659}
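
The expiry test above compares seconds-since-boot against so_xbkidle_time. The same shape in standalone form, with CLOCK_MONOTONIC standing in for the kernel's net_uptime() (a sketch, not xnu code):

#include <stdint.h>
#include <time.h>

static int
bk_idle_expired(uint64_t start_sec, uint64_t limit_sec)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);    /* monotonic, like net_uptime() */
	return ((uint64_t)ts.tv_sec - start_sec) > limit_sec;
}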
7660
7661void
7662resume_proc_sockets(proc_t p)
7663{
7664 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
0a7de745 7665 struct filedesc *fdp;
3e170ce0
A
7666 int i;
7667
7668 proc_fdlock(p);
7669 fdp = p->p_fd;
7670 for (i = 0; i < fdp->fd_nfiles; i++) {
0a7de745 7671 struct fileproc *fp;
3e170ce0
A
7672 struct socket *so;
7673
7674 fp = fdp->fd_ofiles[i];
39037602 7675 if (fp == NULL ||
3e170ce0 7676 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
0a7de745 7677 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
3e170ce0 7678 continue;
0a7de745 7679 }
3e170ce0
A
7680
7681 so = (struct socket *)fp->f_fglob->fg_data;
7682 (void) soresume(p, so, 0);
7683 }
7684 proc_fdunlock(p);
7685
7686 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7687 }
7688}
7689
316670eb
A
7690__private_extern__ int
7691so_set_recv_anyif(struct socket *so, int optval)
7692{
7693 int ret = 0;
7694
7695#if INET6
39236c6e 7696 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
316670eb 7697#else
39236c6e 7698 if (SOCK_DOM(so) == PF_INET) {
316670eb 7699#endif /* !INET6 */
0a7de745 7700 if (optval) {
316670eb 7701 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
0a7de745 7702 } else {
316670eb 7703 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
0a7de745 7704 }
316670eb
A
7705 }
7706
5ba3f43e 7707
0a7de745 7708 return ret;
316670eb
A
7709}
7710
7711__private_extern__ int
7712so_get_recv_anyif(struct socket *so)
7713{
7714 int ret = 0;
7715
7716#if INET6
39236c6e 7717 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
316670eb 7718#else
39236c6e 7719 if (SOCK_DOM(so) == PF_INET) {
316670eb
A
7720#endif /* !INET6 */
7721 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7722 }
7723
0a7de745 7724 return ret;
316670eb 7725}
39236c6e
A
7726
7727int
7728so_set_restrictions(struct socket *so, uint32_t vals)
7729{
7730 int nocell_old, nocell_new;
fe8ab488 7731 int noexpensive_old, noexpensive_new;
cb323159 7732 int noconstrained_old, noconstrained_new;
39236c6e
A
7733
7734 /*
7735 * Deny-type restrictions are trapdoors; once set they cannot be
7736 * unset for the lifetime of the socket. This allows them to be
7737 * issued by a framework on behalf of the application without
7738 * having to worry that they can be undone.
7739 *
7740	 * Note here that socket-level restrictions override any protocol
7741	 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
7742	 * restriction issued on the socket has a higher precedence
7743 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7744 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7745 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7746 */
7747 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
fe8ab488 7748 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
cb323159 7749 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
39236c6e 7750 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
3e170ce0 7751 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
cb323159 7752 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
39236c6e 7753 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
fe8ab488 7754 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
cb323159 7755 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
39236c6e
A
7756
7757 /* we can only set, not clear restrictions */
fe8ab488 7758 if ((nocell_new - nocell_old) == 0 &&
cb323159
A
7759 (noexpensive_new - noexpensive_old) == 0 &&
7760 (noconstrained_new - noconstrained_old) == 0) {
0a7de745
A
7761 return 0;
7762 }
39236c6e
A
7763#if INET6
7764 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7765#else
7766 if (SOCK_DOM(so) == PF_INET) {
7767#endif /* !INET6 */
fe8ab488 7768 if (nocell_new - nocell_old != 0) {
3e170ce0
A
7769 /*
7770 * if deny cellular is now set, do what's needed
7771 * for INPCB
7772 */
fe8ab488
A
7773 inp_set_nocellular(sotoinpcb(so));
7774 }
7775 if (noexpensive_new - noexpensive_old != 0) {
7776 inp_set_noexpensive(sotoinpcb(so));
7777 }
cb323159
A
7778 if (noconstrained_new - noconstrained_old != 0) {
7779 inp_set_noconstrained(sotoinpcb(so));
7780 }
39236c6e
A
7781 }
7782
0a7de745 7783 if (SOCK_DOM(so) == PF_MULTIPATH) {
5ba3f43e 7784 mptcp_set_restrictions(so);
0a7de745 7785 }
5ba3f43e 7786
0a7de745 7787 return 0;
39236c6e
A
7788}
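
The trapdoor property described in the comment above reduces to an OR-only update: deny bits accumulate and are never cleared for the socket's lifetime. Minimal sketch (flag values are placeholders):

#include <stdint.h>

#define DENY_CELLULAR   0x1u
#define DENY_EXPENSIVE  0x2u
#define DENY_ALL        (DENY_CELLULAR | DENY_EXPENSIVE)

/* OR-only: requested deny bits are added and can never be unset */
static uint32_t
restrictions_apply(uint32_t current, uint32_t requested)
{
	return current | (requested & DENY_ALL);
}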
7789
7790uint32_t
7791so_get_restrictions(struct socket *so)
7792{
0a7de745
A
7793 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7794 SO_RESTRICT_DENY_OUT |
7795 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
39236c6e
A
7796}
7797
39236c6e 7798int
cb323159 7799so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
39236c6e
A
7800{
7801 struct proc *ep = PROC_NULL;
7802 int error = 0;
7803
7804 /* pid 0 is reserved for kernel */
7805 if (epid == 0) {
7806 error = EINVAL;
7807 goto done;
7808 }
7809
7810 /*
7811 * If this is an in-kernel socket, prevent its delegate
7812 * association from changing unless the socket option is
7813 * coming from within the kernel itself.
7814 */
7815 if (so->last_pid == 0 && p != kernproc) {
7816 error = EACCES;
7817 goto done;
7818 }
7819
7820 /*
7821 * If this is issued by a process that's recorded as the
7822 * real owner of the socket, or if the pid is the same as
7823 * the process's own pid, then proceed. Otherwise ensure
7824 * that the issuing process has the necessary privileges.
7825 */
cb323159 7826 if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
39236c6e
A
7827 if ((error = priv_check_cred(kauth_cred_get(),
7828 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7829 error = EACCES;
7830 goto done;
7831 }
7832 }
7833
7834 /* Find the process that corresponds to the effective pid */
7835 if ((ep = proc_find(epid)) == PROC_NULL) {
7836 error = ESRCH;
7837 goto done;
7838 }
7839
7840 /*
7841 * If a process tries to delegate the socket to itself, then
7842 * there's really nothing to do; treat it as a way for the
7843 * delegate association to be cleared. Note that we check
7844 * the passed-in proc rather than calling proc_selfpid(),
7845 * as we need to check the process issuing the socket option
7846 * which could be kernproc. Given that we don't allow 0 for
7847 * effective pid, it means that a delegated in-kernel socket
7848 * stays delegated during its lifetime (which is probably OK.)
7849 */
7850 if (epid == proc_pid(p)) {
7851 so->so_flags &= ~SOF_DELEGATED;
7852 so->e_upid = 0;
7853 so->e_pid = 0;
7854 uuid_clear(so->e_uuid);
7855 } else {
7856 so->so_flags |= SOF_DELEGATED;
7857 so->e_upid = proc_uniqueid(ep);
7858 so->e_pid = proc_pid(ep);
0a7de745 7859 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
39236c6e 7860 }
cb323159
A
7861 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7862 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
7863 }
39236c6e
A
7864done:
7865 if (error == 0 && net_io_policy_log) {
7866 uuid_string_t buf;
7867
7868 uuid_unparse(so->e_uuid, buf);
7869 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7870 "euuid %s%s\n", __func__, proc_name_address(p),
3e170ce0
A
7871 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7872 SOCK_DOM(so), SOCK_TYPE(so),
7873 so->e_pid, proc_name_address(ep), buf,
39236c6e
A
7874 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7875 } else if (error != 0 && net_io_policy_log) {
7876 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7877 "ERROR (%d)\n", __func__, proc_name_address(p),
3e170ce0
A
7878 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7879 SOCK_DOM(so), SOCK_TYPE(so),
7880 epid, (ep == PROC_NULL) ? "PROC_NULL" :
39236c6e
A
7881 proc_name_address(ep), error);
7882 }
7883
fe8ab488
A
7884 /* Update this socket's policy upon success */
7885 if (error == 0) {
7886 so->so_policy_gencnt *= -1;
7887 so_update_policy(so);
7888#if NECP
7889 so_update_necp_policy(so, NULL, NULL);
7890#endif /* NECP */
7891 }
7892
0a7de745 7893 if (ep != PROC_NULL) {
39236c6e 7894 proc_rele(ep);
0a7de745 7895 }
39236c6e 7896
0a7de745 7897 return error;
39236c6e
A
7898}
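
so_set_effective_pid() and its UUID counterpart below back the SO_DELEGATED and SO_DELEGATED_UUID socket options, which are private (their definitions sit in the PRIVATE section of xnu's sys/socket.h, not the public SDK). A hedged userspace sketch, assuming that definition is visible:

#include <sys/types.h>
#include <sys/socket.h>

#ifndef SO_DELEGATED
#define SO_DELEGATED 0x1107     /* private option; value taken from xnu's sys/socket.h */
#endif

/* Delegate traffic attribution for socket `s` to process `epid`;
 * needs PRIV_NET_PRIVILEGED_SOCKET_DELEGATE unless delegating to
 * oneself or issued by the socket's real owner. */
static int
delegate_socket(int s, pid_t epid)
{
	return setsockopt(s, SOL_SOCKET, SO_DELEGATED, &epid, sizeof(epid));
}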
7899
7900int
cb323159 7901so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
39236c6e
A
7902{
7903 uuid_string_t buf;
7904 uuid_t uuid;
7905 int error = 0;
7906
7907 /* UUID must not be all-zeroes (reserved for kernel) */
7908 if (uuid_is_null(euuid)) {
7909 error = EINVAL;
3e170ce0 7910 goto done;
39236c6e
A
7911 }
7912
7913 /*
7914 * If this is an in-kernel socket, prevent its delegate
7915 * association from changing unless the socket option is
7916 * coming from within the kernel itself.
7917 */
7918 if (so->last_pid == 0 && p != kernproc) {
7919 error = EACCES;
7920 goto done;
7921 }
7922
7923 /* Get the UUID of the issuing process */
0a7de745 7924 proc_getexecutableuuid(p, uuid, sizeof(uuid));
39236c6e
A
7925
7926 /*
7927 * If this is issued by a process that's recorded as the
7928 * real owner of the socket, or if the uuid is the same as
7929 * the process's own uuid, then proceed. Otherwise ensure
7930 * that the issuing process has the necessary privileges.
7931 */
cb323159
A
7932 if (check_cred &&
7933 (uuid_compare(euuid, so->last_uuid) != 0 ||
7934 uuid_compare(euuid, uuid) != 0)) {
39236c6e
A
7935 if ((error = priv_check_cred(kauth_cred_get(),
7936 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7937 error = EACCES;
7938 goto done;
7939 }
7940 }
7941
7942 /*
7943 * If a process tries to delegate the socket to itself, then
7944 * there's really nothing to do; treat it as a way for the
7945 * delegate association to be cleared. Note that we check
7946 * the uuid of the passed-in proc rather than that of the
7947 * current process, as we need to check the process issuing
7948 * the socket option which could be kernproc itself. Given
7949 * that we don't allow 0 for effective uuid, it means that
7950 * a delegated in-kernel socket stays delegated during its
7951 * lifetime (which is okay.)
7952 */
7953 if (uuid_compare(euuid, uuid) == 0) {
7954 so->so_flags &= ~SOF_DELEGATED;
7955 so->e_upid = 0;
7956 so->e_pid = 0;
7957 uuid_clear(so->e_uuid);
7958 } else {
7959 so->so_flags |= SOF_DELEGATED;
7960 /*
7961 * Unlike so_set_effective_pid(), we only have the UUID
7962 * here and the process ID is not known. Inherit the
7963 * real {pid,upid} of the socket.
7964 */
7965 so->e_upid = so->last_upid;
7966 so->e_pid = so->last_pid;
7967 uuid_copy(so->e_uuid, euuid);
7968 }
cb323159
A
7969 /*
7970	 * The following will clear the effective process name as it is now
7971	 * the same as the real process's
7972 */
7973 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7974 (*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
7975 }
39236c6e
A
7976done:
7977 if (error == 0 && net_io_policy_log) {
7978 uuid_unparse(so->e_uuid, buf);
7979 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7980 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
3e170ce0 7981 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
39236c6e
A
7982 SOCK_TYPE(so), so->e_pid, buf,
7983 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7984 } else if (error != 0 && net_io_policy_log) {
7985 uuid_unparse(euuid, buf);
7986 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7987 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
3e170ce0 7988 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
39236c6e
A
7989 SOCK_TYPE(so), buf, error);
7990 }
7991
fe8ab488
A
7992 /* Update this socket's policy upon success */
7993 if (error == 0) {
7994 so->so_policy_gencnt *= -1;
7995 so_update_policy(so);
7996#if NECP
7997 so_update_necp_policy(so, NULL, NULL);
7998#endif /* NECP */
7999 }
8000
0a7de745 8001 return error;
39236c6e
A
8002}
8003
8004void
8005netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
8006 uint32_t ev_datalen)
8007{
8008 struct kev_msg ev_msg;
8009
8010 /*
8011 * A netpolicy event always starts with a netpolicy_event_data
8012	 * structure, but the caller can provide a longer event
8013 * structure to post, depending on the event code.
8014 */
0a7de745 8015 VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));
39236c6e 8016
0a7de745
A
8017 bzero(&ev_msg, sizeof(ev_msg));
8018 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8019 ev_msg.kev_class = KEV_NETWORK_CLASS;
8020 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
8021 ev_msg.event_code = ev_code;
39236c6e 8022
0a7de745 8023 ev_msg.dv[0].data_ptr = ev_data;
39236c6e
A
8024 ev_msg.dv[0].data_length = ev_datalen;
8025
8026 kev_post_msg(&ev_msg);
8027}
fe8ab488
A
8028
8029void
3e170ce0 8030socket_post_kev_msg(uint32_t ev_code,
fe8ab488
A
8031 struct kev_socket_event_data *ev_data,
8032 uint32_t ev_datalen)
8033{
8034 struct kev_msg ev_msg;
8035
8036 bzero(&ev_msg, sizeof(ev_msg));
8037 ev_msg.vendor_code = KEV_VENDOR_APPLE;
8038 ev_msg.kev_class = KEV_NETWORK_CLASS;
8039 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
8040 ev_msg.event_code = ev_code;
8041
8042 ev_msg.dv[0].data_ptr = ev_data;
0a7de745 8043 ev_msg.dv[0].data_length = ev_datalen;
fe8ab488
A
8044
8045 kev_post_msg(&ev_msg);
8046}
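
Events posted through kev_post_msg() can be observed from userspace over a PF_SYSTEM/SYSPROTO_EVENT socket. A hedged sketch of a listener filtered to Apple network-class events (error handling trimmed; the kernel-event API is declared in sys/kern_event.h):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/kern_event.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
	struct kev_request req = {
		.vendor_code  = KEV_VENDOR_APPLE,
		.kev_class    = KEV_NETWORK_CLASS,
		.kev_subclass = KEV_ANY_SUBCLASS,
	};
	char buf[1024];

	if (fd < 0 || ioctl(fd, SIOCSKEVFILT, &req) == -1) {
		perror("kernel event socket");
		return 1;
	}
	for (;;) {
		ssize_t n = recv(fd, buf, sizeof(buf), 0);
		if (n <= 0) {
			break;
		}
		struct kern_event_msg *msg = (struct kern_event_msg *)(void *)buf;
		printf("class %u subclass %u code %u\n",
		    msg->kev_class, msg->kev_subclass, msg->event_code);
	}
	close(fd);
	return 0;
}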
8047
8048void
8049socket_post_kev_msg_closed(struct socket *so)
8050{
8051 struct kev_socket_closed ev;
8052 struct sockaddr *socksa = NULL, *peersa = NULL;
8053 int err;
8054 bzero(&ev, sizeof(ev));
8055 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
8056 if (err == 0) {
8057 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
8058 &peersa);
8059 if (err == 0) {
8060 memcpy(&ev.ev_data.kev_sockname, socksa,
8061 min(socksa->sa_len,
0a7de745 8062 sizeof(ev.ev_data.kev_sockname)));
fe8ab488
A
8063 memcpy(&ev.ev_data.kev_peername, peersa,
8064 min(peersa->sa_len,
0a7de745 8065 sizeof(ev.ev_data.kev_peername)));
fe8ab488 8066 socket_post_kev_msg(KEV_SOCKET_CLOSED,
0a7de745 8067 &ev.ev_data, sizeof(ev));
fe8ab488
A
8068 }
8069 }
0a7de745 8070 if (socksa != NULL) {
fe8ab488 8071 FREE(socksa, M_SONAME);
0a7de745
A
8072 }
8073 if (peersa != NULL) {
fe8ab488 8074 FREE(peersa, M_SONAME);
0a7de745 8075 }
fe8ab488 8076}