/*
 * Copyright (c) 1998-2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/net_api_stats.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/in_var.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>
#include <libkern/section_keywords.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* MAC */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

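/*
 * ROUNDUP() rounds "a" up to the next multiple of "b"; the bit-masking
 * form below is valid only when "b" is a power of two.
 */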
#define	ROUNDUP(a, b)	(((a) + ((b) - 1)) & (~((b) - 1)))

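/*
 * On DEBUG/DEVELOPMENT kernels, log socket addresses verbatim; on release
 * kernels, scramble them with VM_KERNEL_ADDRPERM() so that raw kernel
 * pointers do not leak into logs.
 */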
#if DEBUG || DEVELOPMENT
#define	DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define	DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t so_cache_hw;		/* High water mark for socache */
static u_int32_t so_cache_timeouts;	/* number of timeouts */
static u_int32_t so_cache_max_freed;	/* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;

static lck_grp_t *so_cache_mtx_grp;
static lck_attr_t *so_cache_mtx_attr;
static lck_grp_attr_t *so_cache_mtx_grp_attr;
static lck_mtx_t *so_cache_mtx;

#include <machine/limits.h>

static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);

static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SYSCTL_DECL(_kern_ipc);

#define	EVEN_MORE_LOCKING_DEBUG 0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

static int socket_zone = M_SOCKET;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define	DBG_FNC_SOSEND_LIST	NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define	DBG_FNC_SORECEIVE_LIST	NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)

2d21ac55 250int somaxconn = SOMAXCONN;
39236c6e 251SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
0a7de745 252 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
1c79356b
A
253
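/*
 * Like the other tunables in this file, somaxconn is exported through
 * sysctl and can be inspected or changed from user space, e.g.
 * "sysctl kern.ipc.somaxconn" with the standard sysctl(8) utility
 * (usage sketch, not part of this file).
 */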
/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above. Be extra careful when setting this
 * to 1, because sending packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable. Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets
 * with clusters larger than 2 KB might lead to system panics or data
 * corruption. When set to 0, the system will respect SOF1_IF_2KCL,
 * which is set on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

39236c6e 331extern struct inpcbinfo tcbinfo;
2d21ac55
A
332
333/* TODO: these should be in header file */
334extern int get_inpcb_str_size(void);
335extern int get_tcp_str_size(void);
2d21ac55 336
0a7de745 337vm_size_t so_cache_zone_element_size;
91447636 338
3e170ce0
A
339static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
340 user_ssize_t *);
2d21ac55
A
341static void cached_sock_alloc(struct socket **, int);
342static void cached_sock_free(struct socket *);
91447636 343
3e170ce0
A
344/*
345 * Maximum of extended background idle sockets per process
346 * Set to zero to disable further setting of the option
347 */
348
0a7de745
A
349#define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
350#define SO_IDLE_BK_IDLE_TIME 600
351#define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
3e170ce0
A
352
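/*
 * Units for the defaults above: SO_IDLE_BK_IDLE_TIME is in seconds, and
 * SO_IDLE_BK_IDLE_RCV_HIWAT is a receive-buffer high water mark
 * (presumably bytes, i.e. 128 KB, matching how sb_hiwat is expressed);
 * see the descriptions of the sysctls that export them below.
 */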
struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");

void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof(socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
	if (so_cache_mtx == NULL) {
		panic("%s: unable to allocate so_cache_mtx\n", __func__);
		/* NOTREACHED */
	}
	STAILQ_INIT(&so_cache_head);

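	/*
	 * Each cached element holds a socket, an inpcb and a tcpcb laid
	 * out back to back; the two extra 4-byte pads below leave room for
	 * the word alignment that cached_sock_alloc() performs with ALIGN().
	 */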
	so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zinit(so_cache_zone_element_size,
	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
	sflt_init();
	socket_tclass_init();
#if MULTIPATH
	mp_pcbinit();
#endif /* MULTIPATH */
}

static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof(struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(so_cache_mtx);

		if (waitok) {
			*so = (struct socket *)zalloc(so_cache_zone);
		} else {
			*so = (struct socket *)zalloc_noblock(so_cache_zone);
		}

		if (*so == NULL) {
			return;
		}

		bzero((caddr_t)*so, sizeof(struct socket));

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof(struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}

static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count) {
			so_cache_hw = cached_sock_count;
		}

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(so_cache_mtx);
	}
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket. The check above achieves that.
		 */
		if (self == PROC_NULL) {
			self = current_proc();
		}

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof(so->last_uuid));
			if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
				(*so->so_proto->pr_update_last_owner)(so, self, NULL);
			}
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}

void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		(void) inp_update_policy(sotoinpcb(so));
	}
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
	}
}
#endif /* NECP */

boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT) {
			break;
		}

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head)) {
		rc = TRUE;
	}

	lck_mtx_unlock(so_cache_mtx);
	return rc;
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof(*so), socket_zone,
		    M_WAITOK);
		if (so != NULL) {
			bzero(so, sizeof(*so));
		}
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
		so->so_zone = socket_zone;

		/*
		 * Increment the socket allocation statistics
		 */
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);

#if CONFIG_MACF_SOCKET
		/* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return NULL;
		}
#endif /* MAC_SOCKET */
	}

	return so;
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0) {
		prp = pffindproto(dom, proto, type);
	} else {
		prp = pffindtype(dom, type);
	}

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL) {
			return EAFNOSUPPORT;
		}
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL) {
				return EPROTOTYPE;
			}
		}
		return EPROTONOSUPPORT;
	}
	if (prp->pr_type != type) {
		return EPROTOTYPE;
	}
	so = soalloc(1, dom, type);
	if (so == NULL) {
		return ENOBUFS;
	}

	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_MPTCP) {
		so->so_state |= SS_NBIO;
	}

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL)) {
		so->so_state |= SS_PRIV;
	}

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully.
		 */
		so->so_state |= SS_NOFDREF;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return error;
	}

	/*
	 * Note: needs so_pcb to be set after pru_attach
	 */
	if (prp->pr_update_last_owner != NULL) {
		(*prp->pr_update_last_owner)(so, p, ep);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2) {
		so->so_options |= SO_DEBUG;
	}
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (!(flags & SOCF_MPTCP) &&
	    proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain, system or multipath sockets as
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
	case PF_MULTIPATH:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return 0;
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return socreate_internal(dom, aso, type, proto, current_proc(), 0,
	    PROC_NULL);
}

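/*
 * Usage sketch for in-kernel callers (illustrative only):
 *
 *	struct socket *so;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		... use the socket ...
 *		(void) soclose(so);
 *	}
 */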
int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK
	 */
done:
	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}

/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
	}
out:
	if (dolock) {
		socket_unlock(so, 1);
	}

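	/*
	 * EJUSTRETURN is the in-kernel convention for "handled, nothing
	 * more to do": a socket filter or protocol that returns it has
	 * taken care of the bind itself, so remap it to success here.
	 */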
	if (error == EJUSTRETURN) {
		error = 0;
	}

	return error;
}

void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

#if CONTENT_FILTER
	cfil_sock_detach(so);
#endif /* CONTENT_FILTER */

	/* Delete the state allocated for msg queues on a socket */
	if (so->so_flags & SOF_ENABLE_MSGS) {
		FREE(so->so_msg_state, M_TEMP);
		so->so_msg_state = NULL;
	}
	VERIFY(so->so_msg_state == NULL);

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

#if CONFIG_MACF_SOCKET
	mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		FREE_ZONE(so, sizeof(*so), so->so_zone);
	}
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0) {
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
	}

	if (error) {
		if (error == EJUSTRETURN) {
			error = 0;
		}
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp)) {
		so->so_options |= SO_ACCEPTCONN;
	}
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue, either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn) {
		backlog = somaxconn;
	}

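	/*
	 * The clamp above is the implementation-defined behavior here: any
	 * out-of-range request (backlog <= 0 or backlog > somaxconn) is
	 * raised or lowered to somaxconn. With the default somaxconn
	 * (SOMAXCONN, historically 128), listen(fd, 0), listen(fd, -1) and
	 * listen(fd, 4096) would all yield a queue limit of 128.
	 */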
	so->so_qlimit = backlog;
out:
	socket_unlock(so, 1);
	return error;
}

/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets. The lock ordering is to
 * acquire the client socket first, before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_incqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 */
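/*
 * Typical usage (sketch; see soclose_locked() and sofreelastref() for
 * actual call sites):
 *
 *	so_acquire_accept_list(head, so);
 *	... examine or edit head->so_comp / head->so_incomp ...
 *	so_release_accept_list(head);
 *
 * Since so_acquire_accept_list() may drop and re-take the socket locks
 * while waiting for SOF1_ACCEPT_LIST_HELD, callers must revalidate any
 * state cached across the call.
 */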
void
so_acquire_accept_list(struct socket *head, struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (head->so_proto->pr_getlock == NULL) {
		return;
	}
	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
		head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
		return;
	}
	if (so != NULL) {
		socket_unlock(so, 0);
	}
	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
		so_accept_list_waits += 1;
		msleep((caddr_t)&head->so_incomp, mutex_held,
		    PSOCK | PCATCH, __func__, NULL);
	}
	head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
	if (so != NULL) {
		socket_unlock(head, 0);
		socket_lock(so, 0);
		socket_lock(head, 0);
	}
}

void
so_release_accept_list(struct socket *head)
{
	if (head->so_proto->pr_getlock != NULL) {
		lck_mtx_t *mutex_held;

		mutex_held = (*head->so_proto->pr_getlock)(head, 0);
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

		head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
		wakeup((caddr_t)&head->so_incomp);
	}
}

void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		/*
		 * Need to lock the listener when the protocol has
		 * per socket locks
		 */
		if (head->so_proto->pr_getlock != NULL) {
			socket_lock(head, 1);
			so_acquire_accept_list(head, so);
		}
		if (so->so_state & SS_INCOMP) {
			so->so_state &= ~SS_INCOMP;
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
			head->so_qlen--;
			so->so_head = NULL;

			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
		} else if (so->so_state & SS_COMP) {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue. If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
			so->so_event = sonullevent;
			return;
		} else {
			if (head->so_proto->pr_getlock != NULL) {
				so_release_accept_list(head);
				socket_unlock(head, 1);
			}
			printf("sofree: not queued\n");
		}
	}
	sowflush(so);
	sorflush(so);

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc) {
		sodealloc(so);
	}
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
		return;
	}
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;

	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0\n", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount) {
		soclose_wait_locked(so);
	}

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int persocklock = 0;
		int incomp_overflow_only;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		/*
		 * We can drop the lock on the listener once
		 * we've acquired the incoming list
		 */
		if (so->so_proto->pr_getlock != NULL) {
			persocklock = 1;
			so_acquire_accept_list(so, NULL);
			socket_unlock(so, 0);
		}
again:
		incomp_overflow_only = 1;

		TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
			/*
			 * Radar 5350314
			 * skip sockets thrown away by tcpdropdropblreq;
			 * they will get cleaned up by the garbage collection.
			 * otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW) {
				continue;
			}

			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			/*
			 * Radar 27945981
			 * The extra reference for the list insures the
			 * validity of the socket pointer when we perform the
			 * unlock of the head above
			 */
			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_incomp, sp, so_list);
				so->so_incqlen--;
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_incomp but !SS_INCOMP",
				    __func__, sp);
			}

			if (persocklock != 0) {
				socket_unlock(sp, 1);
			}
		}

		TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
			/* Dequeue from so_comp since sofree() won't do it */
			if (persocklock != 0) {
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;
				TAILQ_REMOVE(&so->so_comp, sp, so_list);
				so->so_qlen--;

				(void) soabort(sp);
			} else {
				panic("%s sp %p in so_comp but !SS_COMP",
				    __func__, sp);
			}

			if (persocklock) {
				socket_unlock(sp, 1);
			}
		}

		if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_incomp not empty\n", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (!TAILQ_EMPTY(&so->so_comp)) {
#if (DEBUG | DEVELOPMENT)
			panic("%s head %p so_comp not empty\n", __func__, so);
#endif /* (DEVELOPMENT || DEBUG) */

			goto again;
		}

		if (persocklock) {
			socket_lock(so, 0);
			so_release_accept_list(so);
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error) {
				goto drop;
			}
		}
		if (so->so_options & SO_LINGER) {
			lck_mtx_t *mutex_held;

			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO)) {
				goto drop;
			}
			if (so->so_proto->pr_getlock != NULL) {
				mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
			} else {
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			}
			while (so->so_state & SS_ISCONNECTED) {
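				/*
				 * so_linger is kept in 1/100ths of a second
				 * (each unit is 10 ms, per the arithmetic
				 * below); split it into seconds plus
				 * nanoseconds for the msleep() timeout.
				 */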
				ts.tv_sec = (so->so_linger / 100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the timer fires;
					 * don't report an error
					 */
					if (error == EWOULDBLOCK) {
						error = 0;
					}
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0) {
			error = error2;
		}
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if ((so->so_flags & SOF_KNOTE) != 0) {
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
	}

	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
	evsofree(so);

	VERIFY(so->so_usecount > 0);
	so->so_usecount--;
	sofree(so);
	return error;
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * if the FD is going away, but socket is
		 * retained in kernel remove its reference
		 */
		so->so_usecount--;
		if (so->so_usecount < 2) {
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
		}
	}
	socket_unlock(so, 1);
	return error;
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return error;
		}
	}
	return 0;
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock) {
		socket_lock(so, 1);
	}

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0) {
		panic("soaccept: !NOFDREF");
	}
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock) {
		socket_unlock(so, 1);
	}
	return error;
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return soacceptlock(so, nam, 1);
}

int
soacceptfilter(struct socket *so, struct socket *head)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s). For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway. This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication of the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~SS_NOFDREF;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return error;
}

2d21ac55
A
1652/*
1653 * Returns: 0 Success
1654 * EOPNOTSUPP Operation not supported on socket
1655 * EISCONN Socket is connected
1656 * <pru_connect>:EADDRNOTAVAIL Address not available.
1657 * <pru_connect>:EINVAL Invalid argument
1658 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1659 * <pru_connect>:EACCES Permission denied
1660 * <pru_connect>:EADDRINUSE Address in use
1661 * <pru_connect>:EAGAIN Resource unavailable, try again
1662 * <pru_connect>:EPERM Operation not permitted
1663 * <sf_connect_out>:??? [anything a filter writer might set]
1664 */
1665int
1666soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1c79356b 1667{
1c79356b
A
1668 int error;
1669 struct proc *p = current_proc();
1c79356b 1670
0a7de745 1671 if (dolock) {
2d21ac55 1672 socket_lock(so, 1);
0a7de745 1673 }
39236c6e
A
1674
1675 so_update_last_owner_locked(so, p);
1676 so_update_policy(so);
1677
fe8ab488
A
1678#if NECP
1679 so_update_necp_policy(so, NULL, nam);
1680#endif /* NECP */
1681
2d21ac55
A
1682 /*
1683 * If this is a listening socket or if this is a previously-accepted
1684 * socket that has been marked as inactive, reject the connect request.
1685 */
1686 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
6d2010ae
A
1687 error = EOPNOTSUPP;
1688 if (so->so_flags & SOF_DEFUNCT) {
39037602 1689 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
39236c6e 1690 "(%d)\n", __func__, proc_pid(p),
39037602 1691 proc_best_name(p),
3e170ce0 1692 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1693 SOCK_DOM(so), SOCK_TYPE(so), error);
6d2010ae 1694 }
0a7de745 1695 if (dolock) {
2d21ac55 1696 socket_unlock(so, 1);
0a7de745
A
1697 }
1698 return error;
91447636 1699 }
2d21ac55 1700
39236c6e 1701 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
0a7de745 1702 if (dolock) {
2d21ac55 1703 socket_unlock(so, 1);
0a7de745
A
1704 }
1705 return EPERM;
2d21ac55
A
1706 }
1707
1c79356b
A
1708 /*
1709 * If protocol is connection-based, can only connect once.
1710 * Otherwise, if connected, try to disconnect first.
1711 * This allows user to disconnect by connecting to, e.g.,
1712 * a null address.
1713 */
0a7de745 1714 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1c79356b 1715 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
2d21ac55 1716 (error = sodisconnectlocked(so)))) {
1c79356b 1717 error = EISCONN;
2d21ac55 1718 } else {
91447636
A
1719 /*
1720 * Run connect filter before calling protocol:
1721 * - non-blocking connect returns before completion;
1722 */
6d2010ae 1723 error = sflt_connectout(so, nam);
39236c6e 1724 if (error != 0) {
0a7de745 1725 if (error == EJUSTRETURN) {
91447636 1726 error = 0;
0a7de745 1727 }
6d2010ae 1728 } else {
39236c6e
A
1729 error = (*so->so_proto->pr_usrreqs->pru_connect)
1730 (so, nam, p);
91447636 1731 }
1c79356b 1732 }
0a7de745 1733 if (dolock) {
2d21ac55 1734 socket_unlock(so, 1);
0a7de745
A
1735 }
1736 return error;
1c79356b
A
1737}
1738
91447636 1739int
2d21ac55 1740soconnect(struct socket *so, struct sockaddr *nam)
91447636 1741{
0a7de745 1742 return soconnectlock(so, nam, 1);
91447636
A
1743}
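/*
 * Editor's sketch (illustrative, compiled out): soconnect() only
 * initiates the connection; for a blocking socket the caller then
 * waits for SS_ISCONNECTING to clear, in the classic BSD connect(2)
 * fashion.  The helper name below is invented; the pr_getlock dance
 * mirrors soabort() above.
 */
#if 0
static int
soconnect_wait_example(struct socket *so, struct sockaddr *nam)
{
	lck_mtx_t *mutex_held;
	int error;

	socket_lock(so, 1);
	error = soconnectlock(so, nam, 0);	/* lock already held */
	if (error == 0 && (so->so_state & SS_NBIO) == 0) {
		if (so->so_proto->pr_getlock != NULL) {
			mutex_held = (*so->so_proto->pr_getlock)(so, 0);
		} else {
			mutex_held = so->so_proto->pr_domain->dom_mtx;
		}
		while ((so->so_state & SS_ISCONNECTING) &&
		    so->so_error == 0) {
			error = msleep((caddr_t)&so->so_timeo, mutex_held,
			    PSOCK | PCATCH, "soconnect_wait_example", NULL);
			if (error != 0) {
				break;
			}
		}
		if (error == 0) {
			error = so->so_error;
			so->so_error = 0;
		}
	}
	socket_unlock(so, 1);
	return error;
}
#endif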
1744
2d21ac55
A
1745/*
1746 * Returns: 0 Success
1747 * <pru_connect2>:EINVAL[AF_UNIX]
1748 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1749 * <pru_connect2>:??? [other protocol families]
1750 *
1751 * Notes: <pru_connect2> is not supported by [TCP].
1752 */
1c79356b 1753int
2d21ac55 1754soconnect2(struct socket *so1, struct socket *so2)
1c79356b 1755{
1c79356b 1756 int error;
91447636 1757
0c530ab8 1758 socket_lock(so1, 1);
0a7de745 1759 if (so2->so_proto->pr_lock) {
0c530ab8 1760 socket_lock(so2, 1);
0a7de745 1761 }
1c79356b
A
1762
1763 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
2d21ac55 1764
0c530ab8 1765 socket_unlock(so1, 1);
0a7de745 1766 if (so2->so_proto->pr_lock) {
0c530ab8 1767 socket_unlock(so2, 1);
0a7de745
A
1768 }
1769 return error;
1c79356b
A
1770}
1771
39236c6e 1772int
813fb2f6
A
1773soconnectxlocked(struct socket *so, struct sockaddr *src,
1774 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
3e170ce0
A
1775 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1776 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
39236c6e
A
1777{
1778 int error;
1779
fe8ab488
A
1780 so_update_last_owner_locked(so, p);
1781 so_update_policy(so);
3e170ce0 1782
39236c6e
A
1783 /*
1784 * If this is a listening socket or if this is a previously-accepted
1785 * socket that has been marked as inactive, reject the connect request.
1786 */
1787 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1788 error = EOPNOTSUPP;
1789 if (so->so_flags & SOF_DEFUNCT) {
39037602 1790 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
39236c6e 1791 "(%d)\n", __func__, proc_pid(p),
39037602 1792 proc_best_name(p),
3e170ce0 1793 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1794 SOCK_DOM(so), SOCK_TYPE(so), error);
39236c6e 1795 }
0a7de745 1796 return error;
39236c6e
A
1797 }
1798
0a7de745
A
1799 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1800 return EPERM;
1801 }
39236c6e
A
1802
1803 /*
1804 * If protocol is connection-based, can only connect once
1805 * unless PR_MULTICONN is set. Otherwise, if connected,
1806 * try to disconnect first. This allows user to disconnect
1807 * by connecting to, e.g., a null address.
1808 */
0a7de745 1809 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
39236c6e
A
1810 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1811 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1812 (error = sodisconnectlocked(so)) != 0)) {
1813 error = EISCONN;
1814 } else {
1815 /*
1816 * Run connect filter before calling protocol:
1817 * - non-blocking connect returns before completion;
1818 */
813fb2f6 1819 error = sflt_connectout(so, dst);
39236c6e 1820 if (error != 0) {
490019cf
A
1821 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1822 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
0a7de745 1823 if (error == EJUSTRETURN) {
39236c6e 1824 error = 0;
0a7de745 1825 }
39236c6e
A
1826 } else {
1827 error = (*so->so_proto->pr_usrreqs->pru_connectx)
813fb2f6 1828 (so, src, dst, p, ifscope, aid, pcid,
3e170ce0 1829 flags, arg, arglen, auio, bytes_written);
39236c6e
A
1830 }
1831 }
1832
0a7de745 1833 return error;
39236c6e
A
1834}
1835
1c79356b 1836int
2d21ac55 1837sodisconnectlocked(struct socket *so)
1c79356b 1838{
1c79356b 1839 int error;
1c79356b
A
1840
1841 if ((so->so_state & SS_ISCONNECTED) == 0) {
1842 error = ENOTCONN;
1843 goto bad;
1844 }
1845 if (so->so_state & SS_ISDISCONNECTING) {
1846 error = EALREADY;
1847 goto bad;
1848 }
2d21ac55 1849
1c79356b 1850 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
0a7de745 1851 if (error == 0) {
91447636 1852 sflt_notify(so, sock_evt_disconnected, NULL);
0a7de745 1853 }
39236c6e 1854
1c79356b 1855bad:
0a7de745 1856 return error;
1c79356b 1857}
2d21ac55
A
1858
1859/* Locking version */
91447636 1860int
2d21ac55 1861sodisconnect(struct socket *so)
91447636 1862{
2d21ac55 1863 int error;
91447636
A
1864
1865 socket_lock(so, 1);
1866 error = sodisconnectlocked(so);
1867 socket_unlock(so, 1);
0a7de745 1868 return error;
91447636 1869}
1c79356b 1870
39236c6e 1871int
3e170ce0 1872sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
39236c6e
A
1873{
1874 int error;
1875
1876 /*
1877 * Call the protocol disconnectx handler; let it handle all
1878 * matters related to the connection state of this session.
1879 */
1880 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1881 if (error == 0) {
1882 /*
1883 * The event applies only for the session, not for
1884 * the disconnection of individual subflows.
1885 */
0a7de745 1886 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
39236c6e 1887 sflt_notify(so, sock_evt_disconnected, NULL);
0a7de745 1888 }
39236c6e 1889 }
0a7de745 1890 return error;
39236c6e
A
1891}
1892
1893int
3e170ce0 1894sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
39236c6e
A
1895{
1896 int error;
1897
1898 socket_lock(so, 1);
1899 error = sodisconnectxlocked(so, aid, cid);
1900 socket_unlock(so, 1);
0a7de745 1901 return error;
39236c6e
A
1902}
1903
0a7de745 1904#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
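/*
 * (Editor's note.)  For example, SBLOCKWAIT(MSG_DONTWAIT) evaluates
 * to 0, so a subsequent sblock() will not sleep for the sockbuf lock
 * and can fail with EWOULDBLOCK, while SBLOCKWAIT(0) evaluates to
 * SBL_WAIT and lets sblock() sleep until the lock is available.
 */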
91447636
A
1905
1906/*
1907 * sosendcheck will lock the socket buffer if it isn't locked and
1908 * verify that there is space for the data being inserted.
2d21ac55
A
1909 *
1910 * Returns: 0 Success
1911 * EPIPE
1912 * sblock:EWOULDBLOCK
1913 * sblock:EINTR
1914 * sbwait:EBADF
1915 * sbwait:EINTR
1916 * [so_error]:???
91447636 1917 */
39236c6e
A
1918int
1919sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1920 int32_t clen, int32_t atomic, int flags, int *sblocked,
1921 struct mbuf *control)
91447636 1922{
0a7de745 1923 int error = 0;
b0d623f7 1924 int32_t space;
0a7de745 1925 int assumelock = 0;
91447636
A
1926
1927restart:
1928 if (*sblocked == 0) {
3a60a9f5 1929 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
2d21ac55
A
1930 so->so_send_filt_thread != 0 &&
1931 so->so_send_filt_thread == current_thread()) {
3a60a9f5
A
1932 /*
1933 * We're being called recursively from a filter,
1934 * allow this to continue. Radar 4150520.
1935 * Don't set sblocked because we don't want
1936 * to perform an unlock later.
1937 */
1938 assumelock = 1;
2d21ac55 1939 } else {
3a60a9f5
A
1940 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1941 if (error) {
0a7de745 1942 if (so->so_flags & SOF_DEFUNCT) {
6d2010ae 1943 goto defunct;
0a7de745
A
1944 }
1945 return error;
3a60a9f5
A
1946 }
1947 *sblocked = 1;
1948 }
91447636 1949 }
2d21ac55
A
1950
1951 /*
6d2010ae
A
1952 * If a send attempt is made on a socket that has been marked
1953 * as inactive (disconnected), reject the request.
2d21ac55 1954 */
6d2010ae
A
1955 if (so->so_flags & SOF_DEFUNCT) {
1956defunct:
1957 error = EPIPE;
39037602
A
1958 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1959 __func__, proc_selfpid(), proc_best_name(current_proc()),
3e170ce0 1960 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1961 SOCK_DOM(so), SOCK_TYPE(so), error);
0a7de745 1962 return error;
6d2010ae 1963 }
2d21ac55 1964
fe8ab488
A
1965 if (so->so_state & SS_CANTSENDMORE) {
1966#if CONTENT_FILTER
1967 /*
 1968 * Can re-inject data of half-closed connections
1969 */
1970 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
0a7de745
A
1971 so->so_snd.sb_cfil_thread == current_thread() &&
1972 cfil_sock_data_pending(&so->so_snd) != 0) {
fe8ab488 1973 CFIL_LOG(LOG_INFO,
0a7de745
A
1974 "so %llx ignore SS_CANTSENDMORE",
1975 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1976 } else
fe8ab488 1977#endif /* CONTENT_FILTER */
0a7de745 1978 return EPIPE;
fe8ab488 1979 }
91447636
A
1980 if (so->so_error) {
1981 error = so->so_error;
1982 so->so_error = 0;
0a7de745 1983 return error;
91447636 1984 }
2d21ac55 1985
91447636 1986 if ((so->so_state & SS_ISCONNECTED) == 0) {
2d21ac55 1987 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
fe8ab488 1988 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
3e170ce0 1989 (resid != 0 || clen == 0) &&
0a7de745
A
1990 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1991 return ENOTCONN;
1992 }
cb323159 1993 } else if (addr == 0) {
0a7de745
A
1994 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1995 ENOTCONN : EDESTADDRREQ;
2d21ac55 1996 }
91447636 1997 }
3e170ce0 1998
0a7de745 1999 if (so->so_flags & SOF_ENABLE_MSGS) {
39236c6e 2000 space = msgq_sbspace(so, control);
0a7de745 2001 } else {
39236c6e 2002 space = sbspace(&so->so_snd);
0a7de745 2003 }
39236c6e 2004
0a7de745 2005 if (flags & MSG_OOB) {
91447636 2006 space += 1024;
0a7de745 2007 }
91447636 2008 if ((atomic && resid > so->so_snd.sb_hiwat) ||
0a7de745
A
2009 clen > so->so_snd.sb_hiwat) {
2010 return EMSGSIZE;
2011 }
39236c6e 2012
316670eb 2013 if ((space < resid + clen &&
3e170ce0
A
2014 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2015 space < clen)) ||
316670eb 2016 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
3e170ce0
A
2017 /*
2018 * don't block the connectx call when there's more data
2019 * than can be copied.
2020 */
2021 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2022 if (space == 0) {
0a7de745 2023 return EWOULDBLOCK;
3e170ce0
A
2024 }
2025 if (space < (int32_t)so->so_snd.sb_lowat) {
0a7de745 2026 return 0;
3e170ce0
A
2027 }
2028 }
2d21ac55
A
2029 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2030 assumelock) {
0a7de745 2031 return EWOULDBLOCK;
3a60a9f5 2032 }
0a7de745 2033 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
6d2010ae 2034 *sblocked = 0;
91447636
A
2035 error = sbwait(&so->so_snd);
2036 if (error) {
0a7de745 2037 if (so->so_flags & SOF_DEFUNCT) {
6d2010ae 2038 goto defunct;
0a7de745
A
2039 }
2040 return error;
91447636
A
2041 }
2042 goto restart;
2043 }
0a7de745 2044 return 0;
91447636
A
2045}
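/*
 * Editor's sketch (illustrative, compiled out): the contract between
 * sosendcheck() and its caller, modeled on sosend() below.  The
 * caller holds the socket lock; *sblocked reports whether
 * sosendcheck() left the send buffer locked, which determines how to
 * unlock on the way out.  The function name is invented.
 */
#if 0
static int
sosend_skeleton_example(struct socket *so, struct sockaddr *addr,
    struct mbuf *control, user_ssize_t resid, int flags)
{
	int sblocked = 0;
	int error;

	socket_lock(so, 1);
	error = sosendcheck(so, addr, resid,
	    control ? control->m_len : 0, sosendallatonce(so), flags,
	    &sblocked, control);
	if (error == 0) {
		/* space is guaranteed: build mbufs, then call pru_send */
	}
	if (sblocked) {
		sbunlock(&so->so_snd, FALSE);	/* also unlocks socket */
	} else {
		socket_unlock(so, 1);
	}
	return error;
}
#endif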
2046
1c79356b
A
2047/*
2048 * Send on a socket.
2049 * If send must go all at once and message is larger than
2050 * send buffering, then hard error.
2051 * Lock against other senders.
2052 * If must go all at once and not enough room now, then
2053 * inform user that this would block and do nothing.
2054 * Otherwise, if nonblocking, send as much as possible.
2055 * The data to be sent is described by "uio" if nonzero,
2056 * otherwise by the mbuf chain "top" (which must be null
2057 * if uio is not). Data provided in mbuf chain must be small
2058 * enough to send all at once.
2059 *
2060 * Returns nonzero on error, timeout or signal; callers
2061 * must check for short counts if EINTR/ERESTART are returned.
2062 * Data and control buffers are freed on return.
2d21ac55
A
2063 *
2064 * Returns: 0 Success
2065 * EOPNOTSUPP
2066 * EINVAL
2067 * ENOBUFS
2068 * uiomove:EFAULT
2069 * sosendcheck:EPIPE
2070 * sosendcheck:EWOULDBLOCK
2071 * sosendcheck:EINTR
2072 * sosendcheck:EBADF
2073 * sosendcheck:EINTR
2074 * sosendcheck:??? [value from so_error]
2075 * <pru_send>:ECONNRESET[TCP]
2076 * <pru_send>:EINVAL[TCP]
2077 * <pru_send>:ENOBUFS[TCP]
2078 * <pru_send>:EADDRINUSE[TCP]
2079 * <pru_send>:EADDRNOTAVAIL[TCP]
2080 * <pru_send>:EAFNOSUPPORT[TCP]
2081 * <pru_send>:EACCES[TCP]
2082 * <pru_send>:EAGAIN[TCP]
2083 * <pru_send>:EPERM[TCP]
2084 * <pru_send>:EMSGSIZE[TCP]
2085 * <pru_send>:EHOSTUNREACH[TCP]
2086 * <pru_send>:ENETUNREACH[TCP]
2087 * <pru_send>:ENETDOWN[TCP]
2088 * <pru_send>:ENOMEM[TCP]
2089 * <pru_send>:ENOBUFS[TCP]
2090 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2091 * <pru_send>:EINVAL[AF_UNIX]
2092 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2093 * <pru_send>:EPIPE[AF_UNIX]
2094 * <pru_send>:ENOTCONN[AF_UNIX]
2095 * <pru_send>:EISCONN[AF_UNIX]
2096 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2097 * <sf_data_out>:??? [whatever a filter author chooses]
2098 *
2099 * Notes: Other <pru_send> returns depend on the protocol family; all
2100 * <sf_data_out> returns depend on what the filter author causes
2101 * their filter to return.
1c79356b
A
2102 */
2103int
2d21ac55
A
2104sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2105 struct mbuf *top, struct mbuf *control, int flags)
1c79356b
A
2106{
2107 struct mbuf **mp;
39236c6e 2108 struct mbuf *m, *freelist = NULL;
3e170ce0 2109 user_ssize_t space, len, resid, orig_resid;
91447636 2110 int clen = 0, error, dontroute, mlen, sendflags;
1c79356b 2111 int atomic = sosendallatonce(so) || top;
91447636 2112 int sblocked = 0;
1c79356b 2113 struct proc *p = current_proc();
39236c6e 2114 struct mbuf *control_copy = NULL;
3e170ce0
A
2115 uint16_t headroom = 0;
2116 boolean_t en_tracing = FALSE;
1c79356b 2117
0a7de745 2118 if (uio != NULL) {
91447636 2119 resid = uio_resid(uio);
0a7de745 2120 } else {
1c79356b 2121 resid = top->m_pkthdr.len;
0a7de745 2122 }
39236c6e 2123
2d21ac55
A
2124 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2125 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1c79356b 2126
91447636 2127 socket_lock(so, 1);
fe8ab488 2128
3e170ce0
A
2129 /*
 2130 * trace if tracing is enabled, for network (vs. unix) sockets
 2131 * on non-loopback interfaces
2132 */
2133 if (ENTR_SHOULDTRACE &&
2134 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2135 struct inpcb *inp = sotoinpcb(so);
2136 if (inp->inp_last_outifp != NULL &&
2137 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2138 en_tracing = TRUE;
2139 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2140 VM_KERNEL_ADDRPERM(so),
2141 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2142 (int64_t)resid);
2143 orig_resid = resid;
2144 }
2145 }
2146
fe8ab488
A
2147 /*
2148 * Re-injection should not affect process accounting
2149 */
2150 if ((flags & MSG_SKIPCFIL) == 0) {
3e170ce0
A
2151 so_update_last_owner_locked(so, p);
2152 so_update_policy(so);
2153
fe8ab488 2154#if NECP
3e170ce0 2155 so_update_necp_policy(so, NULL, addr);
fe8ab488
A
2156#endif /* NECP */
2157 }
3e170ce0 2158
2d21ac55
A
2159 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2160 error = EOPNOTSUPP;
5ba3f43e 2161 goto out_locked;
2d21ac55 2162 }
91447636 2163
1c79356b
A
2164 /*
2165 * In theory resid should be unsigned.
2166 * However, space must be signed, as it might be less than 0
2167 * if we over-committed, and we must use a signed comparison
2168 * of space and resid. On the other hand, a negative resid
2169 * causes us to loop sending 0-length segments to the protocol.
2170 *
39236c6e
A
2171 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2172 * But it will be used by sockets doing message delivery.
2173 *
fe8ab488 2174 * Note: We limit resid to be a positive int value as we use
39236c6e 2175 * imin() to set bytes_to_copy -- radr://14558484
1c79356b 2176 */
fe8ab488 2177 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
39236c6e 2178 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1c79356b 2179 error = EINVAL;
5ba3f43e 2180 goto out_locked;
1c79356b
A
2181 }
2182
39236c6e
A
2183 dontroute = (flags & MSG_DONTROUTE) &&
2184 (so->so_options & SO_DONTROUTE) == 0 &&
1c79356b 2185 (so->so_proto->pr_flags & PR_ATOMIC);
b0d623f7 2186 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
39236c6e 2187
0a7de745 2188 if (control != NULL) {
1c79356b 2189 clen = control->m_len;
0a7de745 2190 }
1c79356b 2191
0a7de745 2192 if (soreserveheadroom != 0) {
3e170ce0 2193 headroom = so->so_pktheadroom;
0a7de745 2194 }
3e170ce0 2195
1c79356b 2196 do {
2d21ac55 2197 error = sosendcheck(so, addr, resid, clen, atomic, flags,
39236c6e 2198 &sblocked, control);
0a7de745 2199 if (error) {
5ba3f43e 2200 goto out_locked;
0a7de745 2201 }
39236c6e 2202
1c79356b 2203 mp = &top;
0a7de745 2204 if (so->so_flags & SOF_ENABLE_MSGS) {
39236c6e 2205 space = msgq_sbspace(so, control);
0a7de745 2206 } else {
39236c6e 2207 space = sbspace(&so->so_snd) - clen;
0a7de745 2208 }
39236c6e 2209 space += ((flags & MSG_OOB) ? 1024 : 0);
fa4905b1 2210
1c79356b 2211 do {
2d21ac55 2212 if (uio == NULL) {
91447636
A
2213 /*
2214 * Data is prepackaged in "top".
2215 */
2216 resid = 0;
0a7de745 2217 if (flags & MSG_EOR) {
1c79356b 2218 top->m_flags |= M_EOR;
0a7de745 2219 }
91447636 2220 } else {
2d21ac55
A
2221 int chainlength;
2222 int bytes_to_copy;
2223 boolean_t jumbocl;
fe8ab488 2224 boolean_t bigcl;
3e170ce0 2225 int bytes_to_alloc;
2d21ac55 2226
b0d623f7 2227 bytes_to_copy = imin(resid, space);
2d21ac55 2228
3e170ce0 2229 bytes_to_alloc = bytes_to_copy;
0a7de745 2230 if (top == NULL) {
3e170ce0 2231 bytes_to_alloc += headroom;
0a7de745 2232 }
3e170ce0 2233
0a7de745 2234 if (sosendminchain > 0) {
91447636 2235 chainlength = 0;
0a7de745 2236 } else {
91447636 2237 chainlength = sosendmaxchain;
0a7de745 2238 }
2d21ac55 2239
fe8ab488 2240 /*
3e170ce0
A
 2241 * Use big 4 KB clusters when the outgoing interface
2242 * does not prefer 2 KB clusters
fe8ab488 2243 */
3e170ce0 2244 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
fe8ab488 2245 sosendbigcl_ignore_capab;
3e170ce0 2246
2d21ac55
A
2247 /*
2248 * Attempt to use larger than system page-size
2249 * clusters for large writes only if there is
2250 * a jumbo cluster pool and if the socket is
2251 * marked accordingly.
2252 */
2253 jumbocl = sosendjcl && njcl > 0 &&
2254 ((so->so_flags & SOF_MULTIPAGES) ||
fe8ab488
A
2255 sosendjcl_ignore_capab) &&
2256 bigcl;
2d21ac55 2257
91447636 2258 socket_unlock(so, 0);
2d21ac55 2259
91447636
A
2260 do {
2261 int num_needed;
39236c6e 2262 int hdrs_needed = (top == NULL) ? 1 : 0;
2d21ac55 2263
91447636 2264 /*
2d21ac55
A
 2265 * try to maintain a local cache of mbuf
 2266 * clusters needed to complete this
 2267 * write. The list is further limited to
 2268 * the number that are currently needed
 2269 * to fill the socket. This mechanism
 2270 * allows a large number of mbufs/
 2271 * clusters to be grabbed under a single
 2272 * mbuf lock... if we can't get any
 2273 * clusters, then fall back to trying
 2274 * for mbufs. If we fail early (or
 2275 * miscalculate the number needed), make
 2276 * sure to release any clusters we
 2277 * haven't yet consumed.
91447636 2278 */
2d21ac55 2279 if (freelist == NULL &&
3e170ce0 2280 bytes_to_alloc > MBIGCLBYTES &&
6d2010ae 2281 jumbocl) {
2d21ac55 2282 num_needed =
3e170ce0 2283 bytes_to_alloc / M16KCLBYTES;
2d21ac55 2284
3e170ce0 2285 if ((bytes_to_alloc -
2d21ac55 2286 (num_needed * M16KCLBYTES))
0a7de745 2287 >= MINCLSIZE) {
2d21ac55 2288 num_needed++;
0a7de745 2289 }
91447636 2290
2d21ac55
A
2291 freelist =
2292 m_getpackets_internal(
0a7de745
A
2293 (unsigned int *)&num_needed,
2294 hdrs_needed, M_WAIT, 0,
2295 M16KCLBYTES);
2d21ac55
A
2296 /*
2297 * Fall back to 4K cluster size
2298 * if allocation failed
2299 */
2300 }
2301
2302 if (freelist == NULL &&
3e170ce0 2303 bytes_to_alloc > MCLBYTES &&
fe8ab488 2304 bigcl) {
2d21ac55 2305 num_needed =
3e170ce0 2306 bytes_to_alloc / MBIGCLBYTES;
2d21ac55 2307
3e170ce0 2308 if ((bytes_to_alloc -
6d2010ae 2309 (num_needed * MBIGCLBYTES)) >=
0a7de745 2310 MINCLSIZE) {
91447636 2311 num_needed++;
0a7de745 2312 }
2d21ac55
A
2313
2314 freelist =
2315 m_getpackets_internal(
0a7de745
A
2316 (unsigned int *)&num_needed,
2317 hdrs_needed, M_WAIT, 0,
2318 MBIGCLBYTES);
2d21ac55
A
2319 /*
2320 * Fall back to cluster size
2321 * if allocation failed
2322 */
91447636 2323 }
2d21ac55 2324
3e170ce0
A
2325 /*
 2326 * Allocate a cluster as we want to
 2327 * avoid splitting the data into more
 2328 * than one segment; using MINCLSIZE
 2329 * would lead us to allocate two mbufs
2330 */
2331 if (soreserveheadroom != 0 &&
2332 freelist == NULL &&
2333 ((top == NULL &&
2334 bytes_to_alloc > _MHLEN) ||
2335 bytes_to_alloc > _MLEN)) {
2336 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2337 MCLBYTES;
2338 freelist =
2339 m_getpackets_internal(
0a7de745
A
2340 (unsigned int *)&num_needed,
2341 hdrs_needed, M_WAIT, 0,
2342 MCLBYTES);
3e170ce0
A
2343 /*
2344 * Fall back to a single mbuf
2345 * if allocation failed
2346 */
2347 } else if (freelist == NULL &&
2348 bytes_to_alloc > MINCLSIZE) {
2d21ac55 2349 num_needed =
3e170ce0 2350 bytes_to_alloc / MCLBYTES;
2d21ac55 2351
3e170ce0 2352 if ((bytes_to_alloc -
2d21ac55 2353 (num_needed * MCLBYTES)) >=
0a7de745 2354 MINCLSIZE) {
91447636 2355 num_needed++;
0a7de745 2356 }
2d21ac55
A
2357
2358 freelist =
2359 m_getpackets_internal(
0a7de745
A
2360 (unsigned int *)&num_needed,
2361 hdrs_needed, M_WAIT, 0,
2362 MCLBYTES);
2d21ac55
A
2363 /*
2364 * Fall back to a single mbuf
2365 * if allocation failed
2366 */
91447636 2367 }
3e170ce0
A
2368 /*
2369 * For datagram protocols, leave
2370 * headroom for protocol headers
2371 * in the first cluster of the chain
2372 */
2373 if (freelist != NULL && atomic &&
2374 top == NULL && headroom > 0) {
2375 freelist->m_data += headroom;
2376 }
39037602 2377
3e170ce0
A
2378 /*
2379 * Fall back to regular mbufs without
2380 * reserving the socket headroom
2381 */
91447636 2382 if (freelist == NULL) {
0a7de745 2383 if (top == NULL) {
2d21ac55
A
2384 MGETHDR(freelist,
2385 M_WAIT, MT_DATA);
0a7de745 2386 } else {
2d21ac55
A
2387 MGET(freelist,
2388 M_WAIT, MT_DATA);
0a7de745 2389 }
91447636
A
2390
2391 if (freelist == NULL) {
2392 error = ENOBUFS;
2393 socket_lock(so, 0);
5ba3f43e 2394 goto out_locked;
91447636
A
2395 }
2396 /*
2d21ac55
A
2397 * For datagram protocols,
2398 * leave room for protocol
2399 * headers in first mbuf.
91447636 2400 */
39236c6e 2401 if (atomic && top == NULL &&
2d21ac55
A
2402 bytes_to_copy < MHLEN) {
2403 MH_ALIGN(freelist,
2404 bytes_to_copy);
2405 }
91447636
A
2406 }
2407 m = freelist;
2408 freelist = m->m_next;
2409 m->m_next = NULL;
2d21ac55 2410
0a7de745 2411 if ((m->m_flags & M_EXT)) {
3e170ce0 2412 mlen = m->m_ext.ext_size -
d9a64523 2413 M_LEADINGSPACE(m);
0a7de745 2414 } else if ((m->m_flags & M_PKTHDR)) {
2d21ac55 2415 mlen =
d9a64523 2416 MHLEN - M_LEADINGSPACE(m);
0a7de745 2417 } else {
d9a64523 2418 mlen = MLEN - M_LEADINGSPACE(m);
0a7de745 2419 }
b0d623f7 2420 len = imin(mlen, bytes_to_copy);
91447636
A
2421
2422 chainlength += len;
2d21ac55 2423
91447636 2424 space -= len;
fa4905b1 2425
2d21ac55 2426 error = uiomove(mtod(m, caddr_t),
b0d623f7 2427 len, uio);
2d21ac55 2428
91447636 2429 resid = uio_resid(uio);
2d21ac55 2430
91447636
A
2431 m->m_len = len;
2432 *mp = m;
2433 top->m_pkthdr.len += len;
0a7de745 2434 if (error) {
91447636 2435 break;
0a7de745 2436 }
91447636
A
2437 mp = &m->m_next;
2438 if (resid <= 0) {
0a7de745 2439 if (flags & MSG_EOR) {
91447636 2440 top->m_flags |= M_EOR;
0a7de745 2441 }
91447636
A
2442 break;
2443 }
2444 bytes_to_copy = min(resid, space);
2d21ac55
A
2445 } while (space > 0 &&
2446 (chainlength < sosendmaxchain || atomic ||
2447 resid < MINCLSIZE));
2448
91447636 2449 socket_lock(so, 0);
2d21ac55 2450
0a7de745 2451 if (error) {
5ba3f43e 2452 goto out_locked;
0a7de745 2453 }
91447636 2454 }
2d21ac55 2455
0a7de745 2456 if (dontroute) {
2d21ac55 2457 so->so_options |= SO_DONTROUTE;
0a7de745 2458 }
2d21ac55 2459
3e170ce0
A
2460 /*
2461 * Compute flags here, for pru_send and NKEs
2462 *
 2463 * If the user set MSG_EOF, the protocol
 2464 * understands this flag, and there is nothing left to
 2465 * send, then use PRU_SEND_EOF instead of PRU_SEND.
2466 */
2d21ac55 2467 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2d21ac55 2468 ((flags & MSG_EOF) &&
3e170ce0
A
2469 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2470 (resid <= 0)) ? PRUS_EOF :
2471 /* If there is more to send set PRUS_MORETOCOME */
2472 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2473
fe8ab488
A
2474 if ((flags & MSG_SKIPCFIL) == 0) {
2475 /*
2476 * Socket filter processing
2477 */
2478 error = sflt_data_out(so, addr, &top,
2479 &control, (sendflags & MSG_OOB) ?
2480 sock_data_filt_flag_oob : 0);
2481 if (error) {
2482 if (error == EJUSTRETURN) {
2483 error = 0;
2484 clen = 0;
2485 control = NULL;
2486 top = NULL;
2487 }
5ba3f43e 2488 goto out_locked;
91447636 2489 }
fe8ab488
A
2490#if CONTENT_FILTER
2491 /*
2492 * Content filter processing
2493 */
2494 error = cfil_sock_data_out(so, addr, top,
d9a64523 2495 control, sendflags);
fe8ab488
A
2496 if (error) {
2497 if (error == EJUSTRETURN) {
2498 error = 0;
2499 clen = 0;
2500 control = NULL;
2501 top = NULL;
0a7de745 2502 }
5ba3f43e 2503 goto out_locked;
fe8ab488
A
2504 }
2505#endif /* CONTENT_FILTER */
1c79356b 2506 }
39236c6e
A
2507 if (so->so_flags & SOF_ENABLE_MSGS) {
2508 /*
2509 * Make a copy of control mbuf,
2510 * so that msg priority can be
2511 * passed to subsequent mbufs.
2512 */
2513 control_copy = m_dup(control, M_NOWAIT);
2514 }
6d2010ae 2515 error = (*so->so_proto->pr_usrreqs->pru_send)
39236c6e
A
2516 (so, sendflags, top, addr, control, p);
2517
0a7de745 2518 if (dontroute) {
2d21ac55 2519 so->so_options &= ~SO_DONTROUTE;
0a7de745 2520 }
2d21ac55
A
2521
2522 clen = 0;
39236c6e
A
2523 control = control_copy;
2524 control_copy = NULL;
2525 top = NULL;
2d21ac55 2526 mp = &top;
0a7de745 2527 if (error) {
5ba3f43e 2528 goto out_locked;
0a7de745 2529 }
1c79356b
A
2530 } while (resid && space > 0);
2531 } while (resid);
2532
5ba3f43e 2533out_locked:
0a7de745
A
2534 if (sblocked) {
2535 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2536 } else {
3a60a9f5 2537 socket_unlock(so, 1);
0a7de745
A
2538 }
2539 if (top != NULL) {
1c79356b 2540 m_freem(top);
0a7de745
A
2541 }
2542 if (control != NULL) {
1c79356b 2543 m_freem(control);
0a7de745
A
2544 }
2545 if (freelist != NULL) {
2d21ac55 2546 m_freem_list(freelist);
0a7de745
A
2547 }
2548 if (control_copy != NULL) {
39236c6e 2549 m_freem(control_copy);
0a7de745 2550 }
1c79356b 2551
5ba3f43e 2552 soclearfastopen(so);
3e170ce0
A
2553
2554 if (en_tracing) {
2555 /* resid passed here is the bytes left in uio */
2556 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2557 VM_KERNEL_ADDRPERM(so),
2558 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2559 (int64_t)(orig_resid - resid));
2560 }
2561 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2562 so->so_snd.sb_cc, space, error);
1c79356b 2563
0a7de745 2564 return error;
1c79356b
A
2565}
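/*
 * Editor's sketch (illustrative, compiled out): the "prepackaged"
 * calling convention documented above -- when uio is NULL, the data
 * comes from the mbuf chain "top", whose m_pkthdr.len supplies resid,
 * and both top and control are freed on return even on error.  The
 * helper name is invented.
 */
#if 0
static int
sosend_pkt_example(struct socket *so, struct sockaddr *to,
    struct mbuf *pkt)
{
	/* pkt: an M_PKTHDR chain with m_pkthdr.len already set */
	return sosend(so, to, NULL, pkt, NULL, 0);
}
#endif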
2566
d9a64523
A
2567int
2568sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2569{
cb323159 2570 struct mbuf *m0 = NULL, *control_end = NULL;
d9a64523
A
2571
2572 socket_lock_assert_owned(so);
2573
2574 /*
 2575 * top must point to the mbuf chain to be sent.
 2576 * If control is not NULL, top must be a packet header
2577 */
2578 VERIFY(top != NULL &&
0a7de745 2579 (control == NULL || top->m_flags & M_PKTHDR));
d9a64523
A
2580
2581 /*
2582 * If control is not passed in, see if we can get it
2583 * from top.
2584 */
2585 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2586 // Locate start of control if present and start of data
2587 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2588 if (m0->m_flags & M_PKTHDR) {
2589 top = m0;
2590 break;
2591 } else if (m0->m_type == MT_CONTROL) {
2592 if (control == NULL) {
2593 // Found start of control
2594 control = m0;
2595 }
2596 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2597 // Found end of control
2598 control_end = m0;
2599 }
2600 }
2601 }
0a7de745 2602 if (control_end != NULL) {
d9a64523 2603 control_end->m_next = NULL;
0a7de745 2604 }
d9a64523
A
2605 }
2606
2607 int error = (*so->so_proto->pr_usrreqs->pru_send)
0a7de745 2608 (so, sendflags, top, addr, control, current_proc());
d9a64523
A
2609
2610 return error;
2611}
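/*
 * (Editor's note.)  The chain layout sosend_reinject() recovers when
 * control is not passed in explicitly: leading MT_CONTROL mbufs,
 * followed by the M_PKTHDR mbuf that starts the data:
 *
 *	MT_CONTROL -> ... -> MT_CONTROL -> M_PKTHDR -> data ...
 *	^control             ^control_end  ^top
 *
 * control_end->m_next is cut so control and data form two chains.
 */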
2612
3e170ce0
A
2613/*
 2614 * Supported only on connected sockets (no address) without
 2615 * ancillary data (control mbuf), for atomic protocols
2616 */
fe8ab488 2617int
3e170ce0 2618sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
fe8ab488
A
2619{
2620 struct mbuf *m, *freelist = NULL;
2621 user_ssize_t len, resid;
3e170ce0
A
2622 int error, dontroute, mlen;
2623 int atomic = sosendallatonce(so);
fe8ab488
A
2624 int sblocked = 0;
2625 struct proc *p = current_proc();
2626 u_int uiofirst = 0;
2627 u_int uiolast = 0;
3e170ce0
A
2628 struct mbuf *top = NULL;
2629 uint16_t headroom = 0;
2630 boolean_t bigcl;
fe8ab488
A
2631
2632 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2633 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2634
2635 if (so->so_type != SOCK_DGRAM) {
2636 error = EINVAL;
2637 goto out;
2638 }
2639 if (atomic == 0) {
2640 error = EINVAL;
2641 goto out;
2642 }
2643 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2644 error = EPROTONOSUPPORT;
2645 goto out;
2646 }
2647 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2648 error = EINVAL;
2649 goto out;
2650 }
3e170ce0 2651 resid = uio_array_resid(uioarray, uiocnt);
fe8ab488
A
2652
2653 /*
2654 * In theory resid should be unsigned.
2655 * However, space must be signed, as it might be less than 0
2656 * if we over-committed, and we must use a signed comparison
2657 * of space and resid. On the other hand, a negative resid
2658 * causes us to loop sending 0-length segments to the protocol.
2659 *
2660 * Note: We limit resid to be a positive int value as we use
2661 * imin() to set bytes_to_copy -- radr://14558484
2662 */
2663 if (resid < 0 || resid > INT_MAX) {
2664 error = EINVAL;
2665 goto out;
2666 }
fe8ab488
A
2667
2668 socket_lock(so, 1);
2669 so_update_last_owner_locked(so, p);
2670 so_update_policy(so);
3e170ce0 2671
fe8ab488 2672#if NECP
3e170ce0 2673 so_update_necp_policy(so, NULL, NULL);
fe8ab488 2674#endif /* NECP */
3e170ce0 2675
fe8ab488
A
2676 dontroute = (flags & MSG_DONTROUTE) &&
2677 (so->so_options & SO_DONTROUTE) == 0 &&
2678 (so->so_proto->pr_flags & PR_ATOMIC);
2679 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2680
3e170ce0
A
2681 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2682 &sblocked, NULL);
0a7de745 2683 if (error) {
fe8ab488 2684 goto release;
0a7de745 2685 }
fe8ab488 2686
3e170ce0
A
2687 /*
2688 * Use big 4 KB clusters when the outgoing interface does not prefer
2689 * 2 KB clusters
2690 */
2691 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2692
0a7de745 2693 if (soreserveheadroom != 0) {
3e170ce0 2694 headroom = so->so_pktheadroom;
0a7de745 2695 }
3e170ce0 2696
fe8ab488
A
2697 do {
2698 int i;
3e170ce0
A
2699 int num_needed = 0;
2700 int chainlength;
2701 size_t maxpktlen = 0;
2702 int bytes_to_alloc;
fe8ab488 2703
0a7de745 2704 if (sosendminchain > 0) {
3e170ce0 2705 chainlength = 0;
0a7de745 2706 } else {
3e170ce0 2707 chainlength = sosendmaxchain;
0a7de745 2708 }
fe8ab488 2709
3e170ce0 2710 socket_unlock(so, 0);
fe8ab488 2711
3e170ce0
A
2712 /*
 2713 * Find a set of uios that fit in a reasonable number
2714 * of mbuf packets
2715 */
2716 for (i = uiofirst; i < uiocnt; i++) {
2717 struct uio *auio = uioarray[i];
fe8ab488 2718
3e170ce0 2719 len = uio_resid(auio);
fe8ab488 2720
3e170ce0 2721 /* Do nothing for empty messages */
0a7de745 2722 if (len == 0) {
3e170ce0 2723 continue;
0a7de745 2724 }
fe8ab488 2725
3e170ce0
A
2726 num_needed += 1;
2727 uiolast += 1;
fe8ab488 2728
0a7de745 2729 if (len > maxpktlen) {
3e170ce0 2730 maxpktlen = len;
0a7de745 2731 }
fe8ab488 2732
3e170ce0 2733 chainlength += len;
0a7de745 2734 if (chainlength > sosendmaxchain) {
fe8ab488 2735 break;
0a7de745 2736 }
3e170ce0
A
2737 }
2738 /*
2739 * Nothing left to send
2740 */
2741 if (num_needed == 0) {
2742 socket_lock(so, 0);
2743 break;
2744 }
2745 /*
 2746 * Allocate a buffer large enough to include headroom space for
 2747 * network and link headers
39037602 2748 *
3e170ce0
A
2749 */
2750 bytes_to_alloc = maxpktlen + headroom;
2751
2752 /*
2753 * Allocate a single contiguous buffer of the smallest available
2754 * size when possible
2755 */
2756 if (bytes_to_alloc > MCLBYTES &&
2757 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2758 freelist = m_getpackets_internal(
0a7de745
A
2759 (unsigned int *)&num_needed,
2760 num_needed, M_WAIT, 1,
2761 MBIGCLBYTES);
3e170ce0
A
2762 } else if (bytes_to_alloc > _MHLEN &&
2763 bytes_to_alloc <= MCLBYTES) {
2764 freelist = m_getpackets_internal(
0a7de745
A
2765 (unsigned int *)&num_needed,
2766 num_needed, M_WAIT, 1,
2767 MCLBYTES);
3e170ce0 2768 } else {
fe8ab488 2769 freelist = m_allocpacket_internal(
0a7de745
A
2770 (unsigned int *)&num_needed,
2771 bytes_to_alloc, NULL, M_WAIT, 1, 0);
3e170ce0 2772 }
39037602 2773
3e170ce0
A
2774 if (freelist == NULL) {
2775 socket_lock(so, 0);
2776 error = ENOMEM;
2777 goto release;
2778 }
2779 /*
2780 * Copy each uio of the set into its own mbuf packet
2781 */
2782 for (i = uiofirst, m = freelist;
2783 i < uiolast && m != NULL;
2784 i++) {
2785 int bytes_to_copy;
2786 struct mbuf *n;
2787 struct uio *auio = uioarray[i];
fe8ab488 2788
3e170ce0
A
2789 bytes_to_copy = uio_resid(auio);
2790
2791 /* Do nothing for empty messages */
0a7de745 2792 if (bytes_to_copy == 0) {
3e170ce0 2793 continue;
0a7de745 2794 }
fe8ab488 2795 /*
3e170ce0
A
2796 * Leave headroom for protocol headers
2797 * in the first mbuf of the chain
fe8ab488 2798 */
3e170ce0
A
2799 m->m_data += headroom;
2800
2801 for (n = m; n != NULL; n = n->m_next) {
0a7de745 2802 if ((m->m_flags & M_EXT)) {
3e170ce0 2803 mlen = m->m_ext.ext_size -
d9a64523 2804 M_LEADINGSPACE(m);
0a7de745 2805 } else if ((m->m_flags & M_PKTHDR)) {
3e170ce0 2806 mlen =
d9a64523 2807 MHLEN - M_LEADINGSPACE(m);
0a7de745 2808 } else {
d9a64523 2809 mlen = MLEN - M_LEADINGSPACE(m);
0a7de745 2810 }
3e170ce0 2811 len = imin(mlen, bytes_to_copy);
fe8ab488 2812
3e170ce0
A
2813 /*
2814 * Note: uiomove() decrements the iovec
2815 * length
2816 */
2817 error = uiomove(mtod(n, caddr_t),
2818 len, auio);
0a7de745 2819 if (error != 0) {
fe8ab488 2820 break;
0a7de745 2821 }
3e170ce0
A
2822 n->m_len = len;
2823 m->m_pkthdr.len += len;
fe8ab488 2824
3e170ce0 2825 VERIFY(m->m_pkthdr.len <= maxpktlen);
fe8ab488 2826
3e170ce0
A
2827 bytes_to_copy -= len;
2828 resid -= len;
2829 }
2830 if (m->m_pkthdr.len == 0) {
2831 printf(
0a7de745
A
2832 "%s:%d so %llx pkt %llx type %u len null\n",
2833 __func__, __LINE__,
2834 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2835 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2836 m->m_type);
3e170ce0 2837 }
0a7de745 2838 if (error != 0) {
3e170ce0 2839 break;
0a7de745 2840 }
3e170ce0 2841 m = m->m_nextpkt;
fe8ab488
A
2842 }
2843
3e170ce0
A
2844 socket_lock(so, 0);
2845
0a7de745 2846 if (error) {
3e170ce0 2847 goto release;
0a7de745 2848 }
3e170ce0
A
2849 top = freelist;
2850 freelist = NULL;
2851
0a7de745 2852 if (dontroute) {
fe8ab488 2853 so->so_options |= SO_DONTROUTE;
0a7de745 2854 }
fe8ab488
A
2855
2856 if ((flags & MSG_SKIPCFIL) == 0) {
2857 struct mbuf **prevnextp = NULL;
3e170ce0 2858
fe8ab488
A
2859 for (i = uiofirst, m = top;
2860 i < uiolast && m != NULL;
2861 i++) {
2862 struct mbuf *nextpkt = m->m_nextpkt;
2863
2864 /*
2865 * Socket filter processing
2866 */
3e170ce0
A
2867 error = sflt_data_out(so, NULL, &m,
2868 NULL, 0);
0a7de745 2869 if (error != 0 && error != EJUSTRETURN) {
fe8ab488 2870 goto release;
0a7de745 2871 }
3e170ce0 2872
fe8ab488
A
2873#if CONTENT_FILTER
2874 if (error == 0) {
2875 /*
2876 * Content filter processing
2877 */
3e170ce0
A
2878 error = cfil_sock_data_out(so, NULL, m,
2879 NULL, 0);
0a7de745 2880 if (error != 0 && error != EJUSTRETURN) {
fe8ab488 2881 goto release;
0a7de745 2882 }
fe8ab488
A
2883 }
2884#endif /* CONTENT_FILTER */
2885 /*
2886 * Remove packet from the list when
2887 * swallowed by a filter
2888 */
2889 if (error == EJUSTRETURN) {
2890 error = 0;
0a7de745 2891 if (prevnextp != NULL) {
fe8ab488 2892 *prevnextp = nextpkt;
0a7de745 2893 } else {
fe8ab488 2894 top = nextpkt;
0a7de745 2895 }
3e170ce0
A
2896 }
2897
fe8ab488 2898 m = nextpkt;
0a7de745 2899 if (m != NULL) {
fe8ab488 2900 prevnextp = &m->m_nextpkt;
0a7de745 2901 }
fe8ab488
A
2902 }
2903 }
0a7de745 2904 if (top != NULL) {
fe8ab488 2905 error = (*so->so_proto->pr_usrreqs->pru_send_list)
3e170ce0 2906 (so, 0, top, NULL, NULL, p);
0a7de745 2907 }
fe8ab488 2908
0a7de745 2909 if (dontroute) {
fe8ab488 2910 so->so_options &= ~SO_DONTROUTE;
0a7de745 2911 }
fe8ab488 2912
fe8ab488
A
2913 top = NULL;
2914 uiofirst = uiolast;
2915 } while (resid > 0 && error == 0);
2916release:
0a7de745
A
2917 if (sblocked) {
2918 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2919 } else {
fe8ab488 2920 socket_unlock(so, 1);
0a7de745 2921 }
fe8ab488 2922out:
0a7de745 2923 if (top != NULL) {
fe8ab488 2924 m_freem(top);
0a7de745
A
2925 }
2926 if (freelist != NULL) {
fe8ab488 2927 m_freem_list(freelist);
0a7de745 2928 }
fe8ab488
A
2929
2930 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2931 so->so_snd.sb_cc, 0, error);
2932
0a7de745 2933 return error;
fe8ab488
A
2934}
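/*
 * Editor's sketch (illustrative, compiled out): a caller batching
 * datagrams through sosend_list().  Per the checks above, the socket
 * must be SOCK_DGRAM with an atomic protocol that implements
 * pru_send_list, already connected (no per-message address or
 * control mbufs), and flags are limited to MSG_DONTWAIT/MSG_NBIO.
 * The function name is invented.
 */
#if 0
static int
sosend_batch_example(struct socket *so, struct uio **uios, u_int cnt)
{
	return sosend_list(so, uios, cnt, MSG_DONTWAIT);
}
#endif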
2935
3e170ce0
A
2936/*
2937 * May return ERESTART when packet is dropped by MAC policy check
2938 */
2939static int
2940soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2941 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2942{
2943 int error = 0;
2944 struct mbuf *m = *mp;
2945 struct mbuf *nextrecord = *nextrecordp;
2946
2947 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2948#if CONFIG_MACF_SOCKET_SUBSET
2949 /*
2950 * Call the MAC framework for policy checking if we're in
2951 * the user process context and the socket isn't connected.
2952 */
2953 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2954 struct mbuf *m0 = m;
2955 /*
2956 * Dequeue this record (temporarily) from the receive
 2957 * list since we're about to drop the socket's lock,
 2958 * during which a new record may arrive and be appended to
2959 * the list. Upon MAC policy failure, the record
2960 * will be freed. Otherwise, we'll add it back to
2961 * the head of the list. We cannot rely on SB_LOCK
 2962 * because the append operation uses the socket's lock.
2963 */
2964 do {
2965 m->m_nextpkt = NULL;
2966 sbfree(&so->so_rcv, m);
2967 m = m->m_next;
2968 } while (m != NULL);
2969 m = m0;
2970 so->so_rcv.sb_mb = nextrecord;
2971 SB_EMPTY_FIXUP(&so->so_rcv);
2972 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2973 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2974 socket_unlock(so, 0);
2975
2976 if (mac_socket_check_received(proc_ucred(p), so,
2977 mtod(m, struct sockaddr *)) != 0) {
2978 /*
2979 * MAC policy failure; free this record and
2980 * process the next record (or block until
2981 * one is available). We have adjusted sb_cc
2982 * and sb_mbcnt above so there is no need to
2983 * call sbfree() again.
2984 */
2985 m_freem(m);
2986 /*
2987 * Clear SB_LOCK but don't unlock the socket.
2988 * Process the next record or wait for one.
2989 */
2990 socket_lock(so, 0);
2991 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2992 error = ERESTART;
2993 goto done;
2994 }
2995 socket_lock(so, 0);
2996 /*
2997 * If the socket has been defunct'd, drop it.
2998 */
2999 if (so->so_flags & SOF_DEFUNCT) {
3000 m_freem(m);
3001 error = ENOTCONN;
3002 goto done;
3003 }
3004 /*
3005 * Re-adjust the socket receive list and re-enqueue
3006 * the record in front of any packets which may have
3007 * been appended while we dropped the lock.
3008 */
0a7de745 3009 for (m = m0; m->m_next != NULL; m = m->m_next) {
3e170ce0 3010 sballoc(&so->so_rcv, m);
0a7de745 3011 }
3e170ce0
A
3012 sballoc(&so->so_rcv, m);
3013 if (so->so_rcv.sb_mb == NULL) {
3014 so->so_rcv.sb_lastrecord = m0;
3015 so->so_rcv.sb_mbtail = m;
3016 }
3017 m = m0;
3018 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3019 so->so_rcv.sb_mb = m;
3020 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3021 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3022 }
3023#endif /* CONFIG_MACF_SOCKET_SUBSET */
3024 if (psa != NULL) {
3025 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3026 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3027 error = EWOULDBLOCK;
3028 goto done;
3029 }
3030 }
3031 if (flags & MSG_PEEK) {
3032 m = m->m_next;
3033 } else {
3034 sbfree(&so->so_rcv, m);
3035 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3036 panic("%s: about to create invalid socketbuf",
3037 __func__);
3038 /* NOTREACHED */
3039 }
3040 MFREE(m, so->so_rcv.sb_mb);
3041 m = so->so_rcv.sb_mb;
3042 if (m != NULL) {
3043 m->m_nextpkt = nextrecord;
3044 } else {
3045 so->so_rcv.sb_mb = nextrecord;
3046 SB_EMPTY_FIXUP(&so->so_rcv);
3047 }
3048 }
3049done:
3050 *mp = m;
3051 *nextrecordp = nextrecord;
3052
0a7de745 3053 return error;
3e170ce0
A
3054}
3055
3056/*
3057 * Process one or more MT_CONTROL mbufs present before any data mbufs
3058 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3059 * just copy the data; if !MSG_PEEK, we call into the protocol to
3060 * perform externalization.
3061 */
3062static int
3063soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3064 struct mbuf **mp, struct mbuf **nextrecordp)
3065{
3066 int error = 0;
3067 struct mbuf *cm = NULL, *cmn;
3068 struct mbuf **cme = &cm;
3069 struct sockbuf *sb_rcv = &so->so_rcv;
3070 struct mbuf **msgpcm = NULL;
3071 struct mbuf *m = *mp;
3072 struct mbuf *nextrecord = *nextrecordp;
3073 struct protosw *pr = so->so_proto;
3074
3075 /*
3076 * Externalizing the control messages would require us to
3077 * drop the socket's lock below. Once we re-acquire the
3078 * lock, the mbuf chain might change. In order to preserve
3079 * consistency, we unlink all control messages from the
3080 * first mbuf chain in one shot and link them separately
3081 * onto a different chain.
3082 */
3083 do {
3084 if (flags & MSG_PEEK) {
3085 if (controlp != NULL) {
3086 if (*controlp == NULL) {
3087 msgpcm = controlp;
3088 }
3089 *controlp = m_copy(m, 0, m->m_len);
3090
3091 /*
3092 * If we failed to allocate an mbuf,
3093 * release any previously allocated
3094 * mbufs for control data. Return
3095 * an error. Keep the mbufs in the
3096 * socket as this is using
3097 * MSG_PEEK flag.
3098 */
3099 if (*controlp == NULL) {
3100 m_freem(*msgpcm);
3101 error = ENOBUFS;
3102 goto done;
3103 }
3104 controlp = &(*controlp)->m_next;
3105 }
3106 m = m->m_next;
3107 } else {
3108 m->m_nextpkt = NULL;
3109 sbfree(sb_rcv, m);
3110 sb_rcv->sb_mb = m->m_next;
3111 m->m_next = NULL;
3112 *cme = m;
3113 cme = &(*cme)->m_next;
3114 m = sb_rcv->sb_mb;
3115 }
3116 } while (m != NULL && m->m_type == MT_CONTROL);
3117
3118 if (!(flags & MSG_PEEK)) {
3119 if (sb_rcv->sb_mb != NULL) {
3120 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3121 } else {
3122 sb_rcv->sb_mb = nextrecord;
3123 SB_EMPTY_FIXUP(sb_rcv);
3124 }
0a7de745 3125 if (nextrecord == NULL) {
3e170ce0 3126 sb_rcv->sb_lastrecord = m;
0a7de745 3127 }
3e170ce0
A
3128 }
3129
3130 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3131 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3132
3133 while (cm != NULL) {
3134 int cmsg_type;
3135
3136 cmn = cm->m_next;
3137 cm->m_next = NULL;
3138 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3139
3140 /*
3141 * Call the protocol to externalize SCM_RIGHTS message
3142 * and return the modified message to the caller upon
3143 * success. Otherwise, all other control messages are
3144 * returned unmodified to the caller. Note that we
3145 * only get into this loop if MSG_PEEK is not set.
3146 */
3147 if (pr->pr_domain->dom_externalize != NULL &&
3148 cmsg_type == SCM_RIGHTS) {
3149 /*
3150 * Release socket lock: see 3903171. This
3151 * would also allow more records to be appended
3152 * to the socket buffer. We still have SB_LOCK
3153 * set on it, so we can be sure that the head
3154 * of the mbuf chain won't change.
3155 */
3156 socket_unlock(so, 0);
3157 error = (*pr->pr_domain->dom_externalize)(cm);
3158 socket_lock(so, 0);
3159 } else {
3160 error = 0;
3161 }
3162
3163 if (controlp != NULL && error == 0) {
3164 *controlp = cm;
3165 controlp = &(*controlp)->m_next;
3166 } else {
3167 (void) m_free(cm);
3168 }
3169 cm = cmn;
3170 }
3171 /*
3172 * Update the value of nextrecord in case we received new
3173 * records when the socket was unlocked above for
3174 * externalizing SCM_RIGHTS.
3175 */
0a7de745 3176 if (m != NULL) {
3e170ce0 3177 nextrecord = sb_rcv->sb_mb->m_nextpkt;
0a7de745 3178 } else {
3e170ce0 3179 nextrecord = sb_rcv->sb_mb;
0a7de745 3180 }
3e170ce0
A
3181
3182done:
3183 *mp = m;
3184 *nextrecordp = nextrecord;
3185
0a7de745 3186 return error;
3e170ce0
A
3187}
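/*
 * Editor's sketch (illustrative, compiled out): after the SCM_RIGHTS
 * externalization above, the control mbuf handed back through
 * *controlp carries a cmsghdr whose payload is now an array of file
 * descriptors valid in the receiving process:
 */
#if 0
	struct cmsghdr *cmsg = mtod(*controlp, struct cmsghdr *);

	if (cmsg->cmsg_level == SOL_SOCKET &&
	    cmsg->cmsg_type == SCM_RIGHTS) {
		int *fds = (int *)(void *)CMSG_DATA(cmsg);
		/* fds[0], fds[1], ... index the receiver's fd table */
	}
#endif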
3188
1c79356b
A
3189/*
3190 * Implement receive operations on a socket.
3191 * We depend on the way that records are added to the sockbuf
3192 * by sbappend*. In particular, each record (mbufs linked through m_next)
3193 * must begin with an address if the protocol so specifies,
3194 * followed by an optional mbuf or mbufs containing ancillary data,
3195 * and then zero or more mbufs of data.
3196 * In order to avoid blocking network interrupts for the entire time here,
3197 * we splx() while doing the actual copy to user space.
3198 * Although the sockbuf is locked, new data may still be appended,
3199 * and thus we must maintain consistency of the sockbuf during that time.
3200 *
3201 * The caller may receive the data as a single mbuf chain by supplying
3202 * an mbuf **mp0 for use in returning the chain. The uio is then used
3203 * only for the count in uio_resid.
2d21ac55
A
3204 *
3205 * Returns: 0 Success
3206 * ENOBUFS
3207 * ENOTCONN
3208 * EWOULDBLOCK
3209 * uiomove:EFAULT
3210 * sblock:EWOULDBLOCK
3211 * sblock:EINTR
3212 * sbwait:EBADF
3213 * sbwait:EINTR
3214 * sodelayed_copy:EFAULT
3215 * <pru_rcvoob>:EINVAL[TCP]
3216 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3217 * <pru_rcvoob>:???
3218 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3219 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3220 * <pr_domain->dom_externalize>:???
3221 *
3222 * Notes: Additional return values from calls through <pru_rcvoob> and
3223 * <pr_domain->dom_externalize> depend on protocols other than
3224 * TCP or AF_UNIX, which are documented above.
1c79356b
A
3225 */
3226int
2d21ac55
A
3227soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3228 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1c79356b 3229{
39236c6e
A
3230 struct mbuf *m, **mp, *ml = NULL;
3231 struct mbuf *nextrecord, *free_list;
3232 int flags, error, offset;
3233 user_ssize_t len;
1c79356b 3234 struct protosw *pr = so->so_proto;
3e170ce0 3235 int moff, type = 0;
39236c6e
A
3236 user_ssize_t orig_resid = uio_resid(uio);
3237 user_ssize_t delayed_copy_len;
55e303ae
A
3238 int can_delay;
3239 int need_event;
3240 struct proc *p = current_proc();
3e170ce0 3241 boolean_t en_tracing = FALSE;
1c79356b 3242
fe8ab488
A
3243 /*
3244 * Sanity check on the length passed by caller as we are making 'int'
3245 * comparisons
3246 */
0a7de745
A
3247 if (orig_resid < 0 || orig_resid > INT_MAX) {
3248 return EINVAL;
3249 }
fe8ab488 3250
3e170ce0
A
3251 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3252 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3253 so->so_rcv.sb_hiwat);
3254
91447636 3255 socket_lock(so, 1);
6d2010ae 3256 so_update_last_owner_locked(so, p);
39236c6e 3257 so_update_policy(so);
1c79356b 3258
91447636 3259#ifdef MORE_LOCKING_DEBUG
39236c6e
A
3260 if (so->so_usecount == 1) {
3261 panic("%s: so=%x no other reference on socket\n", __func__, so);
3262 /* NOTREACHED */
3263 }
91447636 3264#endif
1c79356b 3265 mp = mp0;
0a7de745 3266 if (psa != NULL) {
39236c6e 3267 *psa = NULL;
0a7de745
A
3268 }
3269 if (controlp != NULL) {
39236c6e 3270 *controlp = NULL;
0a7de745
A
3271 }
3272 if (flagsp != NULL) {
3273 flags = *flagsp & ~MSG_EOR;
3274 } else {
1c79356b 3275 flags = 0;
0a7de745 3276 }
2d21ac55
A
3277
3278 /*
3279 * If a recv attempt is made on a previously-accepted socket
3280 * that has been marked as inactive (disconnected), reject
3281 * the request.
3282 */
3283 if (so->so_flags & SOF_DEFUNCT) {
3284 struct sockbuf *sb = &so->so_rcv;
3285
6d2010ae 3286 error = ENOTCONN;
39037602
A
3287 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3288 __func__, proc_pid(p), proc_best_name(p),
3289 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3290 SOCK_DOM(so), SOCK_TYPE(so), error);
2d21ac55
A
3291 /*
3292 * This socket should have been disconnected and flushed
6d2010ae
A
3293 * prior to being returned from sodefunct(); there should
3294 * be no data on its receive list, so panic otherwise.
2d21ac55 3295 */
0a7de745 3296 if (so->so_state & SS_DEFUNCT) {
6d2010ae 3297 sb_empty_assert(sb, __func__);
0a7de745 3298 }
2d21ac55 3299 socket_unlock(so, 1);
0a7de745 3300 return error;
2d21ac55
A
3301 }
3302
3e170ce0
A
3303 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3304 pr->pr_usrreqs->pru_preconnect) {
3305 /*
 3306 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
 3307 * call write() right after this. *If* the app calls a read,
 3308 * we do not want to block this read indefinitely. Thus,
3309 * we trigger a connect so that the session gets initiated.
3310 */
3311 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3312
3313 if (error) {
3314 socket_unlock(so, 1);
0a7de745 3315 return error;
3e170ce0
A
3316 }
3317 }
3318
3319 if (ENTR_SHOULDTRACE &&
3320 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3321 /*
3322 * enable energy tracing for inet sockets that go over
3323 * non-loopback interfaces only.
3324 */
3325 struct inpcb *inp = sotoinpcb(so);
3326 if (inp->inp_last_outifp != NULL &&
3327 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3328 en_tracing = TRUE;
3329 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3330 VM_KERNEL_ADDRPERM(so),
3331 ((so->so_state & SS_NBIO) ?
3332 kEnTrFlagNonBlocking : 0),
3333 (int64_t)orig_resid);
3334 }
3335 }
3336
2d21ac55
A
3337 /*
3338 * When SO_WANTOOBFLAG is set we try to get out-of-band data
 3340 * regardless of the flags argument. Here is the case where
3340 * out-of-band data is not inline.
3341 */
3342 if ((flags & MSG_OOB) ||
3343 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3344 (so->so_options & SO_OOBINLINE) == 0 &&
3345 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1c79356b 3346 m = m_get(M_WAIT, MT_DATA);
55e303ae 3347 if (m == NULL) {
91447636 3348 socket_unlock(so, 1);
2d21ac55
A
3349 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3350 ENOBUFS, 0, 0, 0, 0);
0a7de745 3351 return ENOBUFS;
55e303ae 3352 }
1c79356b 3353 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
0a7de745 3354 if (error) {
1c79356b 3355 goto bad;
0a7de745 3356 }
91447636 3357 socket_unlock(so, 0);
1c79356b
A
3358 do {
3359 error = uiomove(mtod(m, caddr_t),
b0d623f7 3360 imin(uio_resid(uio), m->m_len), uio);
1c79356b 3361 m = m_free(m);
39236c6e 3362 } while (uio_resid(uio) && error == 0 && m != NULL);
91447636 3363 socket_lock(so, 0);
1c79356b 3364bad:
0a7de745 3365 if (m != NULL) {
1c79356b 3366 m_freem(m);
0a7de745 3367 }
39236c6e 3368
9bccf70c
A
3369 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3370 if (error == EWOULDBLOCK || error == EINVAL) {
2d21ac55 3371 /*
9bccf70c 3372 * Let's try to get normal data:
2d21ac55
A
3373 * EWOULDBLOCK: out-of-band data not
3374 * receive yet. EINVAL: out-of-band data
3375 * already read.
9bccf70c
A
3376 */
3377 error = 0;
3378 goto nooob;
39236c6e 3379 } else if (error == 0 && flagsp != NULL) {
9bccf70c 3380 *flagsp |= MSG_OOB;
2d21ac55
A
3381 }
3382 }
91447636 3383 socket_unlock(so, 1);
3e170ce0
A
3384 if (en_tracing) {
3385 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3386 VM_KERNEL_ADDRPERM(so), 0,
3387 (int64_t)(orig_resid - uio_resid(uio)));
3388 }
2d21ac55
A
3389 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3390 0, 0, 0, 0);
39236c6e 3391
0a7de745 3392 return error;
1c79356b
A
3393 }
3394nooob:
0a7de745 3395 if (mp != NULL) {
39236c6e 3396 *mp = NULL;
0a7de745 3397 }
fe8ab488
A
3398
3399 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
1c79356b 3400 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
fe8ab488 3401 }
1c79356b 3402
39236c6e 3403 free_list = NULL;
55e303ae 3404 delayed_copy_len = 0;
1c79356b 3405restart:
91447636 3406#ifdef MORE_LOCKING_DEBUG
0a7de745 3407 if (so->so_usecount <= 1) {
fe8ab488 3408 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3e170ce0 3409 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
0a7de745 3410 }
91447636 3411#endif
6601e61a
A
3412 /*
3413 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3414 * and if so just return to the caller. This could happen when
3415 * soreceive() is called by a socket upcall function during the
3416 * time the socket is freed. The socket buffer would have been
3417 * locked across the upcall, therefore we cannot put this thread
3418 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3419 * we may livelock), because the lock on the socket buffer will
3420 * only be released when the upcall routine returns to its caller.
3421 * Because the socket has been officially closed, there can be
3422 * no further read on it.
39236c6e
A
3423 *
3424 * A multipath subflow socket would have its SS_NOFDREF set by
3425 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3426 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
6601e61a
A
3427 */
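	/*
	 * Hedged sketch of the upcall scenario described above, using
	 * the public socket KPI (the callback name is hypothetical):
	 *
	 *	static void my_upcall(socket_t so2, void *cookie, int waitf)
	 *	{
	 *		// a soreceive() issued here runs with the receive
	 *		// buffer lock already held across the upcall
	 *	}
	 *	sock_setupcall(so2, my_upcall, NULL);
	 */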
3428 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
39236c6e 3429 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
6601e61a 3430 socket_unlock(so, 1);
0a7de745 3431 return 0;
6601e61a
A
3432 }
3433
9bccf70c
A
3434 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3435 if (error) {
91447636 3436 socket_unlock(so, 1);
2d21ac55
A
3437 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3438 0, 0, 0, 0);
3e170ce0
A
3439 if (en_tracing) {
3440 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3441 VM_KERNEL_ADDRPERM(so), 0,
3442 (int64_t)(orig_resid - uio_resid(uio)));
3443 }
0a7de745 3444 return error;
1c79356b 3445 }
1c79356b
A
3446
3447 m = so->so_rcv.sb_mb;
3448 /*
3449 * If we have less data than requested, block awaiting more
3450 * (subject to any timeout) if:
3451 * 1. the current count is less than the low water mark, or
3452 * 2. MSG_WAITALL is set, and it is possible to do the entire
3453 * receive operation at once if we block (resid <= hiwat).
3454 * 3. MSG_DONTWAIT is not set (required in addition to 1 or 2).
3455 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3456 * we have to do the receive in sections, and thus risk returning
3457 * a short count if a timeout or signal occurs after we start.
3458 */
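	/*
	 * Worked example of the rule above (values hypothetical):
	 * with sb_cc = 100 and sb_lowat = 1,
	 *
	 *	n = recv(fd, buf, 512, 0);           // returns the 100 bytes
	 *	n = recv(fd, buf, 512, MSG_WAITALL); // blocks for all 512,
	 *	                                     // given 512 <= sb_hiwat
	 */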
39236c6e 3459 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
91447636 3460 so->so_rcv.sb_cc < uio_resid(uio)) &&
2d21ac55 3461 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
91447636 3462 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
39236c6e 3463 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2d21ac55
A
3464 /*
3465 * Panic if we notice inconsistencies in the socket's
3466 * receive list; both sb_mb and sb_cc should correctly
3467 * reflect the contents of the list, otherwise we may
3468 * end up with false positives during select() or poll()
3469 * which could put the application in a bad state.
3470 */
316670eb 3471 SB_MB_CHECK(&so->so_rcv);
55e303ae 3472
1c79356b 3473 if (so->so_error) {
0a7de745 3474 if (m != NULL) {
1c79356b 3475 goto dontblock;
0a7de745 3476 }
1c79356b 3477 error = so->so_error;
0a7de745 3478 if ((flags & MSG_PEEK) == 0) {
1c79356b 3479 so->so_error = 0;
0a7de745 3480 }
1c79356b
A
3481 goto release;
3482 }
3483 if (so->so_state & SS_CANTRCVMORE) {
fe8ab488
A
3484#if CONTENT_FILTER
3485 /*
3486 * Deal with half closed connections
3487 */
3488 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
0a7de745 3489 cfil_sock_data_pending(&so->so_rcv) != 0) {
fe8ab488 3490 CFIL_LOG(LOG_INFO,
0a7de745
A
3491 "so %llx ignore SS_CANTRCVMORE",
3492 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3493 } else
fe8ab488 3494#endif /* CONTENT_FILTER */
0a7de745 3495 if (m != NULL) {
1c79356b 3496 goto dontblock;
0a7de745 3497 } else {
1c79356b 3498 goto release;
0a7de745 3499 }
1c79356b 3500 }
0a7de745 3501 for (; m != NULL; m = m->m_next) {
2d21ac55 3502 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1c79356b
A
3503 m = so->so_rcv.sb_mb;
3504 goto dontblock;
3505 }
0a7de745
A
3506 }
3507 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
1c79356b
A
3508 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3509 error = ENOTCONN;
3510 goto release;
3511 }
0a7de745 3512 if (uio_resid(uio) == 0) {
1c79356b 3513 goto release;
0a7de745 3514 }
3e170ce0 3515
2d21ac55 3516 if ((so->so_state & SS_NBIO) ||
0a7de745 3517 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
1c79356b
A
3518 error = EWOULDBLOCK;
3519 goto release;
3520 }
2d21ac55
A
3521 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3522 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
0a7de745 3523 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
2d21ac55 3524#if EVEN_MORE_LOCKING_DEBUG
0a7de745 3525 if (socket_debug) {
2d21ac55 3526 printf("Waiting for socket data\n");
0a7de745 3527 }
91447636 3528#endif
55e303ae 3529
1c79356b 3530 error = sbwait(&so->so_rcv);
2d21ac55 3531#if EVEN_MORE_LOCKING_DEBUG
0a7de745 3532 if (socket_debug) {
2d21ac55 3533 printf("SORECEIVE - sbwait returned %d\n", error);
0a7de745 3534 }
91447636 3535#endif
39236c6e
A
3536 if (so->so_usecount < 1) {
3537 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3538 __func__, so, so->so_usecount);
3539 /* NOTREACHED */
3540 }
9bccf70c 3541 if (error) {
91447636 3542 socket_unlock(so, 1);
2d21ac55
A
3543 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3544 0, 0, 0, 0);
3e170ce0
A
3545 if (en_tracing) {
3546 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3547 VM_KERNEL_ADDRPERM(so), 0,
3548 (int64_t)(orig_resid - uio_resid(uio)));
3549 }
0a7de745 3550 return error;
1c79356b
A
3551 }
3552 goto restart;
3553 }
3554dontblock:
b0d623f7 3555 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2d21ac55
A
3556 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3557 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
1c79356b 3558 nextrecord = m->m_nextpkt;
fe8ab488 3559
3e170ce0
A
3560 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3561 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3562 mp0 == NULL);
0a7de745 3563 if (error == ERESTART) {
3e170ce0 3564 goto restart;
0a7de745 3565 } else if (error != 0) {
3e170ce0 3566 goto release;
0a7de745 3567 }
1c79356b 3568 orig_resid = 0;
1c79356b 3569 }
2d21ac55
A
3570
3571 /*
3572 * Process one or more MT_CONTROL mbufs present before any data mbufs
3573 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3574 * just copy the data; if !MSG_PEEK, we call into the protocol to
3575 * perform externalization.
3576 */
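	/*
	 * Userland view (sketch): these MT_CONTROL mbufs surface as
	 * ancillary cmsghdr records, e.g. with SO_TIMESTAMP enabled:
	 *
	 *	char cbuf[256];
	 *	struct msghdr msg = { 0 };
	 *	msg.msg_control = cbuf;
	 *	msg.msg_controllen = sizeof(cbuf);
	 *	recvmsg(fd, &msg, 0);
	 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
	 *	// cm->cmsg_type == SCM_TIMESTAMP for the timestamp record
	 */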
3577 if (m != NULL && m->m_type == MT_CONTROL) {
3e170ce0 3578 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
0a7de745 3579 if (error != 0) {
3e170ce0 3580 goto release;
0a7de745 3581 }
316670eb 3582 orig_resid = 0;
1c79356b 3583 }
2d21ac55 3584
39236c6e
A
3585 /*
3586 * If the socket is a TCP socket with message delivery
3587 * enabled, then create a control msg to deliver the
3588 * relative TCP sequence number for this data. Waiting
3589 * until this point will protect against failures to
3590 * allocate an mbuf for control msgs.
3591 */
3592 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3593 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3594 struct mbuf *seq_cm;
3595
3596 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
0a7de745 3597 sizeof(uint32_t), SCM_SEQNUM, SOL_SOCKET);
39236c6e
A
3598 if (seq_cm == NULL) {
3599 /* unable to allocate a control mbuf */
3600 error = ENOBUFS;
3601 goto release;
3602 }
3603 *controlp = seq_cm;
3604 controlp = &seq_cm->m_next;
3605 }
3606
2d21ac55
A
3607 if (m != NULL) {
3608 if (!(flags & MSG_PEEK)) {
3609 /*
3610 * We get here because m points to an mbuf following
3611 * any MT_SONAME or MT_CONTROL mbufs which have been
3612 * processed above. In any case, m should be pointing
3613 * to the head of the mbuf chain, and the nextrecord
3614 * should be either NULL or equal to m->m_nextpkt.
3615 * See comments above about SB_LOCK.
3616 */
39236c6e
A
3617 if (m != so->so_rcv.sb_mb ||
3618 m->m_nextpkt != nextrecord) {
3619 panic("%s: post-control !sync so=%p m=%p "
3620 "nextrecord=%p\n", __func__, so, m,
3621 nextrecord);
3622 /* NOTREACHED */
3623 }
0a7de745 3624 if (nextrecord == NULL) {
2d21ac55 3625 so->so_rcv.sb_lastrecord = m;
0a7de745 3626 }
2d21ac55 3627 }
1c79356b 3628 type = m->m_type;
0a7de745 3629 if (type == MT_OOBDATA) {
1c79356b 3630 flags |= MSG_OOB;
0a7de745 3631 }
2d21ac55
A
3632 } else {
3633 if (!(flags & MSG_PEEK)) {
2d21ac55
A
3634 SB_EMPTY_FIXUP(&so->so_rcv);
3635 }
1c79356b 3636 }
2d21ac55
A
3637 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3638 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3639
1c79356b
A
3640 moff = 0;
3641 offset = 0;
fa4905b1 3642
0a7de745 3643 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
2d21ac55 3644 can_delay = 1;
0a7de745 3645 } else {
2d21ac55 3646 can_delay = 0;
0a7de745 3647 }
55e303ae
A
3648
3649 need_event = 0;
fa4905b1 3650
39236c6e
A
3651 while (m != NULL &&
3652 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
1c79356b 3653 if (m->m_type == MT_OOBDATA) {
0a7de745 3654 if (type != MT_OOBDATA) {
1c79356b 3655 break;
0a7de745 3656 }
2d21ac55 3657 } else if (type == MT_OOBDATA) {
1c79356b 3658 break;
2d21ac55 3659 }
9bccf70c 3660 /*
2d21ac55 3661 * Make sure to always set the MSG_OOB flag when getting
9bccf70c
A
3662 * out of band data inline.
3663 */
1c79356b 3664 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2d21ac55
A
3665 (so->so_options & SO_OOBINLINE) != 0 &&
3666 (so->so_state & SS_RCVATMARK) != 0) {
9bccf70c
A
3667 flags |= MSG_OOB;
3668 }
1c79356b 3669 so->so_state &= ~SS_RCVATMARK;
91447636 3670 len = uio_resid(uio) - delayed_copy_len;
0a7de745 3671 if (so->so_oobmark && len > so->so_oobmark - offset) {
1c79356b 3672 len = so->so_oobmark - offset;
0a7de745
A
3673 }
3674 if (len > m->m_len - moff) {
1c79356b 3675 len = m->m_len - moff;
0a7de745 3676 }
1c79356b
A
3677 /*
3678 * If mp is set, just pass back the mbufs.
3679 * Otherwise copy them out via the uio, then free.
3680 * Sockbuf must be consistent here (sb_mb points to the current
3681 * mbuf, m_nextpkt to the next record) when we drop priority;
3682 * we must note any additions to the sockbuf when we
3683 * block interrupts again.
3684 */
39236c6e 3685 if (mp == NULL) {
2d21ac55
A
3686 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3687 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
55e303ae 3688 if (can_delay && len == m->m_len) {
2d21ac55 3689 /*
55e303ae
A
3690 * only delay the copy if we're consuming the
3691 * mbuf and we're NOT in MSG_PEEK mode
3692 * and we have enough data to make it worthwhile
2d21ac55
A
3693 * to drop and retake the lock... can_delay
3694 * reflects the state of the 2 latter
3695 * constraints; moff should always be zero
3696 * in these cases
55e303ae 3697 */
2d21ac55 3698 delayed_copy_len += len;
55e303ae 3699 } else {
2d21ac55
A
3700 if (delayed_copy_len) {
3701 error = sodelayed_copy(so, uio,
3702 &free_list, &delayed_copy_len);
55e303ae
A
3703
3704 if (error) {
55e303ae
A
3705 goto release;
3706 }
2d21ac55
A
3707 /*
3708 * can only get here if MSG_PEEK is not
3709 * set; therefore, m should point at the
3710 * head of the rcv queue; if it doesn't,
3711 * it means something drastically
3712 * changed while we were out from behind
3713 * the lock in sodelayed_copy. perhaps
3714 * a RST on the stream. in any event,
3715 * the stream has been interrupted. it's
3716 * probably best just to return whatever
3717 * data we've moved and let the caller
3718 * sort it out...
3719 */
55e303ae 3720 if (m != so->so_rcv.sb_mb) {
2d21ac55 3721 break;
55e303ae
A
3722 }
3723 }
91447636 3724 socket_unlock(so, 0);
2d21ac55
A
3725 error = uiomove(mtod(m, caddr_t) + moff,
3726 (int)len, uio);
91447636 3727 socket_lock(so, 0);
55e303ae 3728
0a7de745 3729 if (error) {
2d21ac55 3730 goto release;
0a7de745 3731 }
55e303ae 3732 }
2d21ac55 3733 } else {
91447636 3734 uio_setresid(uio, (uio_resid(uio) - len));
2d21ac55 3735 }
1c79356b 3736 if (len == m->m_len - moff) {
0a7de745 3737 if (m->m_flags & M_EOR) {
1c79356b 3738 flags |= MSG_EOR;
0a7de745 3739 }
1c79356b
A
3740 if (flags & MSG_PEEK) {
3741 m = m->m_next;
3742 moff = 0;
3743 } else {
3744 nextrecord = m->m_nextpkt;
3745 sbfree(&so->so_rcv, m);
91447636 3746 m->m_nextpkt = NULL;
55e303ae 3747
39236c6e
A
3748 /*
3749 * If this packet is an unordered packet
3750 * (indicated by M_UNORDERED_DATA flag), remove
3751 * the additional bytes added to the
3752 * receive socket buffer size.
3753 */
3754 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3755 m->m_len &&
3756 (m->m_flags & M_UNORDERED_DATA) &&
3757 sbreserve(&so->so_rcv,
3758 so->so_rcv.sb_hiwat - m->m_len)) {
3759 if (so->so_msg_state->msg_uno_bytes >
3760 m->m_len) {
3761 so->so_msg_state->
0a7de745 3762 msg_uno_bytes -= m->m_len;
39236c6e
A
3763 } else {
3764 so->so_msg_state->
0a7de745 3765 msg_uno_bytes = 0;
39236c6e
A
3766 }
3767 m->m_flags &= ~M_UNORDERED_DATA;
3768 }
3769
3770 if (mp != NULL) {
1c79356b
A
3771 *mp = m;
3772 mp = &m->m_next;
3773 so->so_rcv.sb_mb = m = m->m_next;
39236c6e 3774 *mp = NULL;
1c79356b 3775 } else {
0a7de745 3776 if (free_list == NULL) {
2d21ac55 3777 free_list = m;
0a7de745 3778 } else {
2d21ac55 3779 ml->m_next = m;
0a7de745 3780 }
2d21ac55 3781 ml = m;
14353aa8 3782 so->so_rcv.sb_mb = m = m->m_next;
39236c6e 3783 ml->m_next = NULL;
1c79356b 3784 }
2d21ac55 3785 if (m != NULL) {
1c79356b 3786 m->m_nextpkt = nextrecord;
0a7de745 3787 if (nextrecord == NULL) {
2d21ac55 3788 so->so_rcv.sb_lastrecord = m;
0a7de745 3789 }
2d21ac55
A
3790 } else {
3791 so->so_rcv.sb_mb = nextrecord;
3792 SB_EMPTY_FIXUP(&so->so_rcv);
3793 }
3794 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3795 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
1c79356b
A
3796 }
3797 } else {
2d21ac55 3798 if (flags & MSG_PEEK) {
1c79356b 3799 moff += len;
2d21ac55 3800 } else {
6d2010ae
A
3801 if (mp != NULL) {
3802 int copy_flag;
3803
0a7de745 3804 if (flags & MSG_DONTWAIT) {
6d2010ae 3805 copy_flag = M_DONTWAIT;
0a7de745 3806 } else {
6d2010ae 3807 copy_flag = M_WAIT;
0a7de745 3808 }
6d2010ae 3809 *mp = m_copym(m, 0, len, copy_flag);
39236c6e
A
3810 /*
3811 * Failed to allocate an mbuf?
3812 * Adjust uio_resid back, it was
3813 * adjusted down by len bytes which
3814 * we didn't copy over.
3815 */
6d2010ae 3816 if (*mp == NULL) {
39236c6e
A
3817 uio_setresid(uio,
3818 (uio_resid(uio) + len));
6d2010ae
A
3819 break;
3820 }
3821 }
1c79356b
A
3822 m->m_data += len;
3823 m->m_len -= len;
3824 so->so_rcv.sb_cc -= len;
3825 }
3826 }
3827 if (so->so_oobmark) {
3828 if ((flags & MSG_PEEK) == 0) {
3829 so->so_oobmark -= len;
3830 if (so->so_oobmark == 0) {
2d21ac55
A
3831 so->so_state |= SS_RCVATMARK;
3832 /*
3833 * delay posting the actual event until
3834 * after any delayed copy processing
3835 * has finished
3836 */
3837 need_event = 1;
3838 break;
1c79356b
A
3839 }
3840 } else {
3841 offset += len;
0a7de745 3842 if (offset == so->so_oobmark) {
1c79356b 3843 break;
0a7de745 3844 }
1c79356b
A
3845 }
3846 }
0a7de745 3847 if (flags & MSG_EOR) {
1c79356b 3848 break;
0a7de745 3849 }
1c79356b 3850 /*
2d21ac55
A
3851 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3852 * (for non-atomic socket), we must not quit until
3853 * "uio->uio_resid == 0" or an error termination.
3854 * If a signal/timeout occurs, return with a short
3855 * count but without error. Keep sockbuf locked
3856 * against other readers.
1c79356b 3857 */
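	/*
	 * Userland illustration of the contract above (sketch):
	 *
	 *	char buf[4096];
	 *	ssize_t n = recv(fd, buf, sizeof(buf), MSG_WAITALL);
	 *
	 * n is short of 4096 only on EOF, error, a signal or a timeout;
	 * otherwise the loop below keeps sleeping in sbwait() until the
	 * residual count is satisfied.
	 */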
0a7de745 3858 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
2d21ac55 3859 (uio_resid(uio) - delayed_copy_len) > 0 &&
1c79356b 3860 !sosendallatonce(so) && !nextrecord) {
fe8ab488
A
3861 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3862#if CONTENT_FILTER
3863 && cfil_sock_data_pending(&so->so_rcv) == 0
3864#endif /* CONTENT_FILTER */
0a7de745 3865 )) {
2d21ac55 3866 goto release;
0a7de745 3867 }
fa4905b1 3868
2d21ac55
A
3869 /*
3870 * Depending on the protocol (e.g. TCP), the following
3871 * might cause the socket lock to be dropped and later
3872 * be reacquired, and more data could have arrived and
3873 * have been appended to the receive socket buffer by
3874 * the time it returns. Therefore, we only sleep in
3875 * sbwait() below if and only if the socket buffer is
3876 * empty, in order to avoid a false sleep.
3877 */
3878 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3879 (((struct inpcb *)so->so_pcb)->inp_state !=
0a7de745 3880 INPCB_STATE_DEAD)) {
2d21ac55 3881 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
0a7de745 3882 }
2d21ac55
A
3883
3884 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3885 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3886
3887 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3888 error = 0;
55e303ae 3889 goto release;
fa4905b1 3890 }
55e303ae 3891 /*
2d21ac55
A
3892 * have to wait until after we get back from the sbwait
3893 * to do the copy because we will drop the lock if we
3894 * have enough data that has been delayed... by dropping
3895 * the lock we open up a window allowing the netisr
3896 * thread to process the incoming packets and to change
3897 * the state of this socket... we're issuing the sbwait
3898 * because the socket is empty and we're expecting the
3899 * netisr thread to wake us up when more packets arrive;
3900 * if we allow that processing to happen and then sbwait
3901 * we could stall forever with packets sitting in the
3902 * socket if no further packets arrive from the remote
3903 * side.
55e303ae 3904 *
2d21ac55
A
3905 * we want to copy before we've collected all the data
3906 * to satisfy this request to allow the copy to overlap
3907 * the incoming packet processing on an MP system
55e303ae 3908 */
2d21ac55
A
3909 if (delayed_copy_len > sorecvmincopy &&
3910 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3911 error = sodelayed_copy(so, uio,
3912 &free_list, &delayed_copy_len);
55e303ae 3913
0a7de745 3914 if (error) {
2d21ac55 3915 goto release;
0a7de745 3916 }
1c79356b
A
3917 }
3918 m = so->so_rcv.sb_mb;
39236c6e 3919 if (m != NULL) {
1c79356b 3920 nextrecord = m->m_nextpkt;
fa4905b1 3921 }
316670eb 3922 SB_MB_CHECK(&so->so_rcv);
1c79356b
A
3923 }
3924 }
91447636 3925#ifdef MORE_LOCKING_DEBUG
39236c6e
A
3926 if (so->so_usecount <= 1) {
3927 panic("%s: after big while so=%p ref=%d on socket\n",
3928 __func__, so, so->so_usecount);
3929 /* NOTREACHED */
3930 }
91447636 3931#endif
1c79356b 3932
39236c6e 3933 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2d21ac55 3934 if (so->so_options & SO_DONTTRUNC) {
1c79356b 3935 flags |= MSG_RCVMORE;
2d21ac55 3936 } else {
9bccf70c 3937 flags |= MSG_TRUNC;
0a7de745 3938 if ((flags & MSG_PEEK) == 0) {
1c79356b 3939 (void) sbdroprecord(&so->so_rcv);
0a7de745 3940 }
1c79356b
A
3941 }
3942 }
2d21ac55
A
3943
3944 /*
3945 * pru_rcvd below (for TCP) may cause more data to be received
3946 * if the socket lock is dropped prior to sending the ACK; some
3947 * legacy OpenTransport applications don't handle this well
3948 * (if it receives less data than requested while MSG_HAVEMORE
3949 * is set), and so we set the flag now based on what we know
3950 * prior to calling pru_rcvd.
3951 */
0a7de745 3952 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
2d21ac55 3953 flags |= MSG_HAVEMORE;
0a7de745 3954 }
2d21ac55 3955
1c79356b 3956 if ((flags & MSG_PEEK) == 0) {
39236c6e 3957 if (m == NULL) {
1c79356b 3958 so->so_rcv.sb_mb = nextrecord;
2d21ac55
A
3959 /*
3960 * First part is an inline SB_EMPTY_FIXUP(). Second
3961 * part makes sure sb_lastrecord is up-to-date if
3962 * there is still data in the socket buffer.
3963 */
3964 if (so->so_rcv.sb_mb == NULL) {
3965 so->so_rcv.sb_mbtail = NULL;
3966 so->so_rcv.sb_lastrecord = NULL;
3967 } else if (nextrecord->m_nextpkt == NULL) {
3968 so->so_rcv.sb_lastrecord = nextrecord;
3969 }
316670eb 3970 SB_MB_CHECK(&so->so_rcv);
2d21ac55
A
3971 }
3972 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3973 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
0a7de745 3974 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
1c79356b 3975 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
0a7de745 3976 }
1c79356b 3977 }
39236c6e 3978
55e303ae 3979 if (delayed_copy_len) {
91447636 3980 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
0a7de745 3981 if (error) {
2d21ac55 3982 goto release;
0a7de745 3983 }
55e303ae 3984 }
39236c6e
A
3985 if (free_list != NULL) {
3986 m_freem_list(free_list);
3987 free_list = NULL;
55e303ae 3988 }
0a7de745 3989 if (need_event) {
2d21ac55 3990 postevent(so, 0, EV_OOB);
0a7de745 3991 }
39236c6e 3992
91447636 3993 if (orig_resid == uio_resid(uio) && orig_resid &&
1c79356b 3994 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
0a7de745 3995 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
1c79356b
A
3996 goto restart;
3997 }
3998
0a7de745 3999 if (flagsp != NULL) {
1c79356b 4000 *flagsp |= flags;
0a7de745 4001 }
1c79356b 4002release:
91447636 4003#ifdef MORE_LOCKING_DEBUG
39236c6e
A
4004 if (so->so_usecount <= 1) {
4005 panic("%s: release so=%p ref=%d on socket\n", __func__,
2d21ac55 4006 so, so->so_usecount);
39236c6e
A
4007 /* NOTREACHED */
4008 }
91447636 4009#endif
0a7de745 4010 if (delayed_copy_len) {
2d21ac55 4011 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
0a7de745 4012 }
1c79356b 4013
0a7de745 4014 if (free_list != NULL) {
39236c6e 4015 m_freem_list(free_list);
0a7de745 4016 }
39236c6e 4017
0a7de745 4018 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
39236c6e 4019
3e170ce0
A
4020 if (en_tracing) {
4021 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4022 VM_KERNEL_ADDRPERM(so),
4023 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4024 (int64_t)(orig_resid - uio_resid(uio)));
4025 }
2d21ac55
A
4026 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4027 so->so_rcv.sb_cc, 0, error);
1c79356b 4028
0a7de745 4029 return error;
1c79356b
A
4030}
4031
2d21ac55
A
4032/*
4033 * Returns: 0 Success
4034 * uiomove:EFAULT
4035 */
4036static int
4037sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
39236c6e 4038 user_ssize_t *resid)
55e303ae 4039{
2d21ac55 4040 int error = 0;
55e303ae
A
4041 struct mbuf *m;
4042
4043 m = *free_list;
4044
91447636 4045 socket_unlock(so, 0);
55e303ae 4046
39236c6e 4047 while (m != NULL && error == 0) {
2d21ac55 4048 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2d21ac55
A
4049 m = m->m_next;
4050 }
4051 m_freem_list(*free_list);
4052
39236c6e 4053 *free_list = NULL;
2d21ac55
A
4054 *resid = 0;
4055
4056 socket_lock(so, 0);
55e303ae 4057
0a7de745 4058 return error;
2d21ac55
A
4059}
4060
3e170ce0
A
4061static int
4062sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
4063 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
4064{
4065#pragma unused(so)
4066 int error = 0;
4067 struct mbuf *ml, *m;
4068 int i = 0;
4069 struct uio *auio;
4070
4071 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
4072 ml = ml->m_nextpkt, i++) {
4073 auio = msgarray[i].uio;
4074 for (m = ml; m != NULL; m = m->m_next) {
4075 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
0a7de745 4076 if (error != 0) {
3e170ce0 4077 goto out;
0a7de745 4078 }
3e170ce0
A
4079 }
4080 }
4081out:
4082 m_freem_list(*free_list);
4083
4084 *free_list = NULL;
4085 *resid = 0;
4086
0a7de745 4087 return error;
3e170ce0
A
4088}
4089
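/*
 * soreceive_list() below backs the batched datagram receive path. A
 * hedged userland sketch via the recvmsg_x() SPI (availability and the
 * exact shape of struct msghdr_x are assumptions, not guaranteed by
 * this file):
 *
 *	struct msghdr_x msgs[8];
 *	memset(msgs, 0, sizeof(msgs));
 *	// fill in msg_iov/msg_iovlen for each element ...
 *	ssize_t npkts = recvmsg_x(fd, msgs, 8, MSG_DONTWAIT);
 *	// each returned element corresponds to one datagram
 */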
2d21ac55 4090int
3e170ce0
A
4091soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
4092 int *flagsp)
2d21ac55 4093{
3e170ce0 4094 struct mbuf *m;
fe8ab488 4095 struct mbuf *nextrecord;
3e170ce0
A
4096 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
4097 int error;
4098 user_ssize_t len, pktlen, delayed_copy_len = 0;
fe8ab488 4099 struct protosw *pr = so->so_proto;
3e170ce0 4100 user_ssize_t resid;
fe8ab488
A
4101 struct proc *p = current_proc();
4102 struct uio *auio = NULL;
3e170ce0 4103 int npkts = 0;
fe8ab488 4104 int sblocked = 0;
3e170ce0
A
4105 struct sockaddr **psa = NULL;
4106 struct mbuf **controlp = NULL;
4107 int can_delay;
4108 int flags;
4109 struct mbuf *free_others = NULL;
55e303ae 4110
fe8ab488
A
4111 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
4112 so, uiocnt,
4113 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
4114
fe8ab488
A
4115 /*
4116 * Sanity checks:
4117 * - Only the don't-wait flags are supported
4118 * - Only datagram sockets are supported (could be extended to raw)
4119 * - Must be atomic
4120 * - Protocol must support packet chains
4121 * - The uio array must not be NULL (should we panic?)
4122 */
0a7de745 4123 if (flagsp != NULL) {
3e170ce0 4124 flags = *flagsp;
0a7de745 4125 } else {
3e170ce0 4126 flags = 0;
0a7de745 4127 }
3e170ce0
A
4128 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
4129 MSG_NBIO)) {
4130 printf("%s invalid flags 0x%x\n", __func__, flags);
4131 error = EINVAL;
fe8ab488
A
4132 goto out;
4133 }
4134 if (so->so_type != SOCK_DGRAM) {
4135 error = EINVAL;
4136 goto out;
4137 }
4138 if (sosendallatonce(so) == 0) {
4139 error = EINVAL;
4140 goto out;
4141 }
4142 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4143 error = EPROTONOSUPPORT;
4144 goto out;
4145 }
3e170ce0 4146 if (msgarray == NULL) {
fe8ab488
A
4147 printf("%s msgarray is NULL\n", __func__);
4148 error = EINVAL;
4149 goto out;
4150 }
4151 if (uiocnt == 0) {
4152 printf("%s uiocnt is 0\n", __func__);
4153 error = EINVAL;
4154 goto out;
4155 }
4156 /*
4157 * Sanity check on the length passed by caller as we are making 'int'
4158 * comparisons
4159 */
3e170ce0
A
4160 resid = recv_msg_array_resid(msgarray, uiocnt);
4161 if (resid < 0 || resid > INT_MAX) {
fe8ab488
A
4162 error = EINVAL;
4163 goto out;
4164 }
4165
0a7de745 4166 if (!(flags & MSG_PEEK) && sorecvmincopy > 0) {
3e170ce0 4167 can_delay = 1;
0a7de745 4168 } else {
3e170ce0 4169 can_delay = 0;
0a7de745 4170 }
3e170ce0 4171
fe8ab488
A
4172 socket_lock(so, 1);
4173 so_update_last_owner_locked(so, p);
4174 so_update_policy(so);
4175
4176#if NECP
4177 so_update_necp_policy(so, NULL, NULL);
4178#endif /* NECP */
3e170ce0 4179
fe8ab488
A
4180 /*
4181 * If a recv attempt is made on a previously-accepted socket
4182 * that has been marked as inactive (disconnected), reject
4183 * the request.
4184 */
4185 if (so->so_flags & SOF_DEFUNCT) {
4186 struct sockbuf *sb = &so->so_rcv;
4187
4188 error = ENOTCONN;
39037602
A
4189 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4190 __func__, proc_pid(p), proc_best_name(p),
4191 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4192 SOCK_DOM(so), SOCK_TYPE(so), error);
fe8ab488
A
4193 /*
4194 * This socket should have been disconnected and flushed
4195 * prior to being returned from sodefunct(); there should
4196 * be no data on its receive list, so panic otherwise.
4197 */
0a7de745 4198 if (so->so_state & SS_DEFUNCT) {
fe8ab488 4199 sb_empty_assert(sb, __func__);
0a7de745 4200 }
fe8ab488
A
4201 goto release;
4202 }
3e170ce0
A
4203
4204next:
4205 /*
4206 * The uio may be empty
4207 */
4208 if (npkts >= uiocnt) {
4209 error = 0;
4210 goto release;
4211 }
fe8ab488
A
4212restart:
4213 /*
4214 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4215 * and if so just return to the caller. This could happen when
4216 * soreceive() is called by a socket upcall function during the
4217 * time the socket is freed. The socket buffer would have been
4218 * locked across the upcall, therefore we cannot put this thread
4219 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4220 * we may livelock), because the lock on the socket buffer will
4221 * only be released when the upcall routine returns to its caller.
4222 * Because the socket has been officially closed, there can be
4223 * no further read on it.
4224 */
4225 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4226 (SS_NOFDREF | SS_CANTRCVMORE)) {
4227 error = 0;
4228 goto release;
4229 }
4230
4231 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4232 if (error) {
4233 goto release;
4234 }
4235 sblocked = 1;
4236
fe8ab488
A
4237 m = so->so_rcv.sb_mb;
4238 /*
4239 * Block awaiting more datagram if needed
4240 */
3e170ce0
A
4241 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4242 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4243 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
fe8ab488
A
4244 /*
4245 * Panic if we notice inconsistencies in the socket's
4246 * receive list; both sb_mb and sb_cc should correctly
4247 * reflect the contents of the list, otherwise we may
4248 * end up with false positives during select() or poll()
4249 * which could put the application in a bad state.
4250 */
4251 SB_MB_CHECK(&so->so_rcv);
4252
4253 if (so->so_error) {
4254 error = so->so_error;
0a7de745 4255 if ((flags & MSG_PEEK) == 0) {
3e170ce0 4256 so->so_error = 0;
0a7de745 4257 }
fe8ab488
A
4258 goto release;
4259 }
4260 if (so->so_state & SS_CANTRCVMORE) {
4261 goto release;
4262 }
0a7de745 4263 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
fe8ab488
A
4264 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4265 error = ENOTCONN;
4266 goto release;
4267 }
4268 if ((so->so_state & SS_NBIO) ||
0a7de745 4269 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
fe8ab488
A
4270 error = EWOULDBLOCK;
4271 goto release;
4272 }
4273 /*
4274 * Do not block if we got some data
fe8ab488 4275 */
3e170ce0 4276 if (free_list != NULL) {
fe8ab488
A
4277 error = 0;
4278 goto release;
4279 }
3e170ce0 4280
fe8ab488
A
4281 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4282 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4283
0a7de745 4284 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
fe8ab488
A
4285 sblocked = 0;
4286
4287 error = sbwait(&so->so_rcv);
4288 if (error) {
4289 goto release;
4290 }
4291 goto restart;
4292 }
4293
fe8ab488
A
4294 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4295 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4296 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4297
4298 /*
4299 * Consume the current uio index as we have a datagram
4300 */
3e170ce0
A
4301 auio = msgarray[npkts].uio;
4302 resid = uio_resid(auio);
4303 msgarray[npkts].which |= SOCK_MSG_DATA;
4304 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4305 &msgarray[npkts].psa : NULL;
4306 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4307 &msgarray[npkts].controlp : NULL;
4308 npkts += 1;
fe8ab488
A
4309 nextrecord = m->m_nextpkt;
4310
fe8ab488 4311 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3e170ce0 4312 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
0a7de745 4313 if (error == ERESTART) {
3e170ce0 4314 goto restart;
0a7de745 4315 } else if (error != 0) {
3e170ce0 4316 goto release;
0a7de745 4317 }
fe8ab488 4318 }
fe8ab488 4319
fe8ab488 4320 if (m != NULL && m->m_type == MT_CONTROL) {
3e170ce0 4321 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
0a7de745 4322 if (error != 0) {
3e170ce0 4323 goto release;
0a7de745 4324 }
fe8ab488 4325 }
fe8ab488 4326
3e170ce0
A
4327 if (m->m_pkthdr.len == 0) {
4328 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4329 __func__, __LINE__,
4330 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4331 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4332 m->m_type);
4333 }
fe8ab488
A
4334
4335 /*
3e170ce0
A
4336 * Loop to copy the mbufs of the current record
4337 * Support zero length packets
fe8ab488 4338 */
3e170ce0
A
4339 ml = NULL;
4340 pktlen = 0;
4341 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
0a7de745 4342 if (m->m_len == 0) {
3e170ce0 4343 panic("%p m_len zero", m);
0a7de745
A
4344 }
4345 if (m->m_type == 0) {
3e170ce0 4346 panic("%p m_type zero", m);
0a7de745 4347 }
fe8ab488
A
4348 /*
4349 * Clip to the residual length
4350 */
0a7de745 4351 if (len > m->m_len) {
fe8ab488 4352 len = m->m_len;
0a7de745 4353 }
3e170ce0 4354 pktlen += len;
fe8ab488 4355 /*
3e170ce0 4356 * Copy the mbufs via the uio or delay the copy
fe8ab488
A
4357 * Sockbuf must be consistent here (sb_mb points to the current
4358 * mbuf, m_nextpkt to the next record) when we drop priority;
4359 * we must note any additions to the sockbuf when we
4360 * block interrupts again.
4361 */
3e170ce0 4362 if (len > 0 && can_delay == 0) {
fe8ab488
A
4363 socket_unlock(so, 0);
4364 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4365 socket_lock(so, 0);
0a7de745 4366 if (error) {
fe8ab488 4367 goto release;
0a7de745 4368 }
3e170ce0
A
4369 } else {
4370 delayed_copy_len += len;
fe8ab488 4371 }
3e170ce0 4372
fe8ab488
A
4373 if (len == m->m_len) {
4374 /*
3e170ce0 4375 * m was entirely copied
fe8ab488 4376 */
fe8ab488 4377 sbfree(&so->so_rcv, m);
3e170ce0 4378 nextrecord = m->m_nextpkt;
fe8ab488
A
4379 m->m_nextpkt = NULL;
4380
4381 /*
3e170ce0 4382 * Set the first packet to the head of the free list
fe8ab488 4383 */
0a7de745 4384 if (free_list == NULL) {
3e170ce0 4385 free_list = m;
0a7de745 4386 }
3e170ce0
A
4387 /*
4388 * Link current packet to tail of free list
4389 */
4390 if (ml == NULL) {
0a7de745 4391 if (free_tail != NULL) {
3e170ce0 4392 free_tail->m_nextpkt = m;
0a7de745 4393 }
3e170ce0 4394 free_tail = m;
fe8ab488 4395 }
3e170ce0
A
4396 /*
4397 * Link current mbuf to last mbuf of current packet
4398 */
0a7de745 4399 if (ml != NULL) {
3e170ce0 4400 ml->m_next = m;
0a7de745 4401 }
3e170ce0
A
4402 ml = m;
4403
4404 /*
4405 * Move next buf to head of socket buffer
4406 */
4407 so->so_rcv.sb_mb = m = ml->m_next;
4408 ml->m_next = NULL;
4409
fe8ab488
A
4410 if (m != NULL) {
4411 m->m_nextpkt = nextrecord;
0a7de745 4412 if (nextrecord == NULL) {
fe8ab488 4413 so->so_rcv.sb_lastrecord = m;
0a7de745 4414 }
fe8ab488
A
4415 } else {
4416 so->so_rcv.sb_mb = nextrecord;
4417 SB_EMPTY_FIXUP(&so->so_rcv);
4418 }
4419 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4420 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4421 } else {
4422 /*
4423 * Stop the loop on partial copy
4424 */
fe8ab488
A
4425 break;
4426 }
4427 }
4428#ifdef MORE_LOCKING_DEBUG
4429 if (so->so_usecount <= 1) {
4430 panic("%s: after big while so=%llx ref=%d on socket\n",
4431 __func__,
3e170ce0 4432 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
fe8ab488
A
4433 /* NOTREACHED */
4434 }
4435#endif
4436 /*
4437 * Tell the caller we made a partial copy
4438 */
4439 if (m != NULL) {
4440 if (so->so_options & SO_DONTTRUNC) {
3e170ce0
A
4441 /*
4442 * Copy out the free list first, then the partial mbuf
4443 */
4444 socket_unlock(so, 0);
0a7de745 4445 if (delayed_copy_len) {
3e170ce0
A
4446 error = sodelayed_copy_list(so, msgarray,
4447 uiocnt, &free_list, &delayed_copy_len);
0a7de745 4448 }
3e170ce0
A
4449
4450 if (error == 0) {
4451 error = uiomove(mtod(m, caddr_t), (int)len,
4452 auio);
4453 }
4454 socket_lock(so, 0);
0a7de745 4455 if (error) {
3e170ce0 4456 goto release;
0a7de745 4457 }
3e170ce0 4458
fe8ab488
A
4459 m->m_data += len;
4460 m->m_len -= len;
4461 so->so_rcv.sb_cc -= len;
4462 flags |= MSG_RCVMORE;
4463 } else {
4464 (void) sbdroprecord(&so->so_rcv);
4465 nextrecord = so->so_rcv.sb_mb;
4466 m = NULL;
4467 flags |= MSG_TRUNC;
4468 }
4469 }
4470
4471 if (m == NULL) {
4472 so->so_rcv.sb_mb = nextrecord;
4473 /*
4474 * First part is an inline SB_EMPTY_FIXUP(). Second
4475 * part makes sure sb_lastrecord is up-to-date if
4476 * there is still data in the socket buffer.
4477 */
4478 if (so->so_rcv.sb_mb == NULL) {
4479 so->so_rcv.sb_mbtail = NULL;
4480 so->so_rcv.sb_lastrecord = NULL;
4481 } else if (nextrecord->m_nextpkt == NULL) {
4482 so->so_rcv.sb_lastrecord = nextrecord;
4483 }
4484 SB_MB_CHECK(&so->so_rcv);
4485 }
4486 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4487 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4488
4489 /*
4490 * We can continue to the next packet as long as:
4491 * - We haven't exhausted the uio array
4492 * - There was no error
4493 * - A packet was not truncated
4494 * - We can still receive more data
3e170ce0
A
4495 */
4496 if (npkts < uiocnt && error == 0 &&
4497 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4498 (so->so_state & SS_CANTRCVMORE) == 0) {
0a7de745 4499 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
fe8ab488
A
4500 sblocked = 0;
4501
3e170ce0 4502 goto next;
fe8ab488 4503 }
0a7de745 4504 if (flagsp != NULL) {
3e170ce0 4505 *flagsp |= flags;
0a7de745 4506 }
fe8ab488
A
4507
4508release:
4509 /*
4510 * pru_rcvd may cause more data to be received if the socket lock
4511 * is dropped so we set MSG_HAVEMORE now based on what we know.
3e170ce0
A
4512 * That way the caller won't be surprised if it receives less data
4513 * than requested.
fe8ab488 4514 */
0a7de745 4515 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
fe8ab488 4516 flags |= MSG_HAVEMORE;
0a7de745 4517 }
fe8ab488 4518
0a7de745 4519 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
fe8ab488 4520 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
0a7de745 4521 }
fe8ab488 4522
0a7de745
A
4523 if (sblocked) {
4524 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4525 } else {
fe8ab488 4526 socket_unlock(so, 1);
0a7de745 4527 }
3e170ce0 4528
0a7de745 4529 if (delayed_copy_len) {
3e170ce0
A
4530 error = sodelayed_copy_list(so, msgarray, uiocnt,
4531 &free_list, &delayed_copy_len);
0a7de745 4532 }
fe8ab488
A
4533out:
4534 /*
3e170ce0 4535 * Amortize the cost of freeing the mbufs
fe8ab488 4536 */
0a7de745 4537 if (free_list != NULL) {
fe8ab488 4538 m_freem_list(free_list);
0a7de745
A
4539 }
4540 if (free_others != NULL) {
3e170ce0 4541 m_freem_list(free_others);
0a7de745 4542 }
fe8ab488
A
4543
4544 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4545 0, 0, 0, 0);
0a7de745 4546 return error;
fe8ab488
A
4547}
4548
cb323159
A
4549static int
4550so_statistics_event_to_nstat_event(int64_t *input_options,
4551 uint64_t *nstat_event)
4552{
4553 int error = 0;
4554 switch (*input_options) {
4555 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4556 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4557 break;
4558 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4559 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4560 break;
4561#if (DEBUG || DEVELOPMENT)
4562 case SO_STATISTICS_EVENT_RESERVED_1:
4563 *nstat_event = NSTAT_EVENT_SRC_RESERVED_1;
4564 break;
4565 case SO_STATISTICS_EVENT_RESERVED_2:
4566 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4567 break;
4568#endif /* (DEBUG || DEVELOPMENT) */
4569 default:
4570 error = EINVAL;
4571 break;
4572 }
4573 return error;
4574}
4575
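/*
 * Usage sketch for the mapping above (caller-side; error handling
 * elided):
 *
 *	int64_t opt = SO_STATISTICS_EVENT_ENTER_CELLFALLBACK;
 *	uint64_t ev = 0;
 *	if (so_statistics_event_to_nstat_event(&opt, &ev) == 0)
 *		; // ev == NSTAT_EVENT_SRC_ENTER_CELLFALLBACK
 */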
fe8ab488
A
4576/*
4577 * Returns: 0 Success
4578 * EINVAL
4579 * ENOTCONN
4580 * <pru_shutdown>:EINVAL
4581 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4582 * <pru_shutdown>:ENOBUFS[TCP]
4583 * <pru_shutdown>:EMSGSIZE[TCP]
4584 * <pru_shutdown>:EHOSTUNREACH[TCP]
4585 * <pru_shutdown>:ENETUNREACH[TCP]
4586 * <pru_shutdown>:ENETDOWN[TCP]
4587 * <pru_shutdown>:ENOMEM[TCP]
4588 * <pru_shutdown>:EACCES[TCP]
4589 * <pru_shutdown>:EMSGSIZE[TCP]
4590 * <pru_shutdown>:ENOBUFS[TCP]
4591 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4592 * <pru_shutdown>:??? [other protocol families]
4593 */
4594int
4595soshutdown(struct socket *so, int how)
4596{
4597 int error;
4598
4599 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4600
4601 switch (how) {
4602 case SHUT_RD:
4603 case SHUT_WR:
4604 case SHUT_RDWR:
4605 socket_lock(so, 1);
4606 if ((so->so_state &
0a7de745 4607 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
fe8ab488 4608 error = ENOTCONN;
2d21ac55
A
4609 } else {
4610 error = soshutdownlock(so, how);
4611 }
4612 socket_unlock(so, 1);
4613 break;
4614 default:
4615 error = EINVAL;
4616 break;
55e303ae 4617 }
55e303ae 4618
fe8ab488
A
4619 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4620
0a7de745 4621 return error;
55e303ae
A
4622}
4623
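/*
 * Userland view of the switch above (standard shutdown(2)):
 *
 *	shutdown(fd, SHUT_WR);   // stop sending; peer reads EOF
 *	shutdown(fd, SHUT_RDWR); // both directions
 *	// any other "how" fails with EINVAL, as in the default case;
 *	// an unconnected socket fails with ENOTCONN, as checked above
 */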
1c79356b 4624int
fe8ab488 4625soshutdownlock_final(struct socket *so, int how)
1c79356b 4626{
2d21ac55
A
4627 struct protosw *pr = so->so_proto;
4628 int error = 0;
1c79356b 4629
91447636 4630 sflt_notify(so, sock_evt_shutdown, &how);
1c79356b 4631
9bccf70c 4632 if (how != SHUT_WR) {
2d21ac55
A
4633 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4634 /* read already shut down */
4635 error = ENOTCONN;
4636 goto done;
4637 }
1c79356b
A
4638 sorflush(so);
4639 postevent(so, 0, EV_RCLOSED);
4640 }
9bccf70c 4641 if (how != SHUT_RD) {
2d21ac55
A
4642 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4643 /* write already shut down */
4644 error = ENOTCONN;
4645 goto done;
4646 }
4647 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4648 postevent(so, 0, EV_WCLOSED);
1c79356b 4649 }
2d21ac55 4650done:
fe8ab488 4651 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
0a7de745 4652 return error;
fe8ab488
A
4653}
4654
4655int
4656soshutdownlock(struct socket *so, int how)
4657{
4658 int error = 0;
4659
4660#if CONTENT_FILTER
4661 /*
4662 * A content filter may delay the actual shutdown until it
4663 * has processed the pending data
4664 */
4665 if (so->so_flags & SOF_CONTENT_FILTER) {
4666 error = cfil_sock_shutdown(so, &how);
4667 if (error == EJUSTRETURN) {
4668 error = 0;
4669 goto done;
4670 } else if (error != 0) {
4671 goto done;
4672 }
4673 }
4674#endif /* CONTENT_FILTER */
3e170ce0 4675
fe8ab488
A
4676 error = soshutdownlock_final(so, how);
4677
4678done:
0a7de745 4679 return error;
1c79356b
A
4680}
4681
39236c6e
A
4682void
4683sowflush(struct socket *so)
4684{
4685 struct sockbuf *sb = &so->so_snd;
39236c6e
A
4686
4687 /*
4688 * Obtain lock on the socket buffer (SB_LOCK). This is required
4689 * to prevent the socket buffer from being unexpectedly altered
4690 * while it is used by another thread in socket send/receive.
4691 *
4692 * sblock() must not fail here, hence the assertion.
4693 */
4694 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4695 VERIFY(sb->sb_flags & SB_LOCK);
4696
0a7de745
A
4697 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4698 sb->sb_flags |= SB_DROP;
4699 sb->sb_upcall = NULL;
4700 sb->sb_upcallarg = NULL;
39236c6e 4701
0a7de745 4702 sbunlock(sb, TRUE); /* keep socket locked */
39236c6e
A
4703
4704 selthreadclear(&sb->sb_sel);
4705 sbrelease(sb);
4706}
4707
1c79356b 4708void
2d21ac55 4709sorflush(struct socket *so)
1c79356b 4710{
39236c6e
A
4711 struct sockbuf *sb = &so->so_rcv;
4712 struct protosw *pr = so->so_proto;
1c79356b 4713 struct sockbuf asb;
39236c6e 4714#ifdef notyet
2d21ac55 4715 lck_mtx_t *mutex_held;
39236c6e
A
4716 /*
4717 * XXX: This code is currently commented out, because we may get here
4718 * as part of sofreelastref(), and at that time, pr_getlock() may no
4719 * longer be able to return us the lock; this will be fixed in future.
4720 */
0a7de745 4721 if (so->so_proto->pr_getlock != NULL) {
91447636 4722 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
0a7de745 4723 } else {
91447636 4724 mutex_held = so->so_proto->pr_domain->dom_mtx;
0a7de745 4725 }
39236c6e 4726
5ba3f43e 4727 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
39236c6e 4728#endif /* notyet */
91447636
A
4729
4730 sflt_notify(so, sock_evt_flush_read, NULL);
1c79356b 4731
1c79356b 4732 socantrcvmore(so);
39236c6e
A
4733
4734 /*
4735 * Obtain lock on the socket buffer (SB_LOCK). This is required
4736 * to prevent the socket buffer from being unexpectedly altered
4737 * while it is used by another thread in socket send/receive.
4738 *
4739 * sblock() must not fail here, hence the assertion.
4740 */
4741 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4742 VERIFY(sb->sb_flags & SB_LOCK);
4743
4744 /*
4745 * Copy only the relevant fields from "sb" to "asb" which we
4746 * need for sbrelease() to function. In particular, skip
4747 * sb_sel as it contains the wait queue linkage, which would
4748 * wreak havoc if we were to issue selthreadclear() on "asb".
4749 * Make sure to not carry over SB_LOCK in "asb", as we need
4750 * to acquire it later as part of sbrelease().
4751 */
0a7de745
A
4752 bzero(&asb, sizeof(asb));
4753 asb.sb_cc = sb->sb_cc;
4754 asb.sb_hiwat = sb->sb_hiwat;
4755 asb.sb_mbcnt = sb->sb_mbcnt;
4756 asb.sb_mbmax = sb->sb_mbmax;
4757 asb.sb_ctl = sb->sb_ctl;
4758 asb.sb_lowat = sb->sb_lowat;
4759 asb.sb_mb = sb->sb_mb;
4760 asb.sb_mbtail = sb->sb_mbtail;
4761 asb.sb_lastrecord = sb->sb_lastrecord;
4762 asb.sb_so = sb->sb_so;
4763 asb.sb_flags = sb->sb_flags;
4764 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4765 asb.sb_flags |= SB_DROP;
39236c6e
A
4766
4767 /*
4768 * Ideally we'd bzero() these and preserve the ones we need;
4769 * but to do that we'd need to shuffle things around in the
4770 * sockbuf, and we can't do it now because there are KEXTS
4771 * that are directly referring to the socket structure.
4772 *
4773 * Setting SB_DROP acts as a barrier to prevent further appends.
4774 * Clearing SB_SEL is done for selthreadclear() below.
4775 */
0a7de745
A
4776 sb->sb_cc = 0;
4777 sb->sb_hiwat = 0;
4778 sb->sb_mbcnt = 0;
4779 sb->sb_mbmax = 0;
4780 sb->sb_ctl = 0;
4781 sb->sb_lowat = 0;
4782 sb->sb_mb = NULL;
4783 sb->sb_mbtail = NULL;
4784 sb->sb_lastrecord = NULL;
4785 sb->sb_timeo.tv_sec = 0;
4786 sb->sb_timeo.tv_usec = 0;
4787 sb->sb_upcall = NULL;
4788 sb->sb_upcallarg = NULL;
4789 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4790 sb->sb_flags |= SB_DROP;
4791
4792 sbunlock(sb, TRUE); /* keep socket locked */
39236c6e
A
4793
4794 /*
4795 * Note that selthreadclear() is called on the original "sb" and
4796 * not the local "asb" because of the way wait queue linkage is
4797 * implemented. Given that selwakeup() may be triggered, SB_SEL
4798 * should no longer be set (cleared above.)
4799 */
0b4e3aa0 4800 selthreadclear(&sb->sb_sel);
39236c6e 4801
0a7de745 4802 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
1c79356b 4803 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
0a7de745 4804 }
39236c6e 4805
1c79356b
A
4806 sbrelease(&asb);
4807}
4808
4809/*
4810 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4811 * an additional variant to handle the case where the option value needs
4812 * to be some kind of integer, but not a specific size.
4813 * In addition to their use here, these functions are also called by the
4814 * protocol-level pr_ctloutput() routines.
2d21ac55
A
4815 *
4816 * Returns: 0 Success
4817 * EINVAL
4818 * copyin:EFAULT
1c79356b
A
4819 */
4820int
2d21ac55 4821sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
1c79356b 4822{
0a7de745 4823 size_t valsize;
1c79356b
A
4824
4825 /*
4826 * If the user gives us more than we wanted, we ignore it,
4827 * but if we don't get the minimum length the caller
4828 * wants, we return EINVAL. On success, sopt->sopt_valsize
4829 * is set to however much we actually retrieved.
4830 */
0a7de745
A
4831 if ((valsize = sopt->sopt_valsize) < minlen) {
4832 return EINVAL;
4833 }
4834 if (valsize > len) {
1c79356b 4835 sopt->sopt_valsize = valsize = len;
0a7de745 4836 }
1c79356b 4837
0a7de745
A
4838 if (sopt->sopt_p != kernproc) {
4839 return copyin(sopt->sopt_val, buf, valsize);
4840 }
1c79356b 4841
91447636 4842 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
0a7de745 4843 return 0;
2d21ac55
A
4844}
4845
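/*
 * Typical caller pattern for sooptcopyin() (mirrors the uses in
 * sosetoptlock() below):
 *
 *	int optval;
 *	error = sooptcopyin(sopt, &optval, sizeof(optval),
 *	    sizeof(optval));
 *	if (error != 0)
 *		goto out;
 */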
4846/*
4847 * sooptcopyin_timeval
4848 * Copy in a timeval value into tv_p, and take into account whether the
4849 * calling process is 64-bit or 32-bit. Moved the sanity checking
4850 * code here so that we can verify the 64-bit tv_sec value before we lose
4851 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4852 */
4853static int
39236c6e 4854sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
2d21ac55 4855{
0a7de745 4856 int error;
b0d623f7 4857
2d21ac55 4858 if (proc_is64bit(sopt->sopt_p)) {
0a7de745 4859 struct user64_timeval tv64;
2d21ac55 4860
0a7de745
A
4861 if (sopt->sopt_valsize < sizeof(tv64)) {
4862 return EINVAL;
4863 }
39236c6e 4864
0a7de745 4865 sopt->sopt_valsize = sizeof(tv64);
b0d623f7 4866 if (sopt->sopt_p != kernproc) {
0a7de745
A
4867 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4868 if (error != 0) {
4869 return error;
4870 }
b0d623f7
A
4871 } else {
4872 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
0a7de745 4873 sizeof(tv64));
2d21ac55 4874 }
39236c6e 4875 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
0a7de745
A
4876 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4877 return EDOM;
4878 }
39236c6e 4879
2d21ac55
A
4880 tv_p->tv_sec = tv64.tv_sec;
4881 tv_p->tv_usec = tv64.tv_usec;
4882 } else {
0a7de745 4883 struct user32_timeval tv32;
b0d623f7 4884
0a7de745
A
4885 if (sopt->sopt_valsize < sizeof(tv32)) {
4886 return EINVAL;
4887 }
39236c6e 4888
0a7de745 4889 sopt->sopt_valsize = sizeof(tv32);
b0d623f7 4890 if (sopt->sopt_p != kernproc) {
0a7de745 4891 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
2d21ac55 4892 if (error != 0) {
0a7de745 4893 return error;
2d21ac55
A
4894 }
4895 } else {
b0d623f7 4896 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
0a7de745 4897 sizeof(tv32));
2d21ac55 4898 }
39236c6e
A
4899#ifndef __LP64__
4900 /*
4901 * K64todo "comparison is always false due to
4902 * limited range of data type"
4903 */
4904 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
0a7de745
A
4905 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4906 return EDOM;
4907 }
b0d623f7
A
4908#endif
4909 tv_p->tv_sec = tv32.tv_sec;
4910 tv_p->tv_usec = tv32.tv_usec;
2d21ac55 4911 }
0a7de745 4912 return 0;
1c79356b
A
4913}
4914
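/*
 * Userland counterpart (standard setsockopt(2)); the 32/64-bit
 * normalization above is what makes this layout-safe:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *	// tv_usec outside [0, 1000000) is rejected with EDOM above
 */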
5ba3f43e 4915int
cb323159
A
4916soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4917 boolean_t ignore_delegate)
39037602
A
4918{
4919 kauth_cred_t cred = NULL;
4920 proc_t ep = PROC_NULL;
5ba3f43e
A
4921 uid_t uid;
4922 int error = 0;
39037602 4923
cb323159 4924 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
39037602 4925 ep = proc_find(so->e_pid);
0a7de745 4926 if (ep) {
39037602 4927 cred = kauth_cred_proc_ref(ep);
0a7de745 4928 }
39037602 4929 }
5ba3f43e
A
4930
4931 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4932
4933 /* uid is 0 for root */
0a7de745 4934 if (uid != 0 || !allow_root) {
5ba3f43e 4935 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
0a7de745
A
4936 }
4937 if (cred) {
39037602 4938 kauth_cred_unref(&cred);
0a7de745
A
4939 }
4940 if (ep != PROC_NULL) {
39037602 4941 proc_rele(ep);
0a7de745 4942 }
39037602 4943
0a7de745 4944 return error;
39037602
A
4945}
4946
2d21ac55
A
4947/*
4948 * Returns: 0 Success
4949 * EINVAL
4950 * ENOPROTOOPT
4951 * ENOBUFS
4952 * EDOM
4953 * sooptcopyin:EINVAL
4954 * sooptcopyin:EFAULT
4955 * sooptcopyin_timeval:EINVAL
4956 * sooptcopyin_timeval:EFAULT
4957 * sooptcopyin_timeval:EDOM
4958 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4959 * <pr_ctloutput>:???
4960 * sflt_attach_private:??? [whatever a filter author chooses]
4961 * <sf_setoption>:??? [whatever a filter author chooses]
4962 *
4963 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4964 * <sf_setoption> returns depend on what the filter author causes
4965 * their filter to return.
4966 */
1c79356b 4967int
39236c6e 4968sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
1c79356b 4969{
0a7de745 4970 int error, optval;
cb323159 4971 int64_t long_optval;
0a7de745
A
4972 struct linger l;
4973 struct timeval tv;
2d21ac55
A
4974#if CONFIG_MACF_SOCKET
4975 struct mac extmac;
4976#endif /* MAC_SOCKET */
91447636 4977
0a7de745 4978 if (sopt->sopt_dir != SOPT_SET) {
39236c6e 4979 sopt->sopt_dir = SOPT_SET;
0a7de745 4980 }
39236c6e 4981
0a7de745 4982 if (dolock) {
39236c6e 4983 socket_lock(so, 1);
0a7de745 4984 }
39236c6e
A
4985
4986 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4987 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
b0d623f7 4988 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
2d21ac55
A
4989 /* the socket has been shutdown, no more sockopt's */
4990 error = EINVAL;
39236c6e 4991 goto out;
9bccf70c
A
4992 }
4993
6d2010ae 4994 error = sflt_setsockopt(so, sopt);
39236c6e 4995 if (error != 0) {
0a7de745 4996 if (error == EJUSTRETURN) {
6d2010ae 4997 error = 0;
0a7de745 4998 }
39236c6e 4999 goto out;
1c79356b
A
5000 }
5001
1c79356b 5002 if (sopt->sopt_level != SOL_SOCKET) {
39236c6e
A
5003 if (so->so_proto != NULL &&
5004 so->so_proto->pr_ctloutput != NULL) {
2d21ac55 5005 error = (*so->so_proto->pr_ctloutput)(so, sopt);
39236c6e 5006 goto out;
91447636 5007 }
1c79356b
A
5008 error = ENOPROTOOPT;
5009 } else {
39236c6e
A
5010 /*
5011 * Allow socket-level (SOL_SOCKET) options to be filtered by
5012 * the protocol layer, if needed. A zero value returned from
5013 * the handler means use default socket-level processing as
5014 * done by the rest of this routine. Otherwise, any other
5015 * return value indicates that the option is unsupported.
5016 */
5017 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
0a7de745 5018 pru_socheckopt(so, sopt)) != 0) {
39236c6e 5019 goto out;
0a7de745 5020 }
39236c6e
A
5021
5022 error = 0;
1c79356b
A
5023 switch (sopt->sopt_name) {
5024 case SO_LINGER:
91447636 5025 case SO_LINGER_SEC:
0a7de745
A
5026 error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
5027 if (error != 0) {
39236c6e 5028 goto out;
0a7de745 5029 }
1c79356b 5030
2d21ac55
A
5031 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
5032 l.l_linger : l.l_linger * hz;
0a7de745 5033 if (l.l_onoff != 0) {
1c79356b 5034 so->so_options |= SO_LINGER;
0a7de745 5035 } else {
1c79356b 5036 so->so_options &= ~SO_LINGER;
0a7de745 5037 }
1c79356b
A
5038 break;
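			/*
			 * Note on the case above: SO_LINGER_SEC scales
			 * l_linger by hz before storing it, while plain
			 * SO_LINGER stores it unscaled. Sketch:
			 *
			 *	struct linger l = { .l_onoff = 1, .l_linger = 5 };
			 *	setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC,
			 *	    &l, sizeof(l));
			 */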
5039
5040 case SO_DEBUG:
5041 case SO_KEEPALIVE:
5042 case SO_DONTROUTE:
5043 case SO_USELOOPBACK:
5044 case SO_BROADCAST:
5045 case SO_REUSEADDR:
5046 case SO_REUSEPORT:
5047 case SO_OOBINLINE:
5048 case SO_TIMESTAMP:
6d2010ae 5049 case SO_TIMESTAMP_MONOTONIC:
d9a64523 5050 case SO_TIMESTAMP_CONTINUOUS:
1c79356b
A
5051 case SO_DONTTRUNC:
5052 case SO_WANTMORE:
9bccf70c 5053 case SO_WANTOOBFLAG:
fe8ab488 5054 case SO_NOWAKEFROMSLEEP:
39037602 5055 case SO_NOAPNFALLBK:
0a7de745
A
5056 error = sooptcopyin(sopt, &optval, sizeof(optval),
5057 sizeof(optval));
5058 if (error != 0) {
39236c6e 5059 goto out;
0a7de745
A
5060 }
5061 if (optval) {
1c79356b 5062 so->so_options |= sopt->sopt_name;
0a7de745 5063 } else {
1c79356b 5064 so->so_options &= ~sopt->sopt_name;
0a7de745 5065 }
1c79356b
A
5066 break;
5067
5068 case SO_SNDBUF:
5069 case SO_RCVBUF:
5070 case SO_SNDLOWAT:
5071 case SO_RCVLOWAT:
0a7de745
A
5072 error = sooptcopyin(sopt, &optval, sizeof(optval),
5073 sizeof(optval));
5074 if (error != 0) {
39236c6e 5075 goto out;
0a7de745 5076 }
1c79356b
A
5077
5078 /*
5079 * Values < 1 make no sense for any of these
5080 * options, so disallow them.
5081 */
5082 if (optval < 1) {
5083 error = EINVAL;
39236c6e 5084 goto out;
1c79356b
A
5085 }
5086
5087 switch (sopt->sopt_name) {
5088 case SO_SNDBUF:
39236c6e
A
5089 case SO_RCVBUF: {
5090 struct sockbuf *sb =
5091 (sopt->sopt_name == SO_SNDBUF) ?
5092 &so->so_snd : &so->so_rcv;
5093 if (sbreserve(sb, (u_int32_t)optval) == 0) {
1c79356b 5094 error = ENOBUFS;
39236c6e 5095 goto out;
1c79356b 5096 }
316670eb
A
5097 sb->sb_flags |= SB_USRSIZE;
5098 sb->sb_flags &= ~SB_AUTOSIZE;
5099 sb->sb_idealsize = (u_int32_t)optval;
1c79356b 5100 break;
316670eb 5101 }
1c79356b
A
5102 /*
5103 * Make sure the low-water is never greater than
5104 * the high-water.
5105 */
fe8ab488
A
5106 case SO_SNDLOWAT: {
5107 int space = sbspace(&so->so_snd);
5108 u_int32_t hiwat = so->so_snd.sb_hiwat;
5109
5110 if (so->so_snd.sb_flags & SB_UNIX) {
5111 struct unpcb *unp =
5112 (struct unpcb *)(so->so_pcb);
3e170ce0
A
5113 if (unp != NULL &&
5114 unp->unp_conn != NULL) {
fe8ab488
A
5115 hiwat += unp->unp_conn->unp_cc;
5116 }
5117 }
5118
1c79356b 5119 so->so_snd.sb_lowat =
fe8ab488
A
5120 (optval > hiwat) ?
5121 hiwat : optval;
5122
5123 if (space >= so->so_snd.sb_lowat) {
5124 sowwakeup(so);
5125 }
1c79356b 5126 break;
3e170ce0 5127 }
fe8ab488
A
5128 case SO_RCVLOWAT: {
5129 int64_t data_len;
1c79356b
A
5130 so->so_rcv.sb_lowat =
5131 (optval > so->so_rcv.sb_hiwat) ?
5132 so->so_rcv.sb_hiwat : optval;
3e170ce0 5133 data_len = so->so_rcv.sb_cc
fe8ab488 5134 - so->so_rcv.sb_ctl;
0a7de745
A
5135 if (data_len >= so->so_rcv.sb_lowat) {
5136 sorwakeup(so);
5137 }
1c79356b
A
5138 break;
5139 }
fe8ab488 5140 }
1c79356b
A
5141 break;
5142
5143 case SO_SNDTIMEO:
5144 case SO_RCVTIMEO:
2d21ac55 5145 error = sooptcopyin_timeval(sopt, &tv);
0a7de745 5146 if (error != 0) {
39236c6e 5147 goto out;
0a7de745 5148 }
1c79356b 5149
1c79356b
A
5150 switch (sopt->sopt_name) {
5151 case SO_SNDTIMEO:
91447636 5152 so->so_snd.sb_timeo = tv;
1c79356b
A
5153 break;
5154 case SO_RCVTIMEO:
91447636 5155 so->so_rcv.sb_timeo = tv;
1c79356b
A
5156 break;
5157 }
5158 break;
5159
39236c6e 5160 case SO_NKE: {
9bccf70c 5161 struct so_nke nke;
1c79356b 5162
0a7de745
A
5163 error = sooptcopyin(sopt, &nke, sizeof(nke),
5164 sizeof(nke));
5165 if (error != 0) {
39236c6e 5166 goto out;
0a7de745 5167 }
1c79356b 5168
6d2010ae 5169 error = sflt_attach_internal(so, nke.nke_handle);
1c79356b
A
5170 break;
5171 }
5172
9bccf70c 5173 case SO_NOSIGPIPE:
0a7de745
A
5174 error = sooptcopyin(sopt, &optval, sizeof(optval),
5175 sizeof(optval));
5176 if (error != 0) {
39236c6e 5177 goto out;
0a7de745
A
5178 }
5179 if (optval != 0) {
2d21ac55 5180 so->so_flags |= SOF_NOSIGPIPE;
0a7de745 5181 } else {
2d21ac55 5182 so->so_flags &= ~SOF_NOSIGPIPE;
0a7de745 5183 }
9bccf70c
A
5184 break;
5185
55e303ae 5186 case SO_NOADDRERR:
0a7de745
A
5187 error = sooptcopyin(sopt, &optval, sizeof(optval),
5188 sizeof(optval));
5189 if (error != 0) {
39236c6e 5190 goto out;
0a7de745
A
5191 }
5192 if (optval != 0) {
2d21ac55 5193 so->so_flags |= SOF_NOADDRAVAIL;
0a7de745 5194 } else {
2d21ac55 5195 so->so_flags &= ~SOF_NOADDRAVAIL;
0a7de745 5196 }
2d21ac55
A
5197 break;
5198
5199 case SO_REUSESHAREUID:
0a7de745
A
5200 error = sooptcopyin(sopt, &optval, sizeof(optval),
5201 sizeof(optval));
5202 if (error != 0) {
39236c6e 5203 goto out;
0a7de745
A
5204 }
5205 if (optval != 0) {
2d21ac55 5206 so->so_flags |= SOF_REUSESHAREUID;
0a7de745 5207 } else {
2d21ac55 5208 so->so_flags &= ~SOF_REUSESHAREUID;
0a7de745 5209 }
2d21ac55 5210 break;
39236c6e 5211
2d21ac55
A
5212 case SO_NOTIFYCONFLICT:
5213 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5214 error = EPERM;
39236c6e 5215 goto out;
2d21ac55 5216 }
0a7de745
A
5217 error = sooptcopyin(sopt, &optval, sizeof(optval),
5218 sizeof(optval));
5219 if (error != 0) {
39236c6e 5220 goto out;
0a7de745
A
5221 }
5222 if (optval != 0) {
2d21ac55 5223 so->so_flags |= SOF_NOTIFYCONFLICT;
0a7de745 5224 } else {
2d21ac55 5225 so->so_flags &= ~SOF_NOTIFYCONFLICT;
0a7de745 5226 }
2d21ac55 5227 break;
39236c6e 5228
2d21ac55 5229 case SO_RESTRICTIONS:
0a7de745
A
5230 error = sooptcopyin(sopt, &optval, sizeof(optval),
5231 sizeof(optval));
5232 if (error != 0) {
39236c6e 5233 goto out;
0a7de745 5234 }
39236c6e
A
5235
5236 error = so_set_restrictions(so, optval);
2d21ac55
A
5237 break;
5238
fe8ab488
A
5239 case SO_AWDL_UNRESTRICTED:
5240 if (SOCK_DOM(so) != PF_INET &&
5241 SOCK_DOM(so) != PF_INET6) {
5242 error = EOPNOTSUPP;
5243 goto out;
5244 }
5245 error = sooptcopyin(sopt, &optval, sizeof(optval),
5246 sizeof(optval));
0a7de745 5247 if (error != 0) {
fe8ab488 5248 goto out;
0a7de745 5249 }
fe8ab488 5250 if (optval != 0) {
39037602 5251 error = soopt_cred_check(so,
cb323159 5252 PRIV_NET_RESTRICTED_AWDL, false, false);
0a7de745 5253 if (error == 0) {
fe8ab488 5254 inp_set_awdl_unrestricted(
0a7de745
A
5255 sotoinpcb(so));
5256 }
5257 } else {
fe8ab488 5258 inp_clear_awdl_unrestricted(sotoinpcb(so));
0a7de745 5259 }
fe8ab488 5260 break;
39037602
A
5261 case SO_INTCOPROC_ALLOW:
5262 if (SOCK_DOM(so) != PF_INET6) {
5263 error = EOPNOTSUPP;
5264 goto out;
5265 }
5266 error = sooptcopyin(sopt, &optval, sizeof(optval),
5267 sizeof(optval));
0a7de745 5268 if (error != 0) {
39037602 5269 goto out;
0a7de745 5270 }
743345f9 5271 if (optval != 0 &&
0a7de745 5272 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
39037602 5273 error = soopt_cred_check(so,
cb323159 5274 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
0a7de745 5275 if (error == 0) {
39037602 5276 inp_set_intcoproc_allowed(
0a7de745
A
5277 sotoinpcb(so));
5278 }
5279 } else if (optval == 0) {
39037602 5280 inp_clear_intcoproc_allowed(sotoinpcb(so));
0a7de745 5281 }
39037602 5282 break;
fe8ab488 5283
2d21ac55
A
5284 case SO_LABEL:
5285#if CONFIG_MACF_SOCKET
0a7de745
A
5286 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
5287 sizeof(extmac))) != 0) {
39236c6e 5288 goto out;
0a7de745 5289 }
2d21ac55
A
5290
5291 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5292 so, &extmac);
5293#else
5294 error = EOPNOTSUPP;
5295#endif /* MAC_SOCKET */
55e303ae
A
5296 break;
5297
4a3eedf9 5298 case SO_UPCALLCLOSEWAIT:
0a7de745
A
5299 error = sooptcopyin(sopt, &optval, sizeof(optval),
5300 sizeof(optval));
5301 if (error != 0) {
39236c6e 5302 goto out;
0a7de745
A
5303 }
5304 if (optval != 0) {
4a3eedf9 5305 so->so_flags |= SOF_UPCALLCLOSEWAIT;
0a7de745 5306 } else {
4a3eedf9 5307 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
0a7de745 5308 }
4a3eedf9 5309 break;
4a3eedf9 5310
b0d623f7 5311 case SO_RANDOMPORT:
0a7de745
A
5312 error = sooptcopyin(sopt, &optval, sizeof(optval),
5313 sizeof(optval));
5314 if (error != 0) {
39236c6e 5315 goto out;
0a7de745
A
5316 }
5317 if (optval != 0) {
b0d623f7 5318 so->so_flags |= SOF_BINDRANDOMPORT;
0a7de745 5319 } else {
b0d623f7 5320 so->so_flags &= ~SOF_BINDRANDOMPORT;
0a7de745 5321 }
b0d623f7
A
5322 break;
5323
5324 case SO_NP_EXTENSIONS: {
5325 struct so_np_extensions sonpx;
5326
0a7de745
A
5327 error = sooptcopyin(sopt, &sonpx, sizeof(sonpx),
5328 sizeof(sonpx));
5329 if (error != 0) {
39236c6e 5330 goto out;
0a7de745 5331 }
b0d623f7
A
5332 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5333 error = EINVAL;
39236c6e 5334 goto out;
b0d623f7
A
5335 }
5336 /*
5337 * Only one bit defined for now
5338 */
5339 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
0a7de745 5340 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
b0d623f7 5341 so->so_flags |= SOF_NPX_SETOPTSHUT;
0a7de745 5342 } else {
b0d623f7 5343 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
0a7de745 5344 }
b0d623f7
A
5345 }
5346 break;
5347 }
5348
d41d1dae 5349 case SO_TRAFFIC_CLASS: {
0a7de745
A
5350 error = sooptcopyin(sopt, &optval, sizeof(optval),
5351 sizeof(optval));
5352 if (error != 0) {
39236c6e 5353 goto out;
0a7de745 5354 }
39037602
A
5355 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5356 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5357 error = so_set_net_service_type(so, netsvc);
5358 goto out;
5359 }
6d2010ae 5360 error = so_set_traffic_class(so, optval);
0a7de745 5361 if (error != 0) {
39236c6e 5362 goto out;
0a7de745 5363 }
39037602
A
5364 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5365 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
6d2010ae 5366 break;
d41d1dae 5367 }
6d2010ae
A
5368
5369 case SO_RECV_TRAFFIC_CLASS: {
0a7de745
A
5370 error = sooptcopyin(sopt, &optval, sizeof(optval),
5371 sizeof(optval));
5372 if (error != 0) {
39236c6e 5373 goto out;
0a7de745
A
5374 }
5375 if (optval == 0) {
6d2010ae 5376 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
0a7de745 5377 } else {
6d2010ae 5378 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
0a7de745 5379 }
6d2010ae
A
5380 break;
5381 }
316670eb 5382
39037602 5383#if (DEVELOPMENT || DEBUG)
6d2010ae
A
5384 case SO_TRAFFIC_CLASS_DBG: {
5385 struct so_tcdbg so_tcdbg;
316670eb
A
5386
5387 error = sooptcopyin(sopt, &so_tcdbg,
0a7de745
A
5388 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5389 if (error != 0) {
39236c6e 5390 goto out;
0a7de745 5391 }
6d2010ae 5392 error = so_set_tcdbg(so, &so_tcdbg);
0a7de745 5393 if (error != 0) {
39236c6e 5394 goto out;
0a7de745 5395 }
6d2010ae
A
5396 break;
5397 }
39037602 5398#endif /* (DEVELOPMENT || DEBUG) */
316670eb
A
5399
5400 case SO_PRIVILEGED_TRAFFIC_CLASS:
5401 error = priv_check_cred(kauth_cred_get(),
5402 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
0a7de745 5403 if (error != 0) {
39236c6e 5404 goto out;
0a7de745
A
5405 }
5406 error = sooptcopyin(sopt, &optval, sizeof(optval),
5407 sizeof(optval));
5408 if (error != 0) {
39236c6e 5409 goto out;
0a7de745
A
5410 }
5411 if (optval == 0) {
316670eb 5412 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
0a7de745 5413 } else {
316670eb 5414 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
0a7de745 5415 }
316670eb
A
5416 break;
5417
a39ff7e2
A
5418#if (DEVELOPMENT || DEBUG)
5419 case SO_DEFUNCTIT:
5420 error = sosetdefunct(current_proc(), so, 0, FALSE);
0a7de745 5421 if (error == 0) {
a39ff7e2 5422 error = sodefunct(current_proc(), so, 0);
0a7de745 5423 }
a39ff7e2
A
5424
5425 break;
5426#endif /* (DEVELOPMENT || DEBUG) */
5427
6d2010ae 5428 case SO_DEFUNCTOK:
0a7de745
A
5429 error = sooptcopyin(sopt, &optval, sizeof(optval),
5430 sizeof(optval));
6d2010ae 5431 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
0a7de745 5432 if (error == 0) {
6d2010ae 5433 error = EBADF;
0a7de745 5434 }
39236c6e 5435 goto out;
6d2010ae
A
5436 }
5437 /*
5438 * Any process can set SO_DEFUNCTOK (clear
5439 * SOF_NODEFUNCT), but only root can clear
5440 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5441 */
5442 if (optval == 0 &&
5443 kauth_cred_issuser(kauth_cred_get()) == 0) {
5444 error = EPERM;
39236c6e 5445 goto out;
6d2010ae 5446 }
0a7de745 5447 if (optval) {
6d2010ae 5448 so->so_flags &= ~SOF_NODEFUNCT;
0a7de745 5449 } else {
6d2010ae 5450 so->so_flags |= SOF_NODEFUNCT;
0a7de745 5451 }
6d2010ae 5452
39236c6e
A
5453 if (SOCK_DOM(so) == PF_INET ||
5454 SOCK_DOM(so) == PF_INET6) {
5455 char s[MAX_IPv6_STR_LEN];
5456 char d[MAX_IPv6_STR_LEN];
5457 struct inpcb *inp = sotoinpcb(so);
5458
39037602
A
5459 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5460 "[%s %s:%d -> %s:%d] is now marked "
5461 "as %seligible for "
39236c6e 5462 "defunct\n", __func__, proc_selfpid(),
39037602 5463 proc_best_name(current_proc()),
3e170ce0 5464 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39236c6e
A
5465 (SOCK_TYPE(so) == SOCK_STREAM) ?
5466 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5467 ((SOCK_DOM(so) == PF_INET) ?
5468 (void *)&inp->inp_laddr.s_addr :
0a7de745 5469 (void *)&inp->in6p_laddr), s, sizeof(s)),
39236c6e
A
5470 ntohs(inp->in6p_lport),
5471 inet_ntop(SOCK_DOM(so),
5472 (SOCK_DOM(so) == PF_INET) ?
5473 (void *)&inp->inp_faddr.s_addr :
0a7de745 5474 (void *)&inp->in6p_faddr, d, sizeof(d)),
39236c6e
A
5475 ntohs(inp->in6p_fport),
5476 (so->so_flags & SOF_NODEFUNCT) ?
39037602 5477 "not " : "");
39236c6e 5478 } else {
39037602
A
5479 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5480 "is now marked as %seligible for "
5481 "defunct\n",
39236c6e 5482 __func__, proc_selfpid(),
39037602 5483 proc_best_name(current_proc()),
3e170ce0 5484 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39236c6e
A
5485 SOCK_DOM(so), SOCK_TYPE(so),
5486 (so->so_flags & SOF_NODEFUNCT) ?
39037602 5487 "not " : "");
39236c6e 5488 }
6d2010ae
A
5489 break;
5490
5491 case SO_ISDEFUNCT:
5492 /* This option is not settable */
5493 error = EINVAL;
5494 break;
d41d1dae 5495
316670eb 5496 case SO_OPPORTUNISTIC:
0a7de745
A
5497 error = sooptcopyin(sopt, &optval, sizeof(optval),
5498 sizeof(optval));
5499 if (error == 0) {
316670eb 5500 error = so_set_opportunistic(so, optval);
0a7de745 5501 }
316670eb
A
5502 break;
5503
5504 case SO_FLUSH:
5505 /* This option is handled by lower layer(s) */
5506 error = 0;
5507 break;
5508
5509 case SO_RECV_ANYIF:
0a7de745
A
5510 error = sooptcopyin(sopt, &optval, sizeof(optval),
5511 sizeof(optval));
5512 if (error == 0) {
316670eb 5513 error = so_set_recv_anyif(so, optval);
0a7de745 5514 }
316670eb
A
5515 break;
5516
39236c6e
A
5517 case SO_TRAFFIC_MGT_BACKGROUND: {
5518 /* This option is handled by lower layer(s) */
5519 error = 0;
5520 break;
5521 }
5522
5523#if FLOW_DIVERT
5524 case SO_FLOW_DIVERT_TOKEN:
5525 error = flow_divert_token_set(so, sopt);
5526 break;
0a7de745 5527#endif /* FLOW_DIVERT */
39236c6e
A
5528
5529
5530 case SO_DELEGATED:
0a7de745
A
5531 if ((error = sooptcopyin(sopt, &optval, sizeof(optval),
5532 sizeof(optval))) != 0) {
39236c6e 5533 break;
0a7de745 5534 }
39236c6e 5535
cb323159 5536 error = so_set_effective_pid(so, optval, sopt->sopt_p, true);
39236c6e
A
5537 break;
5538
5539 case SO_DELEGATED_UUID: {
5540 uuid_t euuid;
5541
0a7de745
A
5542 if ((error = sooptcopyin(sopt, &euuid, sizeof(euuid),
5543 sizeof(euuid))) != 0) {
39236c6e 5544 break;
0a7de745 5545 }
39236c6e 5546
cb323159 5547 error = so_set_effective_uuid(so, euuid, sopt->sopt_p, true);
39236c6e
A
5548 break;
5549 }
3e170ce0 5550
fe8ab488
A
5551#if NECP
5552 case SO_NECP_ATTRIBUTES:
5553 error = necp_set_socket_attributes(so, sopt);
5554 break;
fe8ab488 5555
cb323159 5556 case SO_NECP_CLIENTUUID: {
5ba3f43e
A
5557 if (SOCK_DOM(so) == PF_MULTIPATH) {
5558 /* Handled by MPTCP itself */
fe8ab488
A
5559 break;
5560 }
5561
5ba3f43e
A
5562 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5563 error = EINVAL;
fe8ab488 5564 goto out;
5ba3f43e
A
5565 }
5566
5567 struct inpcb *inp = sotoinpcb(so);
5568 if (!uuid_is_null(inp->necp_client_uuid)) {
5569 // Clear out the old client UUID if present
5570 necp_inpcb_remove_cb(inp);
5571 }
5572
5573 error = sooptcopyin(sopt, &inp->necp_client_uuid,
0a7de745 5574 sizeof(uuid_t), sizeof(uuid_t));
5ba3f43e
A
5575 if (error != 0) {
5576 goto out;
5577 }
5578
5579 if (uuid_is_null(inp->necp_client_uuid)) {
5580 error = EINVAL;
5581 goto out;
5582 }
5583
cb323159
A
5584 pid_t current_pid = proc_pid(current_proc());
5585 error = necp_client_register_socket_flow(current_pid,
5ba3f43e
A
5586 inp->necp_client_uuid, inp);
5587 if (error != 0) {
5588 uuid_clear(inp->necp_client_uuid);
5589 goto out;
5590 }
5591
5592 if (inp->inp_lport != 0) {
cb323159 5593 // There is a bound local port, so this is not
5ba3f43e 5594 // a fresh socket. Assign to the client.
cb323159 5595 necp_client_assign_from_socket(current_pid, inp->necp_client_uuid, inp);
5ba3f43e
A
5596 }
5597
fe8ab488 5598 break;
cb323159
A
5599 }
5600 case SO_NECP_LISTENUUID: {
5601 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5602 error = EINVAL;
5603 goto out;
5604 }
5605
5606 struct inpcb *inp = sotoinpcb(so);
5607 if (!uuid_is_null(inp->necp_client_uuid)) {
5608 error = EINVAL;
5609 goto out;
5610 }
5611
5612 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5613 sizeof(uuid_t), sizeof(uuid_t));
5614 if (error != 0) {
5615 goto out;
5616 }
5617
5618 if (uuid_is_null(inp->necp_client_uuid)) {
5619 error = EINVAL;
5620 goto out;
5621 }
5622
5623 error = necp_client_register_socket_listener(proc_pid(current_proc()),
5624 inp->necp_client_uuid, inp);
5625 if (error != 0) {
5626 uuid_clear(inp->necp_client_uuid);
5627 goto out;
5628 }
5629
5630 // Mark that the port registration is held by NECP
5631 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5632
5633 break;
5634 }
5ba3f43e 5635#endif /* NECP */
39236c6e 5636
3e170ce0 5637 case SO_EXTENDED_BK_IDLE:
0a7de745
A
5638 error = sooptcopyin(sopt, &optval, sizeof(optval),
5639 sizeof(optval));
5640 if (error == 0) {
3e170ce0 5641 error = so_set_extended_bk_idle(so, optval);
0a7de745 5642 }
3e170ce0
A
5643 break;
5644
490019cf
A
5645 case SO_MARK_CELLFALLBACK:
5646 error = sooptcopyin(sopt, &optval, sizeof(optval),
5647 sizeof(optval));
0a7de745 5648 if (error != 0) {
490019cf 5649 goto out;
0a7de745 5650 }
490019cf
A
5651 if (optval < 0) {
5652 error = EINVAL;
5653 goto out;
5654 }
0a7de745 5655 if (optval == 0) {
490019cf 5656 so->so_flags1 &= ~SOF1_CELLFALLBACK;
0a7de745 5657 } else {
490019cf 5658 so->so_flags1 |= SOF1_CELLFALLBACK;
0a7de745 5659 }
490019cf 5660 break;
39037602 5661
cb323159
A
5662 case SO_STATISTICS_EVENT:
5663 error = sooptcopyin(sopt, &long_optval,
5664 sizeof(long_optval), sizeof(long_optval));
5665 if (error != 0) {
5666 goto out;
5667 }
5668 u_int64_t nstat_event = 0;
5669 error = so_statistics_event_to_nstat_event(
5670 &long_optval, &nstat_event);
5671 if (error != 0) {
5672 goto out;
5673 }
5674 nstat_pcb_event(sotoinpcb(so), nstat_event);
5675 break;
5676
39037602
A
5677 case SO_NET_SERVICE_TYPE: {
5678 error = sooptcopyin(sopt, &optval, sizeof(optval),
5679 sizeof(optval));
0a7de745 5680 if (error != 0) {
39037602 5681 goto out;
0a7de745 5682 }
39037602
A
5683 error = so_set_net_service_type(so, optval);
5684 break;
5685 }
5686
5687 case SO_QOSMARKING_POLICY_OVERRIDE:
5688 error = priv_check_cred(kauth_cred_get(),
5689 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
0a7de745 5690 if (error != 0) {
39037602 5691 goto out;
0a7de745 5692 }
39037602
A
5693 error = sooptcopyin(sopt, &optval, sizeof(optval),
5694 sizeof(optval));
0a7de745 5695 if (error != 0) {
39037602 5696 goto out;
0a7de745
A
5697 }
5698 if (optval == 0) {
39037602 5699 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
0a7de745 5700 } else {
39037602 5701 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
0a7de745 5702 }
39037602
A
5703 break;
5704
cb323159
A
5705 case SO_MPKL_SEND_INFO: {
5706 struct so_mpkl_send_info so_mpkl_send_info;
5707
5708 error = sooptcopyin(sopt, &so_mpkl_send_info,
5709 sizeof(struct so_mpkl_send_info), sizeof(struct so_mpkl_send_info));
5710 if (error != 0) {
5711 goto out;
5712 }
5713 uuid_copy(so->so_mpkl_send_uuid, so_mpkl_send_info.mpkl_uuid);
5714 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5715
5716 if (uuid_is_null(so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5717 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5718 } else {
5719 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5720 }
5721 break;
5722 }
1c79356b
A
5723 default:
5724 error = ENOPROTOOPT;
5725 break;
5726 }
39236c6e
A
5727 if (error == 0 && so->so_proto != NULL &&
5728 so->so_proto->pr_ctloutput != NULL) {
5729 (void) so->so_proto->pr_ctloutput(so, sopt);
1c79356b
A
5730 }
5731 }
39236c6e 5732out:
0a7de745 5733 if (dolock) {
39236c6e 5734 socket_unlock(so, 1);
0a7de745
A
5735 }
5736 return error;
1c79356b
A
5737}
5738
2d21ac55 5739/* Helper routines for getsockopt */
1c79356b 5740int
2d21ac55 5741sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
1c79356b 5742{
0a7de745
A
5743 int error;
5744 size_t valsize;
1c79356b
A
5745
5746 error = 0;
5747
5748 /*
5749 * Documented get behavior is that we always return a value,
5750 * possibly truncated to fit in the user's buffer.
5751 * Traditional behavior is that we always tell the user
5752 * precisely how much we copied, rather than something useful
5753 * like the total amount we had available for her.
5754 * Note that this interface is not idempotent; the entire answer must
5755 * generated ahead of time.
5756 */
5757 valsize = min(len, sopt->sopt_valsize);
5758 sopt->sopt_valsize = valsize;
91447636 5759 if (sopt->sopt_val != USER_ADDR_NULL) {
0a7de745 5760 if (sopt->sopt_p != kernproc) {
1c79356b 5761 error = copyout(buf, sopt->sopt_val, valsize);
0a7de745 5762 } else {
91447636 5763 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
0a7de745 5764 }
1c79356b 5765 }
0a7de745 5766 return error;
2d21ac55
A
5767}
5768
5769static int
39236c6e 5770sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
2d21ac55 5771{
0a7de745
A
5772 int error;
5773 size_t len;
5774 struct user64_timeval tv64 = {};
5775 struct user32_timeval tv32 = {};
5776 const void * val;
5777 size_t valsize;
b0d623f7 5778
2d21ac55
A
5779 error = 0;
5780 if (proc_is64bit(sopt->sopt_p)) {
0a7de745 5781 len = sizeof(tv64);
2d21ac55
A
5782 tv64.tv_sec = tv_p->tv_sec;
5783 tv64.tv_usec = tv_p->tv_usec;
5784 val = &tv64;
5785 } else {
0a7de745 5786 len = sizeof(tv32);
b0d623f7
A
5787 tv32.tv_sec = tv_p->tv_sec;
5788 tv32.tv_usec = tv_p->tv_usec;
5789 val = &tv32;
2d21ac55
A
5790 }
5791 valsize = min(len, sopt->sopt_valsize);
5792 sopt->sopt_valsize = valsize;
5793 if (sopt->sopt_val != USER_ADDR_NULL) {
0a7de745 5794 if (sopt->sopt_p != kernproc) {
2d21ac55 5795 error = copyout(val, sopt->sopt_val, valsize);
0a7de745 5796 } else {
2d21ac55 5797 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
0a7de745 5798 }
2d21ac55 5799 }
0a7de745 5800 return error;
1c79356b
A
5801}
5802
2d21ac55
A
5803/*
5804 * Return: 0 Success
5805 * ENOPROTOOPT
5806 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5807 * <pr_ctloutput>:???
5808 * <sf_getoption>:???
5809 */
1c79356b 5810int
39236c6e 5811sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
1c79356b 5812{
0a7de745
A
5813 int error, optval;
5814 struct linger l;
5815 struct timeval tv;
2d21ac55
A
5816#if CONFIG_MACF_SOCKET
5817 struct mac extmac;
5818#endif /* MAC_SOCKET */
1c79356b 5819
0a7de745 5820 if (sopt->sopt_dir != SOPT_GET) {
2d21ac55 5821 sopt->sopt_dir = SOPT_GET;
0a7de745 5822 }
9bccf70c 5823
0a7de745 5824 if (dolock) {
39236c6e 5825 socket_lock(so, 1);
0a7de745 5826 }
2d21ac55 5827
6d2010ae 5828 error = sflt_getsockopt(so, sopt);
39236c6e 5829 if (error != 0) {
0a7de745 5830 if (error == EJUSTRETURN) {
6d2010ae 5831 error = 0;
0a7de745 5832 }
39236c6e 5833 goto out;
1c79356b 5834 }
39236c6e 5835
1c79356b 5836 if (sopt->sopt_level != SOL_SOCKET) {
39236c6e
A
5837 if (so->so_proto != NULL &&
5838 so->so_proto->pr_ctloutput != NULL) {
2d21ac55 5839 error = (*so->so_proto->pr_ctloutput)(so, sopt);
39236c6e 5840 goto out;
91447636 5841 }
39236c6e 5842 error = ENOPROTOOPT;
1c79356b 5843 } else {
39236c6e
A
5844 /*
5845 * Allow socket-level (SOL_SOCKET) options to be filtered by
5846 * the protocol layer, if needed. A zero value returned from
5847 * the handler means use default socket-level processing as
5848 * done by the rest of this routine. Otherwise, any other
5849 * return value indicates that the option is unsupported.
5850 */
5851 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
0a7de745 5852 pru_socheckopt(so, sopt)) != 0) {
39236c6e 5853 goto out;
0a7de745 5854 }
39236c6e
A
5855
5856 error = 0;
1c79356b
A
5857 switch (sopt->sopt_name) {
5858 case SO_LINGER:
91447636 5859 case SO_LINGER_SEC:
39236c6e 5860 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
2d21ac55
A
5861 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5862 so->so_linger : so->so_linger / hz;
0a7de745 5863 error = sooptcopyout(sopt, &l, sizeof(l));
1c79356b
A
5864 break;
5865
5866 case SO_USELOOPBACK:
5867 case SO_DONTROUTE:
5868 case SO_DEBUG:
5869 case SO_KEEPALIVE:
5870 case SO_REUSEADDR:
5871 case SO_REUSEPORT:
5872 case SO_BROADCAST:
5873 case SO_OOBINLINE:
5874 case SO_TIMESTAMP:
6d2010ae 5875 case SO_TIMESTAMP_MONOTONIC:
d9a64523 5876 case SO_TIMESTAMP_CONTINUOUS:
1c79356b
A
5877 case SO_DONTTRUNC:
5878 case SO_WANTMORE:
9bccf70c 5879 case SO_WANTOOBFLAG:
fe8ab488 5880 case SO_NOWAKEFROMSLEEP:
39037602 5881 case SO_NOAPNFALLBK:
1c79356b
A
5882 optval = so->so_options & sopt->sopt_name;
5883integer:
0a7de745 5884 error = sooptcopyout(sopt, &optval, sizeof(optval));
1c79356b
A
5885 break;
5886
5887 case SO_TYPE:
5888 optval = so->so_type;
5889 goto integer;
5890
5891 case SO_NREAD:
2d21ac55
A
5892 if (so->so_proto->pr_flags & PR_ATOMIC) {
5893 int pkt_total;
5894 struct mbuf *m1;
1c79356b 5895
2d21ac55
A
5896 pkt_total = 0;
5897 m1 = so->so_rcv.sb_mb;
39236c6e
A
5898 while (m1 != NULL) {
5899 if (m1->m_type == MT_DATA ||
5900 m1->m_type == MT_HEADER ||
0a7de745 5901 m1->m_type == MT_OOBDATA) {
1c79356b 5902 pkt_total += m1->m_len;
0a7de745 5903 }
1c79356b
A
5904 m1 = m1->m_next;
5905 }
5906 optval = pkt_total;
2d21ac55
A
5907 } else {
5908 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5909 }
1c79356b 5910 goto integer;
39236c6e 5911
fe8ab488
A
5912 case SO_NUMRCVPKT:
5913 if (so->so_proto->pr_flags & PR_ATOMIC) {
5914 int cnt = 0;
5915 struct mbuf *m1;
5916
5917 m1 = so->so_rcv.sb_mb;
5918 while (m1 != NULL) {
cb323159 5919 cnt += 1;
fe8ab488
A
5920 m1 = m1->m_nextpkt;
5921 }
5922 optval = cnt;
5923 goto integer;
5924 } else {
cb323159 5925 error = ENOPROTOOPT;
fe8ab488
A
5926 break;
5927 }
5928
91447636
A
5929 case SO_NWRITE:
5930 optval = so->so_snd.sb_cc;
2d21ac55 5931 goto integer;
39236c6e 5932
1c79356b
A
5933 case SO_ERROR:
5934 optval = so->so_error;
5935 so->so_error = 0;
5936 goto integer;
5937
fe8ab488
A
5938 case SO_SNDBUF: {
5939 u_int32_t hiwat = so->so_snd.sb_hiwat;
1c79356b 5940
fe8ab488
A
5941 if (so->so_snd.sb_flags & SB_UNIX) {
5942 struct unpcb *unp =
5943 (struct unpcb *)(so->so_pcb);
5944 if (unp != NULL && unp->unp_conn != NULL) {
5945 hiwat += unp->unp_conn->unp_cc;
5946 }
5947 }
5948
5949 optval = hiwat;
5950 goto integer;
5951 }
1c79356b
A
5952 case SO_RCVBUF:
5953 optval = so->so_rcv.sb_hiwat;
5954 goto integer;
5955
5956 case SO_SNDLOWAT:
5957 optval = so->so_snd.sb_lowat;
5958 goto integer;
5959
5960 case SO_RCVLOWAT:
5961 optval = so->so_rcv.sb_lowat;
5962 goto integer;
5963
5964 case SO_SNDTIMEO:
5965 case SO_RCVTIMEO:
91447636 5966 tv = (sopt->sopt_name == SO_SNDTIMEO ?
2d21ac55 5967 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1c79356b 5968
2d21ac55
A
5969 error = sooptcopyout_timeval(sopt, &tv);
5970 break;
1c79356b 5971
91447636
A
5972 case SO_NOSIGPIPE:
5973 optval = (so->so_flags & SOF_NOSIGPIPE);
5974 goto integer;
9bccf70c 5975
55e303ae 5976 case SO_NOADDRERR:
91447636
A
5977 optval = (so->so_flags & SOF_NOADDRAVAIL);
5978 goto integer;
55e303ae 5979
2d21ac55
A
5980 case SO_REUSESHAREUID:
5981 optval = (so->so_flags & SOF_REUSESHAREUID);
5982 goto integer;
5983
39236c6e 5984
2d21ac55
A
5985 case SO_NOTIFYCONFLICT:
5986 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5987 goto integer;
39236c6e 5988
2d21ac55 5989 case SO_RESTRICTIONS:
39236c6e 5990 optval = so_get_restrictions(so);
2d21ac55
A
5991 goto integer;
5992
fe8ab488 5993 case SO_AWDL_UNRESTRICTED:
3e170ce0 5994 if (SOCK_DOM(so) == PF_INET ||
fe8ab488
A
5995 SOCK_DOM(so) == PF_INET6) {
5996 optval = inp_get_awdl_unrestricted(
0a7de745 5997 sotoinpcb(so));
fe8ab488 5998 goto integer;
0a7de745 5999 } else {
fe8ab488 6000 error = EOPNOTSUPP;
0a7de745 6001 }
fe8ab488
A
6002 break;
6003
39037602
A
6004 case SO_INTCOPROC_ALLOW:
6005 if (SOCK_DOM(so) == PF_INET6) {
6006 optval = inp_get_intcoproc_allowed(
0a7de745 6007 sotoinpcb(so));
39037602 6008 goto integer;
0a7de745 6009 } else {
39037602 6010 error = EOPNOTSUPP;
0a7de745 6011 }
39037602
A
6012 break;
6013
2d21ac55
A
6014 case SO_LABEL:
6015#if CONFIG_MACF_SOCKET
0a7de745
A
6016 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6017 sizeof(extmac))) != 0 ||
2d21ac55 6018 (error = mac_socket_label_get(proc_ucred(
0a7de745 6019 sopt->sopt_p), so, &extmac)) != 0) {
2d21ac55 6020 break;
0a7de745 6021 }
2d21ac55 6022
0a7de745 6023 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
2d21ac55
A
6024#else
6025 error = EOPNOTSUPP;
6026#endif /* MAC_SOCKET */
6027 break;
6028
6029 case SO_PEERLABEL:
6030#if CONFIG_MACF_SOCKET
0a7de745
A
6031 if ((error = sooptcopyin(sopt, &extmac, sizeof(extmac),
6032 sizeof(extmac))) != 0 ||
2d21ac55 6033 (error = mac_socketpeer_label_get(proc_ucred(
0a7de745 6034 sopt->sopt_p), so, &extmac)) != 0) {
2d21ac55 6035 break;
0a7de745 6036 }
2d21ac55 6037
0a7de745 6038 error = sooptcopyout(sopt, &extmac, sizeof(extmac));
2d21ac55
A
6039#else
6040 error = EOPNOTSUPP;
6041#endif /* MAC_SOCKET */
6042 break;
6043
4a3eedf9
A
6044#ifdef __APPLE_API_PRIVATE
6045 case SO_UPCALLCLOSEWAIT:
6046 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6047 goto integer;
6048#endif
b0d623f7
A
6049 case SO_RANDOMPORT:
6050 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6051 goto integer;
6052
6053 case SO_NP_EXTENSIONS: {
527f9951 6054 struct so_np_extensions sonpx = {};
b0d623f7 6055
39236c6e
A
6056 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6057 SONPX_SETOPTSHUT : 0;
b0d623f7 6058 sonpx.npx_mask = SONPX_MASK_VALID;
4a3eedf9 6059
39236c6e 6060 error = sooptcopyout(sopt, &sonpx,
0a7de745 6061 sizeof(struct so_np_extensions));
39236c6e 6062 break;
b0d623f7 6063 }
6d2010ae 6064
d41d1dae
A
6065 case SO_TRAFFIC_CLASS:
6066 optval = so->so_traffic_class;
6067 goto integer;
316670eb 6068
6d2010ae
A
6069 case SO_RECV_TRAFFIC_CLASS:
6070 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6071 goto integer;
6072
6073 case SO_TRAFFIC_CLASS_STATS:
39236c6e 6074 error = sooptcopyout(sopt, &so->so_tc_stats,
0a7de745 6075 sizeof(so->so_tc_stats));
316670eb 6076 break;
6d2010ae 6077
39037602 6078#if (DEVELOPMENT || DEBUG)
39236c6e 6079 case SO_TRAFFIC_CLASS_DBG:
6d2010ae
A
6080 error = sogetopt_tcdbg(so, sopt);
6081 break;
39037602 6082#endif /* (DEVELOPMENT || DEBUG) */
316670eb
A
6083
6084 case SO_PRIVILEGED_TRAFFIC_CLASS:
6085 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6086 goto integer;
6087
6d2010ae
A
6088 case SO_DEFUNCTOK:
6089 optval = !(so->so_flags & SOF_NODEFUNCT);
6090 goto integer;
6091
6092 case SO_ISDEFUNCT:
6093 optval = (so->so_flags & SOF_DEFUNCT);
6094 goto integer;
d41d1dae 6095
316670eb
A
6096 case SO_OPPORTUNISTIC:
6097 optval = so_get_opportunistic(so);
6098 goto integer;
6099
6100 case SO_FLUSH:
6101 /* This option is not gettable */
6102 error = EINVAL;
6103 break;
6104
6105 case SO_RECV_ANYIF:
6106 optval = so_get_recv_anyif(so);
6107 goto integer;
6108
39236c6e
A
6109 case SO_TRAFFIC_MGT_BACKGROUND:
6110 /* This option is handled by lower layer(s) */
6111 if (so->so_proto != NULL &&
6112 so->so_proto->pr_ctloutput != NULL) {
6113 (void) so->so_proto->pr_ctloutput(so, sopt);
6114 }
6115 break;
6116
6117#if FLOW_DIVERT
6118 case SO_FLOW_DIVERT_TOKEN:
6119 error = flow_divert_token_get(so, sopt);
6120 break;
0a7de745 6121#endif /* FLOW_DIVERT */
3e170ce0 6122
fe8ab488
A
6123#if NECP
6124 case SO_NECP_ATTRIBUTES:
6125 error = necp_get_socket_attributes(so, sopt);
6126 break;
5ba3f43e 6127
cb323159 6128 case SO_NECP_CLIENTUUID: {
5ba3f43e
A
6129 uuid_t *ncu;
6130
6131 if (SOCK_DOM(so) == PF_MULTIPATH) {
6132 ncu = &mpsotomppcb(so)->necp_client_uuid;
6133 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6134 ncu = &sotoinpcb(so)->necp_client_uuid;
6135 } else {
6136 error = EINVAL;
6137 goto out;
6138 }
6139
6140 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
6141 break;
6142 }
cb323159
A
6143
6144 case SO_NECP_LISTENUUID: {
6145 uuid_t *nlu;
6146
6147 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6148 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6149 nlu = &sotoinpcb(so)->necp_client_uuid;
6150 } else {
6151 error = ENOENT;
6152 goto out;
6153 }
6154 } else {
6155 error = EINVAL;
6156 goto out;
6157 }
6158
6159 error = sooptcopyout(sopt, nlu, sizeof(uuid_t));
6160 break;
6161 }
fe8ab488
A
6162#endif /* NECP */
6163
6164#if CONTENT_FILTER
6165 case SO_CFIL_SOCK_ID: {
6166 cfil_sock_id_t sock_id;
6167
6168 sock_id = cfil_sock_id_from_socket(so);
6169
3e170ce0 6170 error = sooptcopyout(sopt, &sock_id,
0a7de745 6171 sizeof(cfil_sock_id_t));
fe8ab488
A
6172 break;
6173 }
0a7de745 6174#endif /* CONTENT_FILTER */
fe8ab488 6175
3e170ce0
A
6176 case SO_EXTENDED_BK_IDLE:
6177 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6178 goto integer;
490019cf
A
6179 case SO_MARK_CELLFALLBACK:
6180 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6181 ? 1 : 0;
6182 goto integer;
39037602 6183 case SO_NET_SERVICE_TYPE: {
0a7de745 6184 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
39037602 6185 optval = so->so_netsvctype;
0a7de745 6186 } else {
39037602 6187 optval = NET_SERVICE_TYPE_BE;
0a7de745 6188 }
39037602
A
6189 goto integer;
6190 }
6191 case SO_NETSVC_MARKING_LEVEL:
6192 optval = so_get_netsvc_marking_level(so);
6193 goto integer;
6194
cb323159
A
6195 case SO_MPKL_SEND_INFO: {
6196 struct so_mpkl_send_info so_mpkl_send_info;
6197
6198 uuid_copy(so_mpkl_send_info.mpkl_uuid, so->so_mpkl_send_uuid);
6199 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6200 error = sooptcopyout(sopt, &so_mpkl_send_info,
6201 sizeof(struct so_mpkl_send_info));
6202 break;
6203 }
1c79356b
A
6204 default:
6205 error = ENOPROTOOPT;
6206 break;
6207 }
1c79356b 6208 }
39236c6e 6209out:
0a7de745 6210 if (dolock) {
39236c6e 6211 socket_unlock(so, 1);
0a7de745
A
6212 }
6213 return error;
1c79356b 6214}
39236c6e
A
6215
6216/*
6217 * The size limits on our soopt_getm is different from that on FreeBSD.
6d2010ae
A
6218 * We limit the size of options to MCLBYTES. This will have to change
6219 * if we need to define options that need more space than MCLBYTES.
6220 */
1c79356b 6221int
9bccf70c 6222soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1c79356b
A
6223{
6224 struct mbuf *m, *m_prev;
6225 int sopt_size = sopt->sopt_valsize;
b0d623f7 6226 int how;
1c79356b 6227
0a7de745
A
6228 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6229 return EMSGSIZE;
6230 }
a3d08fcd 6231
b0d623f7
A
6232 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6233 MGET(m, how, MT_DATA);
0a7de745
A
6234 if (m == NULL) {
6235 return ENOBUFS;
6236 }
1c79356b 6237 if (sopt_size > MLEN) {
b0d623f7 6238 MCLGET(m, how);
1c79356b
A
6239 if ((m->m_flags & M_EXT) == 0) {
6240 m_free(m);
0a7de745 6241 return ENOBUFS;
1c79356b
A
6242 }
6243 m->m_len = min(MCLBYTES, sopt_size);
6244 } else {
6245 m->m_len = min(MLEN, sopt_size);
6246 }
6247 sopt_size -= m->m_len;
6248 *mp = m;
6249 m_prev = m;
6250
6d2010ae 6251 while (sopt_size > 0) {
b0d623f7 6252 MGET(m, how, MT_DATA);
39236c6e 6253 if (m == NULL) {
1c79356b 6254 m_freem(*mp);
0a7de745 6255 return ENOBUFS;
1c79356b
A
6256 }
6257 if (sopt_size > MLEN) {
b0d623f7 6258 MCLGET(m, how);
1c79356b
A
6259 if ((m->m_flags & M_EXT) == 0) {
6260 m_freem(*mp);
6d2010ae 6261 m_freem(m);
0a7de745 6262 return ENOBUFS;
1c79356b
A
6263 }
6264 m->m_len = min(MCLBYTES, sopt_size);
6265 } else {
6266 m->m_len = min(MLEN, sopt_size);
6267 }
6268 sopt_size -= m->m_len;
6269 m_prev->m_next = m;
6270 m_prev = m;
6271 }
0a7de745 6272 return 0;
1c79356b
A
6273}
6274
6d2010ae 6275/* copyin sopt data into mbuf chain */
1c79356b 6276int
9bccf70c 6277soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
1c79356b
A
6278{
6279 struct mbuf *m0 = m;
6280
0a7de745
A
6281 if (sopt->sopt_val == USER_ADDR_NULL) {
6282 return 0;
6283 }
1c79356b 6284 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
b0d623f7 6285 if (sopt->sopt_p != kernproc) {
1c79356b
A
6286 int error;
6287
2d21ac55
A
6288 error = copyin(sopt->sopt_val, mtod(m, char *),
6289 m->m_len);
1c79356b
A
6290 if (error != 0) {
6291 m_freem(m0);
0a7de745 6292 return error;
1c79356b 6293 }
2d21ac55
A
6294 } else {
6295 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6296 mtod(m, char *), m->m_len);
6297 }
1c79356b 6298 sopt->sopt_valsize -= m->m_len;
2d21ac55 6299 sopt->sopt_val += m->m_len;
1c79356b
A
6300 m = m->m_next;
6301 }
39236c6e
A
6302 /* should be allocated enoughly at ip6_sooptmcopyin() */
6303 if (m != NULL) {
9bccf70c 6304 panic("soopt_mcopyin");
39236c6e
A
6305 /* NOTREACHED */
6306 }
0a7de745 6307 return 0;
1c79356b
A
6308}
6309
6d2010ae 6310/* copyout mbuf chain data into soopt */
1c79356b 6311int
9bccf70c 6312soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
1c79356b
A
6313{
6314 struct mbuf *m0 = m;
6315 size_t valsize = 0;
6316
0a7de745
A
6317 if (sopt->sopt_val == USER_ADDR_NULL) {
6318 return 0;
6319 }
1c79356b 6320 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
b0d623f7 6321 if (sopt->sopt_p != kernproc) {
1c79356b
A
6322 int error;
6323
2d21ac55
A
6324 error = copyout(mtod(m, char *), sopt->sopt_val,
6325 m->m_len);
1c79356b
A
6326 if (error != 0) {
6327 m_freem(m0);
0a7de745 6328 return error;
1c79356b 6329 }
2d21ac55
A
6330 } else {
6331 bcopy(mtod(m, char *),
6332 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
6333 }
6334 sopt->sopt_valsize -= m->m_len;
6335 sopt->sopt_val += m->m_len;
6336 valsize += m->m_len;
6337 m = m->m_next;
1c79356b
A
6338 }
6339 if (m != NULL) {
6340 /* enough soopt buffer should be given from user-land */
6341 m_freem(m0);
0a7de745 6342 return EINVAL;
1c79356b
A
6343 }
6344 sopt->sopt_valsize = valsize;
0a7de745 6345 return 0;
1c79356b
A
6346}
6347
9bccf70c 6348void
2d21ac55 6349sohasoutofband(struct socket *so)
9bccf70c 6350{
0a7de745 6351 if (so->so_pgid < 0) {
9bccf70c 6352 gsignal(-so->so_pgid, SIGURG);
0a7de745 6353 } else if (so->so_pgid > 0) {
2d21ac55 6354 proc_signal(so->so_pgid, SIGURG);
0a7de745 6355 }
9bccf70c 6356 selwakeup(&so->so_rcv.sb_sel);
39037602
A
6357 if (so->so_rcv.sb_flags & SB_KNOTE) {
6358 KNOTE(&so->so_rcv.sb_sel.si_note,
6359 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6360 }
9bccf70c
A
6361}
6362
6363int
39236c6e 6364sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
9bccf70c 6365{
39236c6e 6366#pragma unused(cred)
9bccf70c
A
6367 struct proc *p = current_proc();
6368 int revents = 0;
91447636
A
6369
6370 socket_lock(so, 1);
39236c6e
A
6371 so_update_last_owner_locked(so, PROC_NULL);
6372 so_update_policy(so);
9bccf70c 6373
0a7de745
A
6374 if (events & (POLLIN | POLLRDNORM)) {
6375 if (soreadable(so)) {
9bccf70c 6376 revents |= events & (POLLIN | POLLRDNORM);
0a7de745
A
6377 }
6378 }
9bccf70c 6379
0a7de745
A
6380 if (events & (POLLOUT | POLLWRNORM)) {
6381 if (sowriteable(so)) {
9bccf70c 6382 revents |= events & (POLLOUT | POLLWRNORM);
0a7de745
A
6383 }
6384 }
9bccf70c 6385
0a7de745
A
6386 if (events & (POLLPRI | POLLRDBAND)) {
6387 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
9bccf70c 6388 revents |= events & (POLLPRI | POLLRDBAND);
0a7de745
A
6389 }
6390 }
9bccf70c
A
6391
6392 if (revents == 0) {
6393 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2d21ac55
A
6394 /*
6395 * Darwin sets the flag first,
6396 * BSD calls selrecord first
6397 */
9bccf70c
A
6398 so->so_rcv.sb_flags |= SB_SEL;
6399 selrecord(p, &so->so_rcv.sb_sel, wql);
6400 }
6401
6402 if (events & (POLLOUT | POLLWRNORM)) {
2d21ac55
A
6403 /*
6404 * Darwin sets the flag first,
6405 * BSD calls selrecord first
6406 */
9bccf70c
A
6407 so->so_snd.sb_flags |= SB_SEL;
6408 selrecord(p, &so->so_snd.sb_sel, wql);
6409 }
6410 }
6411
91447636 6412 socket_unlock(so, 1);
0a7de745 6413 return revents;
9bccf70c 6414}
55e303ae 6415
55e303ae 6416int
cb323159 6417soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
55e303ae 6418{
cb323159 6419 struct socket *so = (struct socket *)fp->f_fglob->fg_data;
39037602 6420 int result;
2d21ac55 6421
91447636 6422 socket_lock(so, 1);
39236c6e
A
6423 so_update_last_owner_locked(so, PROC_NULL);
6424 so_update_policy(so);
55e303ae 6425
2d21ac55 6426#if CONFIG_MACF_SOCKET
cb323159
A
6427 proc_t p = knote_get_kq(kn)->kq_p;
6428 if (mac_socket_check_kqfilter(proc_ucred(p), kn, so) != 0) {
2d21ac55 6429 socket_unlock(so, 1);
cb323159 6430 knote_set_error(kn, EPERM);
39037602 6431 return 0;
2d21ac55
A
6432 }
6433#endif /* MAC_SOCKET */
6434
55e303ae
A
6435 switch (kn->kn_filter) {
6436 case EVFILT_READ:
39037602 6437 kn->kn_filtid = EVFILTID_SOREAD;
55e303ae
A
6438 break;
6439 case EVFILT_WRITE:
39037602 6440 kn->kn_filtid = EVFILTID_SOWRITE;
316670eb
A
6441 break;
6442 case EVFILT_SOCK:
39037602
A
6443 kn->kn_filtid = EVFILTID_SCK;
6444 break;
6445 case EVFILT_EXCEPT:
6446 kn->kn_filtid = EVFILTID_SOEXCEPT;
55e303ae
A
6447 break;
6448 default:
91447636 6449 socket_unlock(so, 1);
cb323159 6450 knote_set_error(kn, EINVAL);
39037602 6451 return 0;
316670eb 6452 }
55e303ae 6453
39037602
A
6454 /*
6455 * call the appropriate sub-filter attach
6456 * with the socket still locked
6457 */
5ba3f43e 6458 result = knote_fops(kn)->f_attach(kn, kev);
55e303ae 6459
91447636 6460 socket_unlock(so, 1);
39037602
A
6461
6462 return result;
55e303ae
A
6463}
6464
55e303ae 6465static int
cb323159 6466filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
55e303ae 6467{
cb323159
A
6468 int retval = 0;
6469 int64_t data = 0;
b0d623f7 6470
cb323159 6471 if (so->so_options & SO_ACCEPTCONN) {
39236c6e
A
6472 /*
6473 * Radar 6615193 handle the listen case dynamically
6474 * for kqueue read filter. This allows to call listen()
6475 * after registering the kqueue EVFILT_READ.
b0d623f7
A
6476 */
6477
cb323159
A
6478 retval = !TAILQ_EMPTY(&so->so_comp);
6479 data = so->so_qlen;
6480 goto out;
b0d623f7
A
6481 }
6482
6483 /* socket isn't a listener */
3e170ce0
A
6484 /*
6485 * NOTE_LOWAT specifies new low water mark in data, i.e.
6486 * the bytes of protocol data. We therefore exclude any
6487 * control bytes.
6488 */
cb323159 6489 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3e170ce0 6490
39037602
A
6491 if (kn->kn_sfflags & NOTE_OOB) {
6492 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6493 kn->kn_fflags |= NOTE_OOB;
cb323159
A
6494 data -= so->so_oobmark;
6495 retval = 1;
6496 goto out;
91447636 6497 }
04b8595b 6498 }
3e170ce0 6499
04b8595b 6500 if ((so->so_state & SS_CANTRCVMORE)
fe8ab488 6501#if CONTENT_FILTER
04b8595b 6502 && cfil_sock_data_pending(&so->so_rcv) == 0
fe8ab488 6503#endif /* CONTENT_FILTER */
0a7de745 6504 ) {
04b8595b
A
6505 kn->kn_flags |= EV_EOF;
6506 kn->kn_fflags = so->so_error;
cb323159
A
6507 retval = 1;
6508 goto out;
91447636
A
6509 }
6510
0a7de745 6511 if (so->so_error) { /* temporary udp error */
cb323159
A
6512 retval = 1;
6513 goto out;
91447636
A
6514 }
6515
0a7de745 6516 int64_t lowwat = so->so_rcv.sb_lowat;
3e170ce0
A
6517 /*
6518 * Ensure that when NOTE_LOWAT is used, the derived
6519 * low water mark is bounded by socket's rcv buf's
6520 * high and low water mark values.
6521 */
39236c6e 6522 if (kn->kn_sfflags & NOTE_LOWAT) {
0a7de745 6523 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6d2010ae 6524 lowwat = so->so_rcv.sb_hiwat;
0a7de745 6525 } else if (kn->kn_sdata > lowwat) {
6d2010ae 6526 lowwat = kn->kn_sdata;
0a7de745 6527 }
6d2010ae 6528 }
39236c6e 6529
cb323159 6530 retval = (data >= lowwat);
3e170ce0 6531
cb323159
A
6532out:
6533 if (retval && kev) {
6534 knote_fill_kevent(kn, kev, data);
6535 }
6536 return retval;
55e303ae
A
6537}
6538
39037602 6539static int
cb323159 6540filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
39037602
A
6541{
6542 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6543
6544 /* socket locked */
6545
6546 /*
6547 * If the caller explicitly asked for OOB results (e.g. poll())
6548 * from EVFILT_READ, then save that off in the hookid field
6549 * and reserve the kn_flags EV_OOBAND bit for output only.
6550 */
6551 if (kn->kn_filter == EVFILT_READ &&
6552 kn->kn_flags & EV_OOBAND) {
6553 kn->kn_flags &= ~EV_OOBAND;
cb323159 6554 kn->kn_hook32 = EV_OOBAND;
39037602 6555 } else {
cb323159 6556 kn->kn_hook32 = 0;
39037602 6557 }
0a7de745 6558 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
39037602 6559 so->so_rcv.sb_flags |= SB_KNOTE;
0a7de745 6560 }
39037602
A
6561
6562 /* indicate if event is already fired */
cb323159 6563 return filt_soread_common(kn, NULL, so);
39037602
A
6564}
6565
55e303ae 6566static void
39037602 6567filt_sordetach(struct knote *kn)
55e303ae 6568{
91447636 6569 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
39037602 6570
91447636 6571 socket_lock(so, 1);
0a7de745
A
6572 if (so->so_rcv.sb_flags & SB_KNOTE) {
6573 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
39037602 6574 so->so_rcv.sb_flags &= ~SB_KNOTE;
0a7de745
A
6575 }
6576 }
39037602
A
6577 socket_unlock(so, 1);
6578}
6579
6580/*ARGSUSED*/
6581static int
6582filt_soread(struct knote *kn, long hint)
6583{
6584 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6585 int retval;
6586
0a7de745 6587 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
39037602 6588 socket_lock(so, 1);
0a7de745 6589 }
39037602 6590
cb323159 6591 retval = filt_soread_common(kn, NULL, so);
39037602 6592
0a7de745 6593 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
39037602 6594 socket_unlock(so, 1);
0a7de745 6595 }
39037602
A
6596
6597 return retval;
6598}
6599
6600static int
cb323159 6601filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
39037602
A
6602{
6603 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6604 int retval;
6605
6606 socket_lock(so, 1);
6607
6608 /* save off the new input fflags and data */
6609 kn->kn_sfflags = kev->fflags;
6610 kn->kn_sdata = kev->data;
39037602
A
6611
6612 /* determine if changes result in fired events */
cb323159 6613 retval = filt_soread_common(kn, NULL, so);
55e303ae 6614
91447636 6615 socket_unlock(so, 1);
39037602
A
6616
6617 return retval;
6618}
6619
6620static int
cb323159 6621filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
39037602 6622{
39037602
A
6623 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6624 int retval;
6625
6626 socket_lock(so, 1);
cb323159 6627 retval = filt_soread_common(kn, kev, so);
39037602
A
6628 socket_unlock(so, 1);
6629
6630 return retval;
55e303ae
A
6631}
6632
316670eb
A
6633int
6634so_wait_for_if_feedback(struct socket *so)
6635{
39236c6e 6636 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
316670eb
A
6637 (so->so_state & SS_ISCONNECTED)) {
6638 struct inpcb *inp = sotoinpcb(so);
0a7de745
A
6639 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6640 return 1;
6641 }
316670eb 6642 }
0a7de745 6643 return 0;
316670eb
A
6644}
6645
55e303ae 6646static int
cb323159 6647filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
55e303ae 6648{
316670eb 6649 int ret = 0;
cb323159 6650 int64_t data = sbspace(&so->so_snd);
91447636 6651
55e303ae 6652 if (so->so_state & SS_CANTSENDMORE) {
2d21ac55 6653 kn->kn_flags |= EV_EOF;
55e303ae 6654 kn->kn_fflags = so->so_error;
cb323159
A
6655 ret = 1;
6656 goto out;
55e303ae 6657 }
cb323159 6658
0a7de745 6659 if (so->so_error) { /* temporary udp error */
cb323159
A
6660 ret = 1;
6661 goto out;
91447636 6662 }
cb323159 6663
3e170ce0 6664 if (!socanwrite(so)) {
cb323159
A
6665 ret = 0;
6666 goto out;
91447636 6667 }
cb323159 6668
3e170ce0 6669 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
cb323159
A
6670 ret = 1;
6671 goto out;
3e170ce0 6672 }
cb323159 6673
0a7de745 6674 int64_t lowwat = so->so_snd.sb_lowat;
cb323159 6675
39236c6e 6676 if (kn->kn_sfflags & NOTE_LOWAT) {
0a7de745 6677 if (kn->kn_sdata > so->so_snd.sb_hiwat) {
6d2010ae 6678 lowwat = so->so_snd.sb_hiwat;
0a7de745 6679 } else if (kn->kn_sdata > lowwat) {
6d2010ae 6680 lowwat = kn->kn_sdata;
0a7de745 6681 }
6d2010ae 6682 }
cb323159
A
6683
6684 if (data >= lowwat) {
39037602
A
6685 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6686#if (DEBUG || DEVELOPMENT)
6687 && so_notsent_lowat_check == 1
6688#endif /* DEBUG || DEVELOPMENT */
6689 ) {
6690 if ((SOCK_DOM(so) == PF_INET ||
6691 SOCK_DOM(so) == PF_INET6) &&
6692 so->so_type == SOCK_STREAM) {
fe8ab488
A
6693 ret = tcp_notsent_lowat_check(so);
6694 }
6695#if MPTCP
6696 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6697 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6698 ret = mptcp_notsent_lowat_check(so);
6699 }
6700#endif
6701 else {
cb323159
A
6702 ret = 1;
6703 goto out;
fe8ab488 6704 }
316670eb
A
6705 } else {
6706 ret = 1;
6707 }
6708 }
0a7de745 6709 if (so_wait_for_if_feedback(so)) {
316670eb 6710 ret = 0;
0a7de745 6711 }
cb323159
A
6712
6713out:
6714 if (ret && kev) {
6715 knote_fill_kevent(kn, kev, data);
6716 }
0a7de745 6717 return ret;
316670eb
A
6718}
6719
39037602 6720static int
cb323159 6721filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
39037602
A
6722{
6723 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6724
6725 /* socket locked */
0a7de745 6726 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
39037602 6727 so->so_snd.sb_flags |= SB_KNOTE;
0a7de745 6728 }
39037602
A
6729
6730 /* determine if its already fired */
cb323159 6731 return filt_sowrite_common(kn, NULL, so);
39037602
A
6732}
6733
316670eb 6734static void
39037602 6735filt_sowdetach(struct knote *kn)
316670eb
A
6736{
6737 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6738 socket_lock(so, 1);
39236c6e 6739
0a7de745
A
6740 if (so->so_snd.sb_flags & SB_KNOTE) {
6741 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
39037602 6742 so->so_snd.sb_flags &= ~SB_KNOTE;
0a7de745
A
6743 }
6744 }
316670eb
A
6745 socket_unlock(so, 1);
6746}
6747
39037602 6748/*ARGSUSED*/
316670eb 6749static int
39037602 6750filt_sowrite(struct knote *kn, long hint)
316670eb 6751{
316670eb 6752 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
39037602 6753 int ret;
316670eb 6754
0a7de745 6755 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
316670eb 6756 socket_lock(so, 1);
0a7de745 6757 }
39037602 6758
cb323159 6759 ret = filt_sowrite_common(kn, NULL, so);
39037602 6760
0a7de745 6761 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
39037602 6762 socket_unlock(so, 1);
0a7de745 6763 }
39037602
A
6764
6765 return ret;
6766}
6767
6768static int
cb323159 6769filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
39037602
A
6770{
6771 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6772 int ret;
6773
6774 socket_lock(so, 1);
6775
6776 /*save off the new input fflags and data */
6777 kn->kn_sfflags = kev->fflags;
6778 kn->kn_sdata = kev->data;
39037602
A
6779
6780 /* determine if these changes result in a triggered event */
cb323159 6781 ret = filt_sowrite_common(kn, NULL, so);
39037602
A
6782
6783 socket_unlock(so, 1);
6784
6785 return ret;
6786}
6787
6788static int
cb323159 6789filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
39037602 6790{
39037602
A
6791 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6792 int ret;
6793
6794 socket_lock(so, 1);
cb323159 6795 ret = filt_sowrite_common(kn, kev, so);
39037602 6796 socket_unlock(so, 1);
cb323159 6797
39037602
A
6798 return ret;
6799}
6800
6801static int
cb323159
A
6802filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6803 struct socket *so, long ev_hint)
39037602
A
6804{
6805 int ret = 0;
cb323159 6806 int64_t data = 0;
39037602 6807 uint32_t level_trigger = 0;
316670eb 6808
39236c6e 6809 if (ev_hint & SO_FILT_HINT_CONNRESET) {
3e170ce0 6810 kn->kn_fflags |= NOTE_CONNRESET;
39236c6e
A
6811 }
6812 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
3e170ce0 6813 kn->kn_fflags |= NOTE_TIMEOUT;
39236c6e
A
6814 }
6815 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
3e170ce0 6816 kn->kn_fflags |= NOTE_NOSRCADDR;
39236c6e
A
6817 }
6818 if (ev_hint & SO_FILT_HINT_IFDENIED) {
3e170ce0 6819 kn->kn_fflags |= NOTE_IFDENIED;
39236c6e
A
6820 }
6821 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
3e170ce0 6822 kn->kn_fflags |= NOTE_KEEPALIVE;
316670eb 6823 }
39236c6e 6824 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
3e170ce0 6825 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
39236c6e
A
6826 }
6827 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
3e170ce0 6828 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
39236c6e 6829 }
3e170ce0
A
6830 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6831 (so->so_state & SS_ISCONNECTED)) {
6832 kn->kn_fflags |= NOTE_CONNECTED;
6833 level_trigger |= NOTE_CONNECTED;
39236c6e 6834 }
3e170ce0
A
6835 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6836 (so->so_state & SS_ISDISCONNECTED)) {
6837 kn->kn_fflags |= NOTE_DISCONNECTED;
6838 level_trigger |= NOTE_DISCONNECTED;
39236c6e
A
6839 }
6840 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6841 if (so->so_proto != NULL &&
0a7de745 6842 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
39236c6e 6843 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
0a7de745 6844 }
39236c6e 6845 }
316670eb 6846
39037602
A
6847 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6848 tcp_notify_ack_active(so)) {
6849 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6850 }
6851
3e170ce0 6852 if ((so->so_state & SS_CANTRCVMORE)
fe8ab488 6853#if CONTENT_FILTER
3e170ce0 6854 && cfil_sock_data_pending(&so->so_rcv) == 0
fe8ab488 6855#endif /* CONTENT_FILTER */
3e170ce0 6856 ) {
316670eb 6857 kn->kn_fflags |= NOTE_READCLOSED;
3e170ce0
A
6858 level_trigger |= NOTE_READCLOSED;
6859 }
316670eb 6860
3e170ce0 6861 if (so->so_state & SS_CANTSENDMORE) {
316670eb 6862 kn->kn_fflags |= NOTE_WRITECLOSED;
3e170ce0
A
6863 level_trigger |= NOTE_WRITECLOSED;
6864 }
316670eb 6865
3e170ce0
A
6866 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6867 (so->so_flags & SOF_SUSPENDED)) {
39236c6e 6868 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
3e170ce0
A
6869
6870 /* If resume event was delivered before, reset it */
cb323159 6871 kn->kn_hook32 &= ~NOTE_RESUME;
3e170ce0 6872
316670eb 6873 kn->kn_fflags |= NOTE_SUSPEND;
3e170ce0 6874 level_trigger |= NOTE_SUSPEND;
316670eb
A
6875 }
6876
3e170ce0
A
6877 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6878 (so->so_flags & SOF_SUSPENDED) == 0) {
39236c6e 6879 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
3e170ce0
A
6880
6881 /* If suspend event was delivered before, reset it */
cb323159 6882 kn->kn_hook32 &= ~NOTE_SUSPEND;
3e170ce0 6883
316670eb 6884 kn->kn_fflags |= NOTE_RESUME;
3e170ce0 6885 level_trigger |= NOTE_RESUME;
316670eb
A
6886 }
6887
6888 if (so->so_error != 0) {
6889 ret = 1;
cb323159 6890 data = so->so_error;
316670eb
A
6891 kn->kn_flags |= EV_EOF;
6892 } else {
cb323159
A
6893 u_int32_t data32;
6894 get_sockev_state(so, &data32);
6895 data = data32;
316670eb
A
6896 }
6897
3e170ce0
A
6898 /* Reset any events that are not requested on this knote */
6899 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6900 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6901
6902 /* Find the level triggerred events that are already delivered */
cb323159 6903 level_trigger &= kn->kn_hook32;
3e170ce0
A
6904 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6905
6906 /* Do not deliver level triggerred events more than once */
0a7de745 6907 if ((kn->kn_fflags & ~level_trigger) != 0) {
316670eb 6908 ret = 1;
0a7de745 6909 }
316670eb 6910
cb323159
A
6911 if (ret && kev) {
6912 /*
6913 * Store the state of the events being delivered. This
6914 * state can be used to deliver level triggered events
6915 * ateast once and still avoid waking up the application
6916 * multiple times as long as the event is active.
6917 */
6918 if (kn->kn_fflags != 0) {
6919 kn->kn_hook32 |= (kn->kn_fflags &
6920 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6921 }
6922
6923 /*
6924 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6925 * only one of them and remember the last one that was
6926 * delivered last
6927 */
6928 if (kn->kn_fflags & NOTE_SUSPEND) {
6929 kn->kn_hook32 &= ~NOTE_RESUME;
6930 }
6931 if (kn->kn_fflags & NOTE_RESUME) {
6932 kn->kn_hook32 &= ~NOTE_SUSPEND;
6933 }
6934
6935 knote_fill_kevent(kn, kev, data);
6936 }
0a7de745 6937 return ret;
316670eb
A
6938}
6939
39037602 6940static int
cb323159 6941filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
39037602
A
6942{
6943 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6944
6945 /* socket locked */
cb323159 6946 kn->kn_hook32 = 0;
0a7de745 6947 if (KNOTE_ATTACH(&so->so_klist, kn)) {
39037602 6948 so->so_flags |= SOF_KNOTE;
0a7de745 6949 }
39037602
A
6950
6951 /* determine if event already fired */
cb323159 6952 return filt_sockev_common(kn, NULL, so, 0);
39037602
A
6953}
6954
3e170ce0 6955static void
39037602 6956filt_sockdetach(struct knote *kn)
3e170ce0 6957{
39037602
A
6958 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6959 socket_lock(so, 1);
3e170ce0 6960
0a7de745
A
6961 if ((so->so_flags & SOF_KNOTE) != 0) {
6962 if (KNOTE_DETACH(&so->so_klist, kn)) {
39037602 6963 so->so_flags &= ~SOF_KNOTE;
0a7de745
A
6964 }
6965 }
39037602
A
6966 socket_unlock(so, 1);
6967}
6968
6969static int
6970filt_sockev(struct knote *kn, long hint)
6971{
6972 int ret = 0, locked = 0;
6973 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6974 long ev_hint = (hint & SO_FILT_HINT_EV);
6975
6976 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6977 socket_lock(so, 1);
6978 locked = 1;
3e170ce0 6979 }
39037602 6980
cb323159 6981 ret = filt_sockev_common(kn, NULL, so, ev_hint);
39037602 6982
0a7de745 6983 if (locked) {
39037602 6984 socket_unlock(so, 1);
0a7de745 6985 }
39037602
A
6986
6987 return ret;
6988}
6989
6990
6991
6992/*
6993 * filt_socktouch - update event state
6994 */
6995static int
6996filt_socktouch(
6997 struct knote *kn,
cb323159 6998 struct kevent_qos_s *kev)
39037602
A
6999{
7000 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7001 uint32_t changed_flags;
7002 int ret;
7003
7004 socket_lock(so, 1);
7005
7006 /* save off the [result] data and fflags */
cb323159 7007 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
39037602
A
7008
7009 /* save off the new input fflags and data */
7010 kn->kn_sfflags = kev->fflags;
7011 kn->kn_sdata = kev->data;
39037602
A
7012
7013 /* restrict the current results to the (smaller?) set of new interest */
7014 /*
7015 * For compatibility with previous implementations, we leave kn_fflags
7016 * as they were before.
7017 */
7018 //kn->kn_fflags &= kev->fflags;
7019
7020 /*
7021 * Since we keep track of events that are already
7022 * delivered, if any of those events are not requested
7023 * anymore the state related to them can be reset
7024 */
cb323159 7025 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
39037602
A
7026
7027 /* determine if we have events to deliver */
cb323159 7028 ret = filt_sockev_common(kn, NULL, so, 0);
39037602
A
7029
7030 socket_unlock(so, 1);
7031
7032 return ret;
7033}
7034
7035/*
7036 * filt_sockprocess - query event fired state and return data
7037 */
7038static int
cb323159 7039filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
39037602 7040{
39037602
A
7041 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
7042 int ret = 0;
7043
7044 socket_lock(so, 1);
7045
cb323159 7046 ret = filt_sockev_common(kn, kev, so, 0);
39037602
A
7047
7048 socket_unlock(so, 1);
7049
7050 return ret;
3e170ce0
A
7051}
7052
316670eb 7053void
39236c6e
A
7054get_sockev_state(struct socket *so, u_int32_t *statep)
7055{
316670eb
A
7056 u_int32_t state = *(statep);
7057
39037602
A
7058 /*
7059 * If the state variable is already used by a previous event,
7060 * reset it.
7061 */
0a7de745 7062 if (state != 0) {
39037602 7063 return;
0a7de745 7064 }
39037602 7065
0a7de745 7066 if (so->so_state & SS_ISCONNECTED) {
316670eb 7067 state |= SOCKEV_CONNECTED;
0a7de745 7068 } else {
316670eb 7069 state &= ~(SOCKEV_CONNECTED);
0a7de745 7070 }
39236c6e 7071 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
316670eb 7072 *(statep) = state;
55e303ae
A
7073}
7074
0a7de745 7075#define SO_LOCK_HISTORY_STR_LEN \
39236c6e 7076 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
b0d623f7 7077
39236c6e
A
7078__private_extern__ const char *
7079solockhistory_nr(struct socket *so)
55e303ae 7080{
39236c6e
A
7081 size_t n = 0;
7082 int i;
7083 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7084
0a7de745 7085 bzero(lock_history_str, sizeof(lock_history_str));
39236c6e
A
7086 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7087 n += snprintf(lock_history_str + n,
7088 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7089 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7090 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
b0d623f7 7091 }
0a7de745 7092 return lock_history_str;
55e303ae
A
7093}

lck_mtx_t *
socket_getlock(struct socket *so, int flags)
{
	if (so->so_proto->pr_getlock != NULL) {
		return (*so->so_proto->pr_getlock)(so, flags);
	} else {
		return so->so_proto->pr_domain->dom_mtx;
	}
}

void
socket_lock(struct socket *so, int refcount)
{
	void *lr_saved;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		(*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount) {
			so->so_usecount++;
		}
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}
}

void
socket_lock_assert_owned(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}

	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
}

int
socket_try_lock(struct socket *so)
{
	lck_mtx_t *mtx;

	if (so->so_proto->pr_getlock != NULL) {
		mtx = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mtx = so->so_proto->pr_domain->dom_mtx;
	}

	return lck_mtx_try_lock(mtx);
}
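
/*
 * Sketch (illustrative, not part of the original source):
 * socket_try_lock() takes the per-socket mutex without bumping
 * so_usecount, so a successful try-lock pairs with
 * socket_unlock(so, 0).  example_try_poke() is hypothetical.
 */
static __unused int
example_try_poke(struct socket *so)
{
	if (!socket_try_lock(so)) {
		return EWOULDBLOCK;	/* contended; caller may retry */
	}
	/* ... short, non-blocking work on the socket ... */
	socket_unlock(so, 0);		/* unlock only; no reference was taken */
	return 0;
}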

void
socket_unlock(struct socket *so, int refcount)
{
	void *lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = __builtin_return_address(0);

	if (so == NULL || so->so_proto == NULL) {
		panic("%s: null so_proto so=%p\n", __func__, so);
		/* NOTREACHED */
	}

	if (so->so_proto->pr_unlock) {
		(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0) {
				sofreelastref(so, 1);
			}
		}
		lck_mtx_unlock(mutex_held);
	}
}
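
/*
 * Usage sketch (illustrative, not part of the original source): the
 * common pattern takes the lock and a use-count reference together
 * and releases both in one call, so the socket cannot be freed out
 * from under the caller; dropping the last reference in
 * socket_unlock() frees the socket via sofreelastref().
 * example_poke_socket() is hypothetical.
 */
static __unused void
example_poke_socket(struct socket *so)
{
	socket_lock(so, 1);	/* lock and take a reference */
	/* ... inspect or update socket state here ... */
	socket_unlock(so, 1);	/* unlock and drop the reference */
}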

/* Called with socket locked, will unlock socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL) {
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	}
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}

void
soreference(struct socket *so)
{
	socket_lock(so, 1);	/* lock socket and take one reference */
	socket_unlock(so, 0);	/* unlock only */
}

void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}

/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters.  Caller must hold the socket
 * lock.
 */
void
somultipages(struct socket *so, boolean_t set)
{
	if (set) {
		so->so_flags |= SOF_MULTIPAGES;
	} else {
		so->so_flags &= ~SOF_MULTIPAGES;
	}
}

void
soif2kcl(struct socket *so, boolean_t set)
{
	if (set) {
		so->so_flags1 |= SOF1_IF_2KCL;
	} else {
		so->so_flags1 &= ~SOF1_IF_2KCL;
	}
}

int
so_isdstlocal(struct socket *so)
{
	struct inpcb *inp = (struct inpcb *)so->so_pcb;

	if (SOCK_DOM(so) == PF_INET) {
		return inaddr_local(inp->inp_faddr);
	} else if (SOCK_DOM(so) == PF_INET6) {
		return in6addr_local(&inp->in6p_faddr);
	}

	return 0;
}

int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			err = EOPNOTSUPP;
			if (p != PROC_NULL) {
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llx [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()), proc_pid(p),
				    proc_best_name(p), level,
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    SOCK_DOM(so), SOCK_TYPE(so), err);
			}
			return err;
		}
		so->so_flags &= ~SOF_NODEFUNCT;
		if (p != PROC_NULL) {
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
		}
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce && p != PROC_NULL) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return err;
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
	}
	return err;
}
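
/*
 * Two-phase usage sketch (illustrative; so_stop_extended_bk_idle()
 * below is an in-tree caller of this shape): sosetdefunct() marks the
 * socket and cuts off its buffers, and sodefunct() then tears the
 * connection down once SOF_DEFUNCT is confirmed set.
 * example_defunct_now() is hypothetical and assumes the socket is
 * locked by the caller.
 */
static __unused void
example_defunct_now(struct socket *so)
{
	(void) sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		(void) sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}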

int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT) {
		goto done;
	}

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		if (p != PROC_NULL) {
			SODEFUNCTLOG(
			    "%s[%d, %s]: (target pid %d name %s level %d) "
			    "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
			    "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
			    " snd_fl 0x%x]\n", __func__,
			    proc_selfpid(), proc_best_name(current_proc()),
			    proc_pid(p), proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
			    inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
			    (void *)&inp->inp_laddr.s_addr :
			    (void *)&inp->in6p_laddr),
			    s, sizeof(s)), ntohs(inp->in6p_lport),
			    inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
			    (void *)&inp->inp_faddr.s_addr :
			    (void *)&inp->in6p_faddr,
			    d, sizeof(d)), ntohs(inp->in6p_fport),
			    (uint32_t)rcv->sb_sel.si_flags,
			    (uint32_t)snd->sb_sel.si_flags,
			    rcv->sb_flags, snd->sb_flags);
		}
	} else if (p != PROC_NULL) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK) {
		sbunlock(rcv, TRUE);	/* keep socket locked */
	}
	if (snd->sb_flags & SB_LOCK) {
		sbunlock(snd, TRUE);	/* keep socket locked */
	}

	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED)) {
		(void) soisdisconnected(so);
	}

	if (so->so_error == 0) {
		so->so_error = EBADF;
	}

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return 0;
}

int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0) {
		socket_lock(so, 1);
	}

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0) {
		socket_unlock(so, 1);
	}

	return 0;
}

/*
 * Does not attempt to account for sockets that are delegated from
 * the current process.
 */
int
so_set_extended_bk_idle(struct socket *so, int optval)
{
	int error = 0;

	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
	    SOCK_PROTO(so) != IPPROTO_TCP) {
		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
		error = EOPNOTSUPP;
	} else if (optval == 0) {
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;

		soresume(current_proc(), so, 1);
	} else {
		struct proc *p = current_proc();
		int i;
		struct filedesc *fdp;
		int count = 0;

		/*
		 * Unlock socket to avoid lock ordering issue with
		 * the proc fd table lock
		 */
		socket_unlock(so, 0);

		proc_fdlock(p);

		fdp = p->p_fd;
		for (i = 0; i < fdp->fd_nfiles; i++) {
			struct fileproc *fp = fdp->fd_ofiles[i];
			struct socket *so2;

			if (fp == NULL ||
			    (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
			    FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
				continue;
			}

			so2 = (struct socket *)fp->f_fglob->fg_data;
			if (so != so2 &&
			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
				count++;
			}
			if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
				break;
			}
		}
		proc_fdunlock(p);

		socket_lock(so, 0);

		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
			error = EBUSY;
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
			error = EBUSY;
		} else {
			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
		}
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
		    "%s marked for extended bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    "is" : "not");
	}

	return error;
}
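
/*
 * Sketch (illustrative, not part of the original source): the option
 * value is a boolean; opting out with 0 also resumes a socket that is
 * currently in its extended-background-idle grace period.
 * example_toggle_extbkidle() is hypothetical and assumes the socket
 * is already locked, as in the sockopt path.
 */
static __unused int
example_toggle_extbkidle(struct socket *so)
{
	int error;

	error = so_set_extended_bk_idle(so, 1);		/* opt in */
	if (error == 0) {
		error = so_set_extended_bk_idle(so, 0);	/* opt out + resume */
	}
	return error;
}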

static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}

void
so_drain_extended_bk_idle(struct socket *so)
{
	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		/*
		 * Only penalize sockets that have outstanding data
		 */
		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
		}
	}
}

/*
 * The return value tells whether the socket is still in extended
 * background idle.
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return ret;
}

void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct filedesc *fdp;
		int i;

		proc_fdlock(p);
		fdp = p->p_fd;
		for (i = 0; i < fdp->fd_nfiles; i++) {
			struct fileproc *fp;
			struct socket *so;

			fp = fdp->fd_ofiles[i];
			if (fp == NULL ||
			    (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
			    FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET) {
				continue;
			}

			so = (struct socket *)fp->f_fglob->fg_data;
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}

__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
{
	int ret = 0;

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		if (optval) {
			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
		} else {
			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
		}
	}

	return ret;
}

__private_extern__ int
so_get_recv_anyif(struct socket *so)
{
	int ret = 0;

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
	}

	return ret;
}

int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;
	int noconstrained_old, noconstrained_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0 &&
	    (noconstrained_new - noconstrained_old) == 0) {
		return 0;
	}
#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
		if (noconstrained_new - noconstrained_old != 0) {
			inp_set_noconstrained(sotoinpcb(so));
		}
	}

	if (SOCK_DOM(so) == PF_MULTIPATH) {
		mptcp_set_restrictions(so);
	}

	return 0;
}
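
/*
 * Sketch of the trapdoor property described above (illustrative, not
 * part of the original source): a later call cannot clear a
 * previously set restriction, because so_set_restrictions() only ORs
 * the deny bits in.  example_restrict_cellular() is hypothetical and
 * assumes the socket is locked.
 */
static __unused void
example_restrict_cellular(struct socket *so)
{
	(void) so_set_restrictions(so, SO_RESTRICT_DENY_CELLULAR);
	(void) so_set_restrictions(so, 0);	/* no-op: the bit stays set */
	VERIFY(so_get_restrictions(so) & SO_RESTRICT_DENY_CELLULAR);
}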

uint32_t
so_get_restrictions(struct socket *so)
{
	return so->so_restrictions & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT |
	    SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
}

int
so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
	}
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, ep);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL) {
		proc_rele(ep);
	}

	return error;
}
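
/*
 * Sketch (illustrative, not part of the original source): delegating
 * a socket to the caller's own pid clears the delegate association,
 * per the comment above.  example_clear_delegation() is hypothetical;
 * check_cred is FALSE to sidestep the privilege check, as an
 * in-kernel caller would.
 */
static __unused void
example_clear_delegation(struct socket *so, struct proc *p)
{
	if (so_set_effective_pid(so, proc_pid(p), p, FALSE) == 0) {
		VERIFY(!(so->so_flags & SOF_DELEGATED));
	}
}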

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
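
/*
 * Sketch of the "longer event" convention described above
 * (illustrative, not part of the original source): an extended event
 * embeds the common header as its first member so its address can be
 * posted directly.  struct example_netpolicy_event and ev_extra are
 * hypothetical, and KEV_NETPOLICY_IFDENIED is used only as a sample
 * event code.
 */
struct example_netpolicy_event {
	struct netpolicy_event_data	ev_data;	/* common header first */
	uint32_t			ev_extra;	/* event-specific payload */
};

static __unused void
example_post_netpolicy_event(struct example_netpolicy_event *ev)
{
	netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev->ev_data,
	    sizeof(*ev));
}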

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev;
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	bzero(&ev, sizeof(ev));
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	if (socksa != NULL) {
		FREE(socksa, M_SONAME);
	}
	if (peersa != NULL) {
		FREE(peersa, M_SONAME);
	}
}