]> git.saurik.com Git - apple/xnu.git/blame - bsd/kern/uipc_socket.c
xnu-4570.41.2.tar.gz
[apple/xnu.git] / bsd / kern / uipc_socket.c
CommitLineData
1c79356b 1/*
5ba3f43e 2 * Copyright (c) 1998-2017 Apple Inc. All rights reserved.
5d5c5d0d 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
39236c6e 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
39236c6e 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
39236c6e 17 *
2d21ac55
A
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
39236c6e 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b 27 */
1c79356b
A
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
9bccf70c 61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
1c79356b 62 */
2d21ac55
A
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
1c79356b
A
69
70#include <sys/param.h>
71#include <sys/systm.h>
55e303ae 72#include <sys/filedesc.h>
2d21ac55 73#include <sys/proc.h>
91447636
A
74#include <sys/proc_internal.h>
75#include <sys/kauth.h>
76#include <sys/file_internal.h>
1c79356b
A
77#include <sys/fcntl.h>
78#include <sys/malloc.h>
79#include <sys/mbuf.h>
80#include <sys/domain.h>
81#include <sys/kernel.h>
55e303ae 82#include <sys/event.h>
1c79356b
A
83#include <sys/poll.h>
84#include <sys/protosw.h>
85#include <sys/socket.h>
86#include <sys/socketvar.h>
87#include <sys/resourcevar.h>
88#include <sys/signalvar.h>
89#include <sys/sysctl.h>
39236c6e 90#include <sys/syslog.h>
1c79356b 91#include <sys/uio.h>
fe8ab488 92#include <sys/uio_internal.h>
1c79356b
A
93#include <sys/ev.h>
94#include <sys/kdebug.h>
2d21ac55 95#include <sys/un.h>
d1ecb069 96#include <sys/user.h>
316670eb 97#include <sys/priv.h>
39236c6e 98#include <sys/kern_event.h>
1c79356b 99#include <net/route.h>
39236c6e 100#include <net/init.h>
5ba3f43e 101#include <net/net_api_stats.h>
316670eb 102#include <net/ntstat.h>
fe8ab488 103#include <net/content_filter.h>
1c79356b
A
104#include <netinet/in.h>
105#include <netinet/in_pcb.h>
39037602
A
106#include <netinet/in_tclass.h>
107#include <netinet/tcp_var.h>
6d2010ae
A
108#include <netinet/ip6.h>
109#include <netinet6/ip6_var.h>
39236c6e 110#include <netinet/flow_divert.h>
1c79356b 111#include <kern/zalloc.h>
91447636 112#include <kern/locks.h>
1c79356b 113#include <machine/limits.h>
2d21ac55
A
114#include <libkern/OSAtomic.h>
115#include <pexpert/pexpert.h>
b0d623f7 116#include <kern/assert.h>
6d2010ae 117#include <kern/task.h>
39037602
A
118#include <kern/policy_internal.h>
119
316670eb 120#include <sys/kpi_mbuf.h>
6d2010ae 121#include <sys/mcache.h>
fe8ab488 122#include <sys/unpcb.h>
5ba3f43e 123#include <libkern/section_keywords.h>
2d21ac55
A
124
125#if CONFIG_MACF
2d21ac55
A
126#include <security/mac_framework.h>
127#endif /* MAC */
128
39236c6e
A
129#if MULTIPATH
130#include <netinet/mp_pcb.h>
fe8ab488 131#include <netinet/mptcp_var.h>
39236c6e
A
132#endif /* MULTIPATH */
133
3e170ce0
A
#define	ROUNDUP(a, b)	(((a) + ((b) - 1)) & (~((b) - 1)))

#if DEBUG || DEVELOPMENT
#define	DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define	DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);
extern char *proc_best_name(proc_t);

/*
 * Socket-structure cache bookkeeping; all of these are protected by
 * so_cache_mtx (allocated in socketinit below).
 */
static u_int32_t	so_cache_hw;		/* High water mark for socache */
static u_int32_t	so_cache_timeouts;	/* number of timeouts */
static u_int32_t	so_cache_max_freed;	/* max freed per timeout */
static u_int32_t	cached_sock_count = 0;
STAILQ_HEAD(, socket)	so_cache_head;
int	max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t	so_cache_time;
static int		socketinit_done;
static struct zone	*so_cache_zone;

static lck_grp_t	*so_cache_mtx_grp;
static lck_attr_t	*so_cache_mtx_attr;
static lck_grp_attr_t	*so_cache_mtx_grp_attr;
static lck_mtx_t	*so_cache_mtx;

#include <machine/limits.h>

/* kqueue filter entry points for socket read/write/sock events */
static int	filt_sorattach(struct knote *kn, struct kevent_internal_s *kev);
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static int	filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);

static int	filt_sowattach(struct knote *kn, struct kevent_internal_s *kev);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static int	filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);

static int	filt_sockattach(struct knote *kn, struct kevent_internal_s *kev);
static void	filt_sockdetach(struct knote *kn);
static int	filt_sockev(struct knote *kn, long hint);
static int	filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
static int	filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

/* EVFILT_EXCEPT shares the read-filter entry points */
SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SYSCTL_DECL(_kern_ipc);

#define	EVEN_MORE_LOCKING_DEBUG	0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
    &sodefunct_calls, "");

static int socket_zone = M_SOCKET;
so_gen_t so_gencnt;		/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

/* kdebug trace codes for the socket layer */
#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define	DBG_FNC_SOSEND_LIST	NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define	DBG_FNC_SORECEIVE_LIST	NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets with
 * clusters larger that 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED,
    &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
#endif /* DEBUG || DEVELOPMENT */

int so_accept_list_waits = 0;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW|CTLFLAG_LOCKED,
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

vm_size_t	so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);

/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define	SO_IDLE_BK_IDLE_MAX_PER_PROC	1
#define	SO_IDLE_BK_IDLE_TIME		600
#define	SO_IDLE_BK_IDLE_RCV_HIWAT	131072

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
    CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
    "Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_time, 0,
    "Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
    &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
    "High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
    &soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);


/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");
91447636 381
2d21ac55
A
/*
 * One-time initialization of the socket layer: sets up the cached-socket
 * zone and its mutex, seeds the extended-background-idle defaults, and
 * initializes dependent subsystems (inpcb, socket filters, traffic class,
 * and MPTCP when built in).  Subsequent calls are no-ops.
 */
void
socketinit(void)
{
	/* Layout invariants relied on elsewhere in the socket layer */
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

	/* struct sa_endpoints must mirror the user-visible layout exactly */
#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof (socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
	if (so_cache_mtx == NULL) {
		panic("%s: unable to allocate so_cache_mtx\n", __func__);
		/* NOTREACHED */
	}
	STAILQ_INIT(&so_cache_head);

	/*
	 * Each cache element holds a socket plus the inpcb and tcpcb
	 * save areas, with 4 bytes of slack before each for alignment
	 * (see cached_sock_alloc).
	 */
	so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zinit(so_cache_zone_element_size,
	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
	sflt_init();
	socket_tclass_init();
#if MULTIPATH
	mp_pcbinit();
#endif /* MULTIPATH */
}
453
2d21ac55
A
/*
 * Allocate a socket, preferring the per-layer cache of previously-freed
 * sockets.  On a cache hit the socket is zeroed but its saved pcb pointer
 * (which addresses storage within the same zone element) is preserved.
 * On a cache miss a fresh zone element is carved up so that the inpcb and
 * tcpcb save areas live in the same allocation as the socket itself.
 * On failure (*so) is left NULL (only possible when !waitok).
 */
static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t	temp;
	uintptr_t offset;

	lck_mtx_lock(so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(so_cache_mtx);

		/* Zero the socket but keep the saved-pcb pointer intact */
		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof (struct socket));

		(*so)->so_saved_pcb = temp;
	} else {

		lck_mtx_unlock(so_cache_mtx);

		if (waitok)
			*so = (struct socket *)zalloc(so_cache_zone);
		else
			*so = (struct socket *)zalloc_noblock(so_cache_zone);

		if (*so == NULL)
			return;

		bzero((caddr_t)*so, sizeof (struct socket));

		/*
		 * Define offsets for extra structures into our
		 * single block of memory. Align extra structures
		 * on longword boundaries.
		 */

		offset = (uintptr_t)*so;
		offset += sizeof (struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		/* tcpcb save area immediately follows the inpcb save area */
		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	/* Mark so that cached_sock_free gets used on the release path */
	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}
512
2d21ac55
A
513static void
514cached_sock_free(struct socket *so)
1c79356b 515{
1c79356b 516
91447636 517 lck_mtx_lock(so_cache_mtx);
1c79356b 518
39236c6e 519 so_cache_time = net_uptime();
b0d623f7 520 if (++cached_sock_count > max_cached_sock_count) {
1c79356b 521 --cached_sock_count;
91447636 522 lck_mtx_unlock(so_cache_mtx);
91447636 523 zfree(so_cache_zone, so);
2d21ac55 524 } else {
1c79356b
A
525 if (so_cache_hw < cached_sock_count)
526 so_cache_hw = cached_sock_count;
527
39236c6e 528 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
1c79356b
A
529
530 so->cache_timestamp = so_cache_time;
91447636 531 lck_mtx_unlock(so_cache_mtx);
1c79356b 532 }
1c79356b
A
533}
534
39236c6e
A
535void
536so_update_last_owner_locked(struct socket *so, proc_t self)
6d2010ae 537{
39236c6e
A
538 if (so->last_pid != 0) {
539 /*
540 * last_pid and last_upid should remain zero for sockets
541 * created using sock_socket. The check above achieves that
542 */
543 if (self == PROC_NULL)
316670eb 544 self = current_proc();
39236c6e
A
545
546 if (so->last_upid != proc_uniqueid(self) ||
547 so->last_pid != proc_pid(self)) {
316670eb
A
548 so->last_upid = proc_uniqueid(self);
549 so->last_pid = proc_pid(self);
39236c6e
A
550 proc_getexecutableuuid(self, so->last_uuid,
551 sizeof (so->last_uuid));
316670eb 552 }
fe8ab488 553 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
6d2010ae
A
554 }
555}
556
39236c6e
A
557void
558so_update_policy(struct socket *so)
1c79356b 559{
39236c6e
A
560 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
561 (void) inp_update_policy(sotoinpcb(so));
562}
1c79356b 563
fe8ab488
A
#if NECP
/*
 * Re-evaluate the socket's NECP policy, optionally overriding the local
 * and/or remote address used for matching.  IPv4/IPv6 sockets only.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	int dom = SOCK_DOM(so);

	if (dom == PF_INET || dom == PF_INET6)
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
}
#endif /* NECP */
574
39236c6e
A
/*
 * Periodic reaper for the cached-socket list: frees entries older than
 * SO_CACHE_TIME_LIMIT, at most SO_CACHE_MAX_FREE_BATCH per invocation.
 * Returns TRUE when entries remain and the timer should be rescheduled.
 */
boolean_t
so_cache_timer(void)
{
	struct socket	*p;
	int		n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		/* List is in insertion order, so the head is the oldest */
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT)
			break;

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		/* Bound the work done while holding so_cache_mtx */
		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head))
		rc = TRUE;

	lck_mtx_unlock(so_cache_mtx);
	return (rc);
}
1c79356b
A
611
/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 *
 * Returns NULL on allocation (or MAC label) failure.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	/* TCP/IPv4 sockets go through the cached-socket fast path */
	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
		    M_WAITOK);
		if (so != NULL)
			bzero(so, sizeof (*so));
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
		so->so_zone = socket_zone;

		/*
		 * Increment the socket allocation statistics
		 */
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);

#if CONFIG_MACF_SOCKET
		/* Convert waitok to  M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return (NULL);
		}
#endif /* MAC_SOCKET */
	}

	return (so);
}
652
/*
 * Create and attach a new socket for domain `dom', type `type' and
 * protocol `proto' on behalf of process `p'.  `flags' may carry
 * SOCF_ASYNC to start the socket non-blocking; `ep' names a delegate
 * process (distinct from `p') whose identity is recorded on the socket.
 * On success *aso holds the new socket; on failure it stays NULL and
 * an errno value is returned (see the comment above socreate below).
 */
int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	/* Distinguish unknown-domain / wrong-type / unknown-proto errors */
	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL)
				return (EPROTOTYPE);
		}
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(1, dom, type);
	if (so == NULL)
		return (ENOBUFS);

	/* Per-domain socket creation statistics */
	switch (dom) {
	case PF_LOCAL:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
		break;
	case PF_INET:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
		}
		break;
	case PF_ROUTE:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
		break;
	case PF_NDRV:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
		break;
	case PF_KEY:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
		break;
	case PF_INET6:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
		if (type == SOCK_STREAM) {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
		}
		break;
	case PF_SYSTEM:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
		break;
	case PF_MULTIPATH:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
		break;
	default:
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
		break;
	}

	if (flags & SOCF_ASYNC)
		so->so_state |= SS_NBIO;

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	/* Record the creating process's identity on the socket */
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	/* Record the delegate's identity when creating on another's behalf */
	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL))
		so->so_state |= SS_PRIV;

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefuly
		 */
		so->so_state |= SS_NOFDREF;
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return (error);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2)
		so->so_options |= SO_DEBUG;
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (proc_get_effective_thread_policy(current_thread(),
	    TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain, system or multipath sockets as
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
	case PF_MULTIPATH:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return (0);
}
831
39236c6e
A
832/*
833 * Returns: 0 Success
834 * EAFNOSUPPORT
835 * EPROTOTYPE
836 * EPROTONOSUPPORT
837 * ENOBUFS
838 * <pru_attach>:ENOBUFS[AF_UNIX]
839 * <pru_attach>:ENOBUFS[TCP]
840 * <pru_attach>:ENOMEM[TCP]
841 * <pru_attach>:??? [other protocol families, IPSEC]
842 */
843int
844socreate(int dom, struct socket **aso, int type, int proto)
845{
846 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
847 PROC_NULL));
848}
849
850int
851socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
852{
853 int error = 0;
854 struct proc *ep = PROC_NULL;
855
856 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
857 error = ESRCH;
858 goto done;
859 }
860
861 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
862
863 /*
864 * It might not be wise to hold the proc reference when calling
865 * socreate_internal since it calls soalloc with M_WAITOK
866 */
867done:
868 if (ep != PROC_NULL)
869 proc_rele(ep);
870
871 return (error);
872}
873
2d21ac55
A
874/*
875 * Returns: 0 Success
876 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
877 * <pru_bind>:EAFNOSUPPORT Address family not supported
878 * <pru_bind>:EADDRNOTAVAIL Address not available.
879 * <pru_bind>:EINVAL Invalid argument
880 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
881 * <pru_bind>:EACCES Permission denied
882 * <pru_bind>:EADDRINUSE Address in use
883 * <pru_bind>:EAGAIN Resource unavailable, try again
884 * <pru_bind>:EPERM Operation not permitted
885 * <pru_bind>:???
886 * <sf_bind>:???
887 *
888 * Notes: It's not possible to fully enumerate the return codes above,
889 * since socket filter authors and protocol family authors may
890 * not choose to limit their error returns to those listed, even
891 * though this may result in some software operating incorrectly.
892 *
893 * The error codes which are enumerated above are those known to
894 * be returned by the tcp_usr_bind function supplied.
895 */
1c79356b 896int
39236c6e 897sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
1c79356b
A
898{
899 struct proc *p = current_proc();
91447636 900 int error = 0;
1c79356b 901
39236c6e
A
902 if (dolock)
903 socket_lock(so, 1);
39236c6e 904
6d2010ae 905 so_update_last_owner_locked(so, p);
39236c6e 906 so_update_policy(so);
3e170ce0 907
fe8ab488
A
908#if NECP
909 so_update_necp_policy(so, nam, NULL);
910#endif /* NECP */
3e170ce0 911
2d21ac55 912 /*
6d2010ae
A
913 * If this is a bind request on a socket that has been marked
914 * as inactive, reject it now before we go any further.
2d21ac55
A
915 */
916 if (so->so_flags & SOF_DEFUNCT) {
917 error = EINVAL;
39037602
A
918 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
919 __func__, proc_pid(p), proc_best_name(p),
920 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
921 SOCK_DOM(so), SOCK_TYPE(so), error);
2d21ac55
A
922 goto out;
923 }
924
91447636 925 /* Socket filter */
6d2010ae 926 error = sflt_bind(so, nam);
2d21ac55 927
91447636
A
928 if (error == 0)
929 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
2d21ac55 930out:
39236c6e
A
931 if (dolock)
932 socket_unlock(so, 1);
2d21ac55 933
91447636
A
934 if (error == EJUSTRETURN)
935 error = 0;
2d21ac55 936
1c79356b
A
937 return (error);
938}
939
940void
2d21ac55 941sodealloc(struct socket *so)
1c79356b 942{
316670eb
A
943 kauth_cred_unref(&so->so_cred);
944
6d2010ae
A
945 /* Remove any filters */
946 sflt_termsock(so);
947
fe8ab488
A
948#if CONTENT_FILTER
949 cfil_sock_detach(so);
950#endif /* CONTENT_FILTER */
951
39236c6e
A
952 /* Delete the state allocated for msg queues on a socket */
953 if (so->so_flags & SOF_ENABLE_MSGS) {
954 FREE(so->so_msg_state, M_TEMP);
955 so->so_msg_state = NULL;
956 }
957 VERIFY(so->so_msg_state == NULL);
958
fe8ab488 959 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
1c79356b 960
2d21ac55
A
961#if CONFIG_MACF_SOCKET
962 mac_socket_label_destroy(so);
963#endif /* MAC_SOCKET */
39236c6e 964
3e170ce0 965 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
2d21ac55
A
966 cached_sock_free(so);
967 } else {
2d21ac55 968 FREE_ZONE(so, sizeof (*so), so->so_zone);
91447636 969 }
1c79356b
A
970}
971
2d21ac55
A
972/*
973 * Returns: 0 Success
974 * EINVAL
975 * EOPNOTSUPP
976 * <pru_listen>:EINVAL[AF_UNIX]
977 * <pru_listen>:EINVAL[TCP]
978 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
979 * <pru_listen>:EINVAL[TCP] Invalid argument
980 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
981 * <pru_listen>:EACCES[TCP] Permission denied
982 * <pru_listen>:EADDRINUSE[TCP] Address in use
983 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
984 * <pru_listen>:EPERM[TCP] Operation not permitted
985 * <sf_listen>:???
986 *
987 * Notes: Other <pru_listen> returns depend on the protocol family; all
988 * <sf_listen> returns depend on what the filter author causes
989 * their filter to return.
990 */
1c79356b 991int
2d21ac55 992solisten(struct socket *so, int backlog)
1c79356b 993{
1c79356b 994 struct proc *p = current_proc();
2d21ac55 995 int error = 0;
1c79356b 996
91447636 997 socket_lock(so, 1);
39236c6e
A
998
999 so_update_last_owner_locked(so, p);
1000 so_update_policy(so);
3e170ce0 1001
fe8ab488
A
1002#if NECP
1003 so_update_necp_policy(so, NULL, NULL);
1004#endif /* NECP */
3e170ce0 1005
2d21ac55
A
1006 if (so->so_proto == NULL) {
1007 error = EINVAL;
1008 goto out;
1009 }
1010 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1011 error = EOPNOTSUPP;
1012 goto out;
1013 }
1014
1015 /*
1016 * If the listen request is made on a socket that is not fully
6d2010ae
A
1017 * disconnected, or on a socket that has been marked as inactive,
1018 * reject the request now.
2d21ac55
A
1019 */
1020 if ((so->so_state &
1021 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
1022 (so->so_flags & SOF_DEFUNCT)) {
1023 error = EINVAL;
6d2010ae 1024 if (so->so_flags & SOF_DEFUNCT) {
39037602 1025 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
39236c6e 1026 "(%d)\n", __func__, proc_pid(p),
39037602 1027 proc_best_name(p),
3e170ce0 1028 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1029 SOCK_DOM(so), SOCK_TYPE(so), error);
6d2010ae 1030 }
2d21ac55
A
1031 goto out;
1032 }
1033
39236c6e 1034 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
2d21ac55
A
1035 error = EPERM;
1036 goto out;
1037 }
1038
6d2010ae 1039 error = sflt_listen(so);
39236c6e 1040 if (error == 0)
91447636 1041 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
2d21ac55 1042
1c79356b 1043 if (error) {
91447636
A
1044 if (error == EJUSTRETURN)
1045 error = 0;
2d21ac55 1046 goto out;
1c79356b 1047 }
2d21ac55 1048
91447636 1049 if (TAILQ_EMPTY(&so->so_comp))
1c79356b 1050 so->so_options |= SO_ACCEPTCONN;
2d21ac55
A
1051 /*
1052 * POSIX: The implementation may have an upper limit on the length of
1053 * the listen queue-either global or per accepting socket. If backlog
1054 * exceeds this limit, the length of the listen queue is set to the
1055 * limit.
1056 *
1057 * If listen() is called with a backlog argument value that is less
1058 * than 0, the function behaves as if it had been called with a backlog
1059 * argument value of 0.
1060 *
1061 * A backlog argument of 0 may allow the socket to accept connections,
1062 * in which case the length of the listen queue may be set to an
1063 * implementation-defined minimum value.
1064 */
1065 if (backlog <= 0 || backlog > somaxconn)
1c79356b 1066 backlog = somaxconn;
1c79356b 1067
2d21ac55
A
1068 so->so_qlimit = backlog;
1069out:
91447636 1070 socket_unlock(so, 1);
2d21ac55 1071 return (error);
1c79356b
A
1072}
1073
813fb2f6
A
1074/*
1075 * The "accept list lock" protects the fields related to the listener queues
1076 * because we can unlock a socket to respect the lock ordering between
1077 * the listener socket and its clients sockets. The lock ordering is first to
1078 * acquire the client socket before the listener socket.
1079 *
1080 * The accept list lock serializes access to the following fields:
1081 * - of the listener socket:
1082 * - so_comp
1083 * - so_incomp
1084 * - so_qlen
1085 * - so_inqlen
1086 * - of client sockets that are in so_comp or so_incomp:
1087 * - so_head
1088 * - so_list
1089 *
 1090 * As one can see, the accept list lock protects the consistency of the
 1091 * linkage of the client sockets.
1092 *
1093 * Note that those fields may be read without holding the accept list lock
1094 * for a preflight provided the accept list lock is taken when committing
1095 * to take an action based on the result of the preflight. The preflight
1096 * saves the cost of doing the unlock/lock dance.
1097 */
1098void
1099so_acquire_accept_list(struct socket *head, struct socket *so)
1100{
1101 lck_mtx_t *mutex_held;
1102
1103 if (head->so_proto->pr_getlock == NULL) {
1104 return;
1105 }
5ba3f43e
A
1106 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1107 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
813fb2f6
A
1108
1109 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1110 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1111 return;
1112 }
1113 if (so != NULL) {
1114 socket_unlock(so, 0);
1115 }
1116 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1117 so_accept_list_waits += 1;
1118 msleep((caddr_t)&head->so_incomp, mutex_held,
1119 PSOCK | PCATCH, __func__, NULL);
1120 }
1121 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1122 if (so != NULL) {
1123 socket_unlock(head, 0);
1124 socket_lock(so, 0);
1125 socket_lock(head, 0);
1126 }
1127}
1128
1129void
1130so_release_accept_list(struct socket *head)
1131{
1132 if (head->so_proto->pr_getlock != NULL) {
1133 lck_mtx_t *mutex_held;
1134
1135 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
5ba3f43e
A
1136 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1137
813fb2f6
A
1138 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1139 wakeup((caddr_t)&head->so_incomp);
1140 }
1141}
1142
1c79356b 1143void
2d21ac55 1144sofreelastref(struct socket *so, int dealloc)
9bccf70c 1145{
1c79356b
A
1146 struct socket *head = so->so_head;
1147
2d21ac55 1148 /* Assume socket is locked */
1c79356b 1149
39236c6e 1150 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
0b4e3aa0
A
1151 selthreadclear(&so->so_snd.sb_sel);
1152 selthreadclear(&so->so_rcv.sb_sel);
39236c6e
A
1153 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1154 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
fe8ab488 1155 so->so_event = sonullevent;
1c79356b 1156 return;
0b4e3aa0 1157 }
9bccf70c 1158 if (head != NULL) {
d190cdc3
A
1159 /*
1160 * Need to lock the listener when the protocol has
1161 * per socket locks
1162 */
813fb2f6 1163 if (head->so_proto->pr_getlock != NULL) {
d190cdc3 1164 socket_lock(head, 1);
813fb2f6
A
1165 so_acquire_accept_list(head, so);
1166 }
9bccf70c 1167 if (so->so_state & SS_INCOMP) {
d190cdc3 1168 so->so_state &= ~SS_INCOMP;
9bccf70c
A
1169 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1170 head->so_incqlen--;
d190cdc3
A
1171 head->so_qlen--;
1172 so->so_head = NULL;
813fb2f6
A
1173
1174 if (head->so_proto->pr_getlock != NULL) {
1175 so_release_accept_list(head);
1176 socket_unlock(head, 1);
1177 }
9bccf70c 1178 } else if (so->so_state & SS_COMP) {
813fb2f6
A
1179 if (head->so_proto->pr_getlock != NULL) {
1180 so_release_accept_list(head);
1181 socket_unlock(head, 1);
1182 }
9bccf70c
A
1183 /*
1184 * We must not decommission a socket that's
1185 * on the accept(2) queue. If we do, then
1186 * accept(2) may hang after select(2) indicated
1187 * that the listening socket was ready.
1188 */
9bccf70c
A
1189 selthreadclear(&so->so_snd.sb_sel);
1190 selthreadclear(&so->so_rcv.sb_sel);
39236c6e
A
1191 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1192 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
fe8ab488 1193 so->so_event = sonullevent;
9bccf70c
A
1194 return;
1195 } else {
813fb2f6
A
1196 if (head->so_proto->pr_getlock != NULL) {
1197 so_release_accept_list(head);
5ba3f43e
A
1198 socket_unlock(head, 1);
1199 }
813fb2f6 1200 printf("sofree: not queued\n");
9bccf70c 1201 }
1c79356b 1202 }
39236c6e 1203 sowflush(so);
1c79356b 1204 sorflush(so);
2d21ac55 1205
39236c6e
A
1206#if FLOW_DIVERT
1207 if (so->so_flags & SOF_FLOW_DIVERT) {
1208 flow_divert_detach(so);
1209 }
1210#endif /* FLOW_DIVERT */
1211
91447636
A
1212 /* 3932268: disable upcall */
1213 so->so_rcv.sb_flags &= ~SB_UPCALL;
39037602 1214 so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
fe8ab488 1215 so->so_event = sonullevent;
2d21ac55 1216
91447636
A
1217 if (dealloc)
1218 sodealloc(so);
1c79356b
A
1219}
1220
2d21ac55
A
1221void
1222soclose_wait_locked(struct socket *so)
1223{
1224 lck_mtx_t *mutex_held;
1225
1226 if (so->so_proto->pr_getlock != NULL)
5ba3f43e 1227 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
2d21ac55
A
1228 else
1229 mutex_held = so->so_proto->pr_domain->dom_mtx;
5ba3f43e 1230 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
2d21ac55 1231
4a3eedf9
A
1232 /*
1233 * Double check here and return if there's no outstanding upcall;
1234 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1235 */
316670eb 1236 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
2d21ac55 1237 return;
316670eb
A
1238 so->so_rcv.sb_flags &= ~SB_UPCALL;
1239 so->so_snd.sb_flags &= ~SB_UPCALL;
2d21ac55 1240 so->so_flags |= SOF_CLOSEWAIT;
5ba3f43e 1241
39236c6e 1242 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
2d21ac55 1243 "soclose_wait_locked", NULL);
5ba3f43e 1244 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
2d21ac55
A
1245 so->so_flags &= ~SOF_CLOSEWAIT;
1246}
1247
1c79356b
A
1248/*
1249 * Close a socket on last file table reference removal.
1250 * Initiate disconnect if connected.
1251 * Free socket when disconnect complete.
1252 */
1253int
2d21ac55 1254soclose_locked(struct socket *so)
1c79356b 1255{
1c79356b 1256 int error = 0;
91447636 1257 struct timespec ts;
1c79356b 1258
91447636 1259 if (so->so_usecount == 0) {
2d21ac55 1260 panic("soclose: so=%p refcount=0\n", so);
39236c6e 1261 /* NOTREACHED */
1c79356b
A
1262 }
1263
91447636 1264 sflt_notify(so, sock_evt_closing, NULL);
2d21ac55 1265
39236c6e
A
1266 if (so->so_upcallusecount)
1267 soclose_wait_locked(so);
1268
fe8ab488
A
1269#if CONTENT_FILTER
1270 /*
1271 * We have to wait until the content filters are done
1272 */
1273 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1274 cfil_sock_close_wait(so);
1275 cfil_sock_is_closed(so);
1276 cfil_sock_detach(so);
1277 }
1278#endif /* CONTENT_FILTER */
1279
3e170ce0
A
1280 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1281 soresume(current_proc(), so, 1);
1282 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1283 }
1284
91447636 1285 if ((so->so_options & SO_ACCEPTCONN)) {
813fb2f6
A
1286 struct socket *sp, *sonext;
1287 int persocklock = 0;
1288 int incomp_overflow_only;
2d21ac55
A
1289
1290 /*
1291 * We do not want new connection to be added
1292 * to the connection queues
1293 */
91447636 1294 so->so_options &= ~SO_ACCEPTCONN;
2d21ac55 1295
813fb2f6
A
1296 /*
1297 * We can drop the lock on the listener once
1298 * we've acquired the incoming list
1299 */
1300 if (so->so_proto->pr_getlock != NULL) {
1301 persocklock = 1;
1302 so_acquire_accept_list(so, NULL);
1303 socket_unlock(so, 0);
1304 }
1305again:
1306 incomp_overflow_only = 1;
2d21ac55 1307
813fb2f6 1308 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
39236c6e
A
1309 /*
1310 * Radar 5350314
2d21ac55
A
1311 * skip sockets thrown away by tcpdropdropblreq
1312 * they will get cleanup by the garbage collection.
1313 * otherwise, remove the incomp socket from the queue
1314 * and let soabort trigger the appropriate cleanup.
91447636 1315 */
39236c6e 1316 if (sp->so_flags & SOF_OVERFLOW)
2d21ac55
A
1317 continue;
1318
813fb2f6 1319 if (persocklock != 0)
ff6e181a 1320 socket_lock(sp, 1);
2d21ac55 1321
d190cdc3
A
1322 /*
1323 * Radar 27945981
1324 * The extra reference for the list insure the
1325 * validity of the socket pointer when we perform the
1326 * unlock of the head above
1327 */
2d21ac55
A
1328 if (sp->so_state & SS_INCOMP) {
1329 sp->so_state &= ~SS_INCOMP;
1330 sp->so_head = NULL;
d190cdc3
A
1331 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1332 so->so_incqlen--;
1333 so->so_qlen--;
2d21ac55
A
1334
1335 (void) soabort(sp);
813fb2f6
A
1336 } else {
1337 panic("%s sp %p in so_incomp but !SS_INCOMP",
1338 __func__, sp);
ff6e181a 1339 }
2d21ac55 1340
813fb2f6 1341 if (persocklock != 0)
2d21ac55 1342 socket_unlock(sp, 1);
91447636
A
1343 }
1344
813fb2f6 1345 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
91447636 1346 /* Dequeue from so_comp since sofree() won't do it */
813fb2f6 1347 if (persocklock != 0)
ff6e181a 1348 socket_lock(sp, 1);
ff6e181a 1349
2d21ac55
A
1350 if (sp->so_state & SS_COMP) {
1351 sp->so_state &= ~SS_COMP;
1352 sp->so_head = NULL;
d190cdc3
A
1353 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1354 so->so_qlen--;
2d21ac55
A
1355
1356 (void) soabort(sp);
813fb2f6
A
1357 } else {
1358 panic("%s sp %p in so_comp but !SS_COMP",
1359 __func__, sp);
2d21ac55 1360 }
91447636 1361
813fb2f6 1362 if (persocklock)
91447636 1363 socket_unlock(sp, 1);
ff6e181a 1364 }
813fb2f6
A
1365
1366 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1367#if (DEBUG|DEVELOPMENT)
1368 panic("%s head %p so_comp not empty\n", __func__, so);
1369#endif /* (DEVELOPMENT || DEBUG) */
1370
1371 goto again;
91447636 1372 }
813fb2f6
A
1373
1374 if (!TAILQ_EMPTY(&so->so_comp)) {
1375#if (DEBUG|DEVELOPMENT)
1376 panic("%s head %p so_comp not empty\n", __func__, so);
1377#endif /* (DEVELOPMENT || DEBUG) */
1378
1379 goto again;
1380 }
1381
1382 if (persocklock) {
1383 socket_lock(so, 0);
1384 so_release_accept_list(so);
1385 }
1386 }
39236c6e 1387 if (so->so_pcb == NULL) {
91447636
A
1388 /* 3915887: mark the socket as ready for dealloc */
1389 so->so_flags |= SOF_PCBCLEARING;
1c79356b 1390 goto discard;
91447636 1391 }
1c79356b
A
1392 if (so->so_state & SS_ISCONNECTED) {
1393 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
91447636 1394 error = sodisconnectlocked(so);
1c79356b
A
1395 if (error)
1396 goto drop;
1397 }
1398 if (so->so_options & SO_LINGER) {
813fb2f6
A
1399 lck_mtx_t *mutex_held;
1400
1c79356b
A
1401 if ((so->so_state & SS_ISDISCONNECTING) &&
1402 (so->so_state & SS_NBIO))
1403 goto drop;
2d21ac55 1404 if (so->so_proto->pr_getlock != NULL)
5ba3f43e 1405 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
2d21ac55 1406 else
91447636 1407 mutex_held = so->so_proto->pr_domain->dom_mtx;
1c79356b 1408 while (so->so_state & SS_ISCONNECTED) {
91447636 1409 ts.tv_sec = (so->so_linger/100);
2d21ac55
A
1410 ts.tv_nsec = (so->so_linger % 100) *
1411 NSEC_PER_USEC * 1000 * 10;
1412 error = msleep((caddr_t)&so->so_timeo,
1413 mutex_held, PSOCK | PCATCH, "soclose", &ts);
91447636 1414 if (error) {
2d21ac55
A
1415 /*
1416 * It's OK when the time fires,
1417 * don't report an error
1418 */
91447636
A
1419 if (error == EWOULDBLOCK)
1420 error = 0;
1c79356b 1421 break;
91447636 1422 }
1c79356b
A
1423 }
1424 }
1425 }
1426drop:
39236c6e 1427 if (so->so_usecount == 0) {
2d21ac55 1428 panic("soclose: usecount is zero so=%p\n", so);
39236c6e
A
1429 /* NOTREACHED */
1430 }
1431 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1c79356b
A
1432 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1433 if (error == 0)
1434 error = error2;
1435 }
39236c6e 1436 if (so->so_usecount <= 0) {
2d21ac55 1437 panic("soclose: usecount is zero so=%p\n", so);
39236c6e
A
1438 /* NOTREACHED */
1439 }
1c79356b 1440discard:
39236c6e
A
1441 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1442 (so->so_state & SS_NOFDREF)) {
1c79356b 1443 panic("soclose: NOFDREF");
39236c6e
A
1444 /* NOTREACHED */
1445 }
1c79356b 1446 so->so_state |= SS_NOFDREF;
39236c6e 1447
316670eb
A
1448 if ((so->so_flags & SOF_KNOTE) != 0)
1449 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
39236c6e
A
1450
1451 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1c79356b 1452 evsofree(so);
39236c6e 1453
d190cdc3 1454 VERIFY(so->so_usecount > 0);
91447636 1455 so->so_usecount--;
1c79356b 1456 sofree(so);
1c79356b
A
1457 return (error);
1458}
1459
91447636 1460int
2d21ac55 1461soclose(struct socket *so)
91447636
A
1462{
1463 int error = 0;
1464 socket_lock(so, 1);
2d21ac55 1465
2d21ac55 1466 if (so->so_retaincnt == 0) {
91447636 1467 error = soclose_locked(so);
2d21ac55
A
1468 } else {
1469 /*
1470 * if the FD is going away, but socket is
1471 * retained in kernel remove its reference
1472 */
91447636
A
1473 so->so_usecount--;
1474 if (so->so_usecount < 2)
2d21ac55
A
1475 panic("soclose: retaincnt non null and so=%p "
1476 "usecount=%d\n", so, so->so_usecount);
91447636
A
1477 }
1478 socket_unlock(so, 1);
1479 return (error);
1480}
1481
1c79356b
A
1482/*
1483 * Must be called at splnet...
1484 */
2d21ac55 1485/* Should already be locked */
1c79356b 1486int
2d21ac55 1487soabort(struct socket *so)
1c79356b 1488{
9bccf70c 1489 int error;
1c79356b 1490
91447636 1491#ifdef MORE_LOCKING_DEBUG
2d21ac55 1492 lck_mtx_t *mutex_held;
91447636 1493
2d21ac55 1494 if (so->so_proto->pr_getlock != NULL)
91447636 1495 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2d21ac55 1496 else
91447636 1497 mutex_held = so->so_proto->pr_domain->dom_mtx;
5ba3f43e 1498 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
91447636
A
1499#endif
1500
2d21ac55
A
1501 if ((so->so_flags & SOF_ABORTED) == 0) {
1502 so->so_flags |= SOF_ABORTED;
1503 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1504 if (error) {
1505 sofree(so);
1506 return (error);
1507 }
9bccf70c
A
1508 }
1509 return (0);
1c79356b
A
1510}
1511
1512int
2d21ac55 1513soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
9bccf70c 1514{
1c79356b 1515 int error;
91447636 1516
2d21ac55
A
1517 if (dolock)
1518 socket_lock(so, 1);
1c79356b 1519
39236c6e
A
1520 so_update_last_owner_locked(so, PROC_NULL);
1521 so_update_policy(so);
fe8ab488
A
1522#if NECP
1523 so_update_necp_policy(so, NULL, NULL);
1524#endif /* NECP */
39236c6e 1525
1c79356b
A
1526 if ((so->so_state & SS_NOFDREF) == 0)
1527 panic("soaccept: !NOFDREF");
1528 so->so_state &= ~SS_NOFDREF;
1529 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
2d21ac55
A
1530
1531 if (dolock)
1532 socket_unlock(so, 1);
1c79356b
A
1533 return (error);
1534}
2d21ac55 1535
/*
 * Locking wrapper for soacceptlock().
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return (soacceptlock(so, nam, 1));
}
1c79356b
A
1541
1542int
d190cdc3 1543soacceptfilter(struct socket *so, struct socket *head)
2d21ac55
A
1544{
1545 struct sockaddr *local = NULL, *remote = NULL;
6d2010ae 1546 int error = 0;
2d21ac55
A
1547
1548 /*
39236c6e
A
1549 * Hold the lock even if this socket has not been made visible
1550 * to the filter(s). For sockets with global locks, this protects
1551 * against the head or peer going away
2d21ac55 1552 */
b0d623f7
A
1553 socket_lock(so, 1);
1554 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1555 sogetaddr_locked(so, &local, 0) != 0) {
d190cdc3 1556 so->so_state &= ~SS_NOFDREF;
b0d623f7 1557 socket_unlock(so, 1);
2d21ac55
A
1558 soclose(so);
1559 /* Out of resources; try it again next time */
1560 error = ECONNABORTED;
1561 goto done;
1562 }
1563
6d2010ae 1564 error = sflt_accept(head, so, local, remote);
2d21ac55
A
1565
1566 /*
1567 * If we get EJUSTRETURN from one of the filters, mark this socket
1568 * as inactive and return it anyway. This newly accepted socket
1569 * will be disconnected later before we hand it off to the caller.
1570 */
1571 if (error == EJUSTRETURN) {
1572 error = 0;
6d2010ae
A
1573 (void) sosetdefunct(current_proc(), so,
1574 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
2d21ac55
A
1575 }
1576
1577 if (error != 0) {
1578 /*
1579 * This may seem like a duplication to the above error
1580 * handling part when we return ECONNABORTED, except
1581 * the following is done while holding the lock since
1582 * the socket has been exposed to the filter(s) earlier.
1583 */
5ba3f43e 1584 so->so_state &= ~SS_NOFDREF;
2d21ac55
A
1585 socket_unlock(so, 1);
1586 soclose(so);
1587 /* Propagate socket filter's error code to the caller */
1588 } else {
1589 socket_unlock(so, 1);
1590 }
1591done:
1592 /* Callee checks for NULL pointer */
1593 sock_freeaddr(remote);
1594 sock_freeaddr(local);
1595 return (error);
1596}
1c79356b 1597
2d21ac55
A
1598/*
1599 * Returns: 0 Success
1600 * EOPNOTSUPP Operation not supported on socket
1601 * EISCONN Socket is connected
1602 * <pru_connect>:EADDRNOTAVAIL Address not available.
1603 * <pru_connect>:EINVAL Invalid argument
1604 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1605 * <pru_connect>:EACCES Permission denied
1606 * <pru_connect>:EADDRINUSE Address in use
1607 * <pru_connect>:EAGAIN Resource unavailable, try again
1608 * <pru_connect>:EPERM Operation not permitted
1609 * <sf_connect_out>:??? [anything a filter writer might set]
1610 */
1611int
1612soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1c79356b 1613{
1c79356b
A
1614 int error;
1615 struct proc *p = current_proc();
1c79356b 1616
2d21ac55
A
1617 if (dolock)
1618 socket_lock(so, 1);
39236c6e
A
1619
1620 so_update_last_owner_locked(so, p);
1621 so_update_policy(so);
1622
fe8ab488
A
1623#if NECP
1624 so_update_necp_policy(so, NULL, nam);
1625#endif /* NECP */
1626
2d21ac55
A
1627 /*
1628 * If this is a listening socket or if this is a previously-accepted
1629 * socket that has been marked as inactive, reject the connect request.
1630 */
1631 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
6d2010ae
A
1632 error = EOPNOTSUPP;
1633 if (so->so_flags & SOF_DEFUNCT) {
39037602 1634 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
39236c6e 1635 "(%d)\n", __func__, proc_pid(p),
39037602 1636 proc_best_name(p),
3e170ce0 1637 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1638 SOCK_DOM(so), SOCK_TYPE(so), error);
6d2010ae 1639 }
2d21ac55
A
1640 if (dolock)
1641 socket_unlock(so, 1);
6d2010ae 1642 return (error);
91447636 1643 }
2d21ac55 1644
39236c6e 1645 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
2d21ac55
A
1646 if (dolock)
1647 socket_unlock(so, 1);
1648 return (EPERM);
1649 }
1650
1c79356b
A
1651 /*
1652 * If protocol is connection-based, can only connect once.
1653 * Otherwise, if connected, try to disconnect first.
1654 * This allows user to disconnect by connecting to, e.g.,
1655 * a null address.
1656 */
1657 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1658 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
2d21ac55 1659 (error = sodisconnectlocked(so)))) {
1c79356b 1660 error = EISCONN;
2d21ac55 1661 } else {
91447636
A
1662 /*
1663 * Run connect filter before calling protocol:
1664 * - non-blocking connect returns before completion;
1665 */
6d2010ae 1666 error = sflt_connectout(so, nam);
39236c6e 1667 if (error != 0) {
91447636
A
1668 if (error == EJUSTRETURN)
1669 error = 0;
6d2010ae 1670 } else {
39236c6e
A
1671 error = (*so->so_proto->pr_usrreqs->pru_connect)
1672 (so, nam, p);
91447636 1673 }
1c79356b 1674 }
2d21ac55
A
1675 if (dolock)
1676 socket_unlock(so, 1);
1c79356b
A
1677 return (error);
1678}
1679
/*
 * Locking wrapper for soconnectlock().
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return (soconnectlock(so, nam, 1));
}
1685
2d21ac55
A
1686/*
1687 * Returns: 0 Success
1688 * <pru_connect2>:EINVAL[AF_UNIX]
1689 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1690 * <pru_connect2>:??? [other protocol families]
1691 *
1692 * Notes: <pru_connect2> is not supported by [TCP].
1693 */
1c79356b 1694int
2d21ac55 1695soconnect2(struct socket *so1, struct socket *so2)
1c79356b 1696{
1c79356b 1697 int error;
91447636 1698
0c530ab8 1699 socket_lock(so1, 1);
2d21ac55 1700 if (so2->so_proto->pr_lock)
0c530ab8 1701 socket_lock(so2, 1);
1c79356b
A
1702
1703 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
2d21ac55 1704
0c530ab8 1705 socket_unlock(so1, 1);
2d21ac55 1706 if (so2->so_proto->pr_lock)
0c530ab8 1707 socket_unlock(so2, 1);
1c79356b
A
1708 return (error);
1709}
1710
39236c6e 1711int
813fb2f6
A
1712soconnectxlocked(struct socket *so, struct sockaddr *src,
1713 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
3e170ce0
A
1714 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1715 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
39236c6e
A
1716{
1717 int error;
1718
fe8ab488
A
1719 so_update_last_owner_locked(so, p);
1720 so_update_policy(so);
3e170ce0 1721
39236c6e
A
1722 /*
1723 * If this is a listening socket or if this is a previously-accepted
1724 * socket that has been marked as inactive, reject the connect request.
1725 */
1726 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1727 error = EOPNOTSUPP;
1728 if (so->so_flags & SOF_DEFUNCT) {
39037602 1729 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
39236c6e 1730 "(%d)\n", __func__, proc_pid(p),
39037602 1731 proc_best_name(p),
3e170ce0 1732 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1733 SOCK_DOM(so), SOCK_TYPE(so), error);
39236c6e
A
1734 }
1735 return (error);
1736 }
1737
1738 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1739 return (EPERM);
1740
1741 /*
1742 * If protocol is connection-based, can only connect once
1743 * unless PR_MULTICONN is set. Otherwise, if connected,
1744 * try to disconnect first. This allows user to disconnect
1745 * by connecting to, e.g., a null address.
1746 */
1747 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1748 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1749 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1750 (error = sodisconnectlocked(so)) != 0)) {
1751 error = EISCONN;
1752 } else {
1753 /*
1754 * Run connect filter before calling protocol:
1755 * - non-blocking connect returns before completion;
1756 */
813fb2f6 1757 error = sflt_connectout(so, dst);
39236c6e 1758 if (error != 0) {
490019cf
A
1759 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1760 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
39236c6e
A
1761 if (error == EJUSTRETURN)
1762 error = 0;
1763 } else {
1764 error = (*so->so_proto->pr_usrreqs->pru_connectx)
813fb2f6 1765 (so, src, dst, p, ifscope, aid, pcid,
3e170ce0 1766 flags, arg, arglen, auio, bytes_written);
39236c6e
A
1767 }
1768 }
1769
1770 return (error);
1771}
1772
1c79356b 1773int
2d21ac55 1774sodisconnectlocked(struct socket *so)
1c79356b 1775{
1c79356b 1776 int error;
1c79356b
A
1777
1778 if ((so->so_state & SS_ISCONNECTED) == 0) {
1779 error = ENOTCONN;
1780 goto bad;
1781 }
1782 if (so->so_state & SS_ISDISCONNECTING) {
1783 error = EALREADY;
1784 goto bad;
1785 }
2d21ac55 1786
1c79356b 1787 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
39236c6e 1788 if (error == 0)
91447636 1789 sflt_notify(so, sock_evt_disconnected, NULL);
39236c6e 1790
1c79356b 1791bad:
1c79356b
A
1792 return (error);
1793}
2d21ac55
A
1794
/* Locking version */
int
sodisconnect(struct socket *so)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectlocked(so);
	socket_unlock(so, 1);
	return (error);
}
1c79356b 1806
39236c6e 1807int
3e170ce0 1808sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
39236c6e
A
1809{
1810 int error;
1811
1812 /*
1813 * Call the protocol disconnectx handler; let it handle all
1814 * matters related to the connection state of this session.
1815 */
1816 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1817 if (error == 0) {
1818 /*
1819 * The event applies only for the session, not for
1820 * the disconnection of individual subflows.
1821 */
1822 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1823 sflt_notify(so, sock_evt_disconnected, NULL);
1824 }
1825 return (error);
1826}
1827
1828int
3e170ce0 1829sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
39236c6e
A
1830{
1831 int error;
1832
1833 socket_lock(so, 1);
1834 error = sodisconnectxlocked(so, aid, cid);
1835 socket_unlock(so, 1);
1836 return (error);
1837}
1838
39236c6e 1839#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
91447636
A
1840
1841/*
1842 * sosendcheck will lock the socket buffer if it isn't locked and
1843 * verify that there is space for the data being inserted.
2d21ac55
A
1844 *
1845 * Returns: 0 Success
1846 * EPIPE
1847 * sblock:EWOULDBLOCK
1848 * sblock:EINTR
1849 * sbwait:EBADF
1850 * sbwait:EINTR
1851 * [so_error]:???
91447636 1852 */
39236c6e
A
1853int
1854sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1855 int32_t clen, int32_t atomic, int flags, int *sblocked,
1856 struct mbuf *control)
91447636 1857{
39236c6e 1858 int error = 0;
b0d623f7 1859 int32_t space;
3a60a9f5 1860 int assumelock = 0;
91447636
A
1861
1862restart:
1863 if (*sblocked == 0) {
3a60a9f5 1864 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
2d21ac55
A
1865 so->so_send_filt_thread != 0 &&
1866 so->so_send_filt_thread == current_thread()) {
3a60a9f5
A
1867 /*
1868 * We're being called recursively from a filter,
1869 * allow this to continue. Radar 4150520.
1870 * Don't set sblocked because we don't want
1871 * to perform an unlock later.
1872 */
1873 assumelock = 1;
2d21ac55 1874 } else {
3a60a9f5
A
1875 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1876 if (error) {
6d2010ae
A
1877 if (so->so_flags & SOF_DEFUNCT)
1878 goto defunct;
2d21ac55 1879 return (error);
3a60a9f5
A
1880 }
1881 *sblocked = 1;
1882 }
91447636 1883 }
2d21ac55
A
1884
1885 /*
6d2010ae
A
1886 * If a send attempt is made on a socket that has been marked
1887 * as inactive (disconnected), reject the request.
2d21ac55 1888 */
6d2010ae
A
1889 if (so->so_flags & SOF_DEFUNCT) {
1890defunct:
1891 error = EPIPE;
39037602
A
1892 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1893 __func__, proc_selfpid(), proc_best_name(current_proc()),
3e170ce0 1894 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1895 SOCK_DOM(so), SOCK_TYPE(so), error);
6d2010ae
A
1896 return (error);
1897 }
2d21ac55 1898
fe8ab488
A
1899 if (so->so_state & SS_CANTSENDMORE) {
1900#if CONTENT_FILTER
1901 /*
1902 * Can re-inject data of half closed connections
1903 */
1904 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1905 so->so_snd.sb_cfil_thread == current_thread() &&
1906 cfil_sock_data_pending(&so->so_snd) != 0)
1907 CFIL_LOG(LOG_INFO,
1908 "so %llx ignore SS_CANTSENDMORE",
3e170ce0 1909 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
fe8ab488
A
1910 else
1911#endif /* CONTENT_FILTER */
1912 return (EPIPE);
1913 }
91447636
A
1914 if (so->so_error) {
1915 error = so->so_error;
1916 so->so_error = 0;
2d21ac55 1917 return (error);
91447636 1918 }
2d21ac55 1919
91447636 1920 if ((so->so_state & SS_ISCONNECTED) == 0) {
2d21ac55 1921 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
fe8ab488 1922 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
3e170ce0 1923 (resid != 0 || clen == 0) &&
5ba3f43e 1924 !(so->so_flags1 & SOF1_PRECONNECT_DATA))
2d21ac55 1925 return (ENOTCONN);
5ba3f43e 1926
2d21ac55
A
1927 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1928 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1929 ENOTCONN : EDESTADDRREQ);
1930 }
91447636 1931 }
3e170ce0 1932
39236c6e
A
1933 if (so->so_flags & SOF_ENABLE_MSGS)
1934 space = msgq_sbspace(so, control);
1935 else
1936 space = sbspace(&so->so_snd);
1937
91447636
A
1938 if (flags & MSG_OOB)
1939 space += 1024;
1940 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2d21ac55
A
1941 clen > so->so_snd.sb_hiwat)
1942 return (EMSGSIZE);
39236c6e 1943
316670eb 1944 if ((space < resid + clen &&
3e170ce0
A
1945 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1946 space < clen)) ||
316670eb 1947 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
3e170ce0
A
1948 /*
1949 * don't block the connectx call when there's more data
1950 * than can be copied.
1951 */
1952 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1953 if (space == 0) {
1954 return (EWOULDBLOCK);
1955 }
1956 if (space < (int32_t)so->so_snd.sb_lowat) {
1957 return (0);
1958 }
1959 }
2d21ac55
A
1960 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1961 assumelock) {
1962 return (EWOULDBLOCK);
3a60a9f5 1963 }
39236c6e 1964 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
6d2010ae 1965 *sblocked = 0;
91447636
A
1966 error = sbwait(&so->so_snd);
1967 if (error) {
6d2010ae
A
1968 if (so->so_flags & SOF_DEFUNCT)
1969 goto defunct;
2d21ac55 1970 return (error);
91447636
A
1971 }
1972 goto restart;
1973 }
2d21ac55 1974 return (0);
91447636
A
1975}
1976
/*
 * Send on a socket.
 * If send must go all at once and message is larger than
 * send buffering, then hard error.
 * Lock against other senders.
 * If must go all at once and not enough room now, then
 * inform user that this would block and do nothing.
 * Otherwise, if nonblocking, send as much as possible.
 * The data to be sent is described by "uio" if nonzero,
 * otherwise by the mbuf chain "top" (which must be null
 * if uio is not). Data provided in mbuf chain must be small
 * enough to send all at once.
 *
 * Returns nonzero on error, timeout or signal; callers
 * must check for short counts if EINTR/ERESTART are returned.
 * Data and control buffers are freed on return.
 * Experiment:
 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
 * point at the mbuf chain being constructed and go from there.
 *
 * Returns:	0			Success
 *		EOPNOTSUPP
 *		EINVAL
 *		ENOBUFS
 *	uiomove:EFAULT
 *	sosendcheck:EPIPE
 *	sosendcheck:EWOULDBLOCK
 *	sosendcheck:EINTR
 *	sosendcheck:EBADF
 *	sosendcheck:EINTR
 *	sosendcheck:???			[value from so_error]
 *	<pru_send>:ECONNRESET[TCP]
 *	<pru_send>:EINVAL[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:EADDRINUSE[TCP]
 *	<pru_send>:EADDRNOTAVAIL[TCP]
 *	<pru_send>:EAFNOSUPPORT[TCP]
 *	<pru_send>:EACCES[TCP]
 *	<pru_send>:EAGAIN[TCP]
 *	<pru_send>:EPERM[TCP]
 *	<pru_send>:EMSGSIZE[TCP]
 *	<pru_send>:EHOSTUNREACH[TCP]
 *	<pru_send>:ENETUNREACH[TCP]
 *	<pru_send>:ENETDOWN[TCP]
 *	<pru_send>:ENOMEM[TCP]
 *	<pru_send>:ENOBUFS[TCP]
 *	<pru_send>:???[TCP]		[ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_send>:EINVAL[AF_UNIX]
 *	<pru_send>:EOPNOTSUPP[AF_UNIX]
 *	<pru_send>:EPIPE[AF_UNIX]
 *	<pru_send>:ENOTCONN[AF_UNIX]
 *	<pru_send>:EISCONN[AF_UNIX]
 *	<pru_send>:???[AF_UNIX]		[whatever a filter author chooses]
 *	<sf_data_out>:???		[whatever a filter author chooses]
 *
 * Notes:	Other <pru_send> returns depend on the protocol family; all
 *		<sf_data_out> returns depend on what the filter author causes
 *		their filter to return.
 */
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags)
{
	struct mbuf **mp;
	struct mbuf *m, *freelist = NULL;
	user_ssize_t space, len, resid, orig_resid;
	int clen = 0, error, dontroute, mlen, sendflags;
	int atomic = sosendallatonce(so) || top;
	int sblocked = 0;
	struct proc *p = current_proc();
	struct mbuf *control_copy = NULL;
	uint16_t headroom = 0;
	boolean_t en_tracing = FALSE;

	/* Total byte count comes from the uio or the prepackaged chain */
	if (uio != NULL)
		resid = uio_resid(uio);
	else
		resid = top->m_pkthdr.len;

	KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	socket_lock(so, 1);

	/*
	 * trace if tracing & network (vs. unix) sockets & and
	 * non-loopback
	 */
	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
			    (int64_t)resid);
			orig_resid = resid;
		}
	}

	/*
	 * Re-injection should not affect process accounting
	 */
	if ((flags & MSG_SKIPCFIL) == 0) {
		so_update_last_owner_locked(so, p);
		so_update_policy(so);

#if NECP
		so_update_necp_policy(so, NULL, addr);
#endif /* NECP */
	}

	/* MSG_OOB is only meaningful on stream sockets */
	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
		error = EOPNOTSUPP;
		goto out_locked;
	}

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid. On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
	 * But it will be used by sockets doing message delivery.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
	    !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
		error = EINVAL;
		goto out_locked;
	}

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	if (control != NULL)
		clen = control->m_len;

	if (soreserveheadroom != 0)
		headroom = so->so_pktheadroom;

	do {
		/* Wait for (and verify) send-buffer space; may sleep */
		error = sosendcheck(so, addr, resid, clen, atomic, flags,
		    &sblocked, control);
		if (error)
			goto out_locked;

		mp = &top;
		if (so->so_flags & SOF_ENABLE_MSGS)
			space = msgq_sbspace(so, control);
		else
			space = sbspace(&so->so_snd) - clen;
		space += ((flags & MSG_OOB) ? 1024 : 0);

		do {
			if (uio == NULL) {
				/*
				 * Data is prepackaged in "top".
				 */
				resid = 0;
				if (flags & MSG_EOR)
					top->m_flags |= M_EOR;
			} else {
				int chainlength;
				int bytes_to_copy;
				boolean_t jumbocl;
				boolean_t bigcl;
				int bytes_to_alloc;

				bytes_to_copy = imin(resid, space);

				bytes_to_alloc = bytes_to_copy;
				if (top == NULL)
					bytes_to_alloc += headroom;

				if (sosendminchain > 0)
					chainlength = 0;
				else
					chainlength = sosendmaxchain;

				/*
				 * Use big 4 KB cluster when the outgoing interface
				 * does not prefer 2 KB clusters
				 */
				bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
				    sosendbigcl_ignore_capab;

				/*
				 * Attempt to use larger than system page-size
				 * clusters for large writes only if there is
				 * a jumbo cluster pool and if the socket is
				 * marked accordingly.
				 */
				jumbocl = sosendjcl && njcl > 0 &&
				    ((so->so_flags & SOF_MULTIPAGES) ||
				    sosendjcl_ignore_capab) &&
				    bigcl;

				/*
				 * Drop the socket lock while allocating and
				 * copying from userspace; re-taken below.
				 */
				socket_unlock(so, 0);

				do {
					int num_needed;
					int hdrs_needed = (top == NULL) ? 1 : 0;

					/*
					 * try to maintain a local cache of mbuf
					 * clusters needed to complete this
					 * write the list is further limited to
					 * the number that are currently needed
					 * to fill the socket this mechanism
					 * allows a large number of mbufs/
					 * clusters to be grabbed under a single
					 * mbuf lock... if we can't get any
					 * clusters, than fall back to trying
					 * for mbufs if we fail early (or
					 * miscalcluate the number needed) make
					 * sure to release any clusters we
					 * haven't yet consumed.
					 */
					if (freelist == NULL &&
					    bytes_to_alloc > MBIGCLBYTES &&
					    jumbocl) {
						num_needed =
						    bytes_to_alloc / M16KCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * M16KCLBYTES))
						    >= MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    M16KCLBYTES);
						/*
						 * Fall back to 4K cluster size
						 * if allocation failed
						 */
					}

					if (freelist == NULL &&
					    bytes_to_alloc > MCLBYTES &&
					    bigcl) {
						num_needed =
						    bytes_to_alloc / MBIGCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MBIGCLBYTES)) >=
						    MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    MBIGCLBYTES);
						/*
						 * Fall back to cluster size
						 * if allocation failed
						 */
					}

					/*
					 * Allocate a cluster as we want to
					 * avoid to split the data in more
					 * that one segment and using MINCLSIZE
					 * would lead us to allocate two mbufs
					 */
					if (soreserveheadroom != 0 &&
					    freelist == NULL &&
					    ((top == NULL &&
					    bytes_to_alloc > _MHLEN) ||
					    bytes_to_alloc > _MLEN)) {
						num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
						    MCLBYTES;
						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					} else if (freelist == NULL &&
					    bytes_to_alloc > MINCLSIZE) {
						num_needed =
						    bytes_to_alloc / MCLBYTES;

						if ((bytes_to_alloc -
						    (num_needed * MCLBYTES)) >=
						    MINCLSIZE)
							num_needed++;

						freelist =
						    m_getpackets_internal(
						    (unsigned int *)&num_needed,
						    hdrs_needed, M_WAIT, 0,
						    MCLBYTES);
						/*
						 * Fall back to a single mbuf
						 * if allocation failed
						 */
					}
					/*
					 * For datagram protocols, leave
					 * headroom for protocol headers
					 * in the first cluster of the chain
					 */
					if (freelist != NULL && atomic &&
					    top == NULL && headroom > 0) {
						freelist->m_data += headroom;
					}

					/*
					 * Fall back to regular mbufs without
					 * reserving the socket headroom
					 */
					if (freelist == NULL) {
						if (top == NULL)
							MGETHDR(freelist,
							    M_WAIT, MT_DATA);
						else
							MGET(freelist,
							    M_WAIT, MT_DATA);

						if (freelist == NULL) {
							error = ENOBUFS;
							socket_lock(so, 0);
							goto out_locked;
						}
						/*
						 * For datagram protocols,
						 * leave room for protocol
						 * headers in first mbuf.
						 */
						if (atomic && top == NULL &&
						    bytes_to_copy < MHLEN) {
							MH_ALIGN(freelist,
							    bytes_to_copy);
						}
					}
					/* Pop the next mbuf off the freelist */
					m = freelist;
					freelist = m->m_next;
					m->m_next = NULL;

					if ((m->m_flags & M_EXT))
						mlen = m->m_ext.ext_size -
						    m_leadingspace(m);
					else if ((m->m_flags & M_PKTHDR))
						mlen =
						    MHLEN - m_leadingspace(m);
					else
						mlen = MLEN - m_leadingspace(m);
					len = imin(mlen, bytes_to_copy);

					chainlength += len;

					space -= len;

					/* Copy user data into this mbuf */
					error = uiomove(mtod(m, caddr_t),
					    len, uio);

					resid = uio_resid(uio);

					m->m_len = len;
					*mp = m;
					top->m_pkthdr.len += len;
					if (error)
						break;
					mp = &m->m_next;
					if (resid <= 0) {
						if (flags & MSG_EOR)
							top->m_flags |= M_EOR;
						break;
					}
					bytes_to_copy = min(resid, space);

				} while (space > 0 &&
				    (chainlength < sosendmaxchain || atomic ||
				    resid < MINCLSIZE));

				socket_lock(so, 0);

				if (error)
					goto out_locked;
			}

			if (flags & (MSG_HOLD|MSG_SEND)) {
				/* Enqueue for later, go away if HOLD */
				struct mbuf *mb1;
				if (so->so_temp && (flags & MSG_FLUSH)) {
					m_freem(so->so_temp);
					so->so_temp = NULL;
				}
				if (so->so_temp)
					so->so_tail->m_next = top;
				else
					so->so_temp = top;
				mb1 = top;
				while (mb1->m_next)
					mb1 = mb1->m_next;
				so->so_tail = mb1;
				if (flags & MSG_HOLD) {
					top = NULL;
					goto out_locked;
				}
				top = so->so_temp;
			}
			if (dontroute)
				so->so_options |= SO_DONTROUTE;

			/*
			 * Compute flags here, for pru_send and NKEs
			 *
			 * If the user set MSG_EOF, the protocol
			 * understands this flag and nothing left to
			 * send then use PRU_SEND_EOF instead of PRU_SEND.
			 */
			sendflags = (flags & MSG_OOB) ? PRUS_OOB :
			    ((flags & MSG_EOF) &&
			    (so->so_proto->pr_flags & PR_IMPLOPCL) &&
			    (resid <= 0)) ? PRUS_EOF :
			    /* If there is more to send set PRUS_MORETOCOME */
			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;

			if ((flags & MSG_SKIPCFIL) == 0) {
				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, addr, &top,
				    &control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					/*
					 * EJUSTRETURN means the filter
					 * swallowed the data: not an error,
					 * but nothing left for us to free.
					 */
					if (error == EJUSTRETURN) {
						error = 0;
						clen = 0;
						control = NULL;
						top = NULL;
					}
					goto out_locked;
				}
#if CONTENT_FILTER
				/*
				 * Content filter processing
				 */
				error = cfil_sock_data_out(so, addr, top,
				    control, (sendflags & MSG_OOB) ?
				    sock_data_filt_flag_oob : 0);
				if (error) {
					if (error == EJUSTRETURN) {
						error = 0;
						clen = 0;
						control = NULL;
						top = NULL;
					}
					goto out_locked;
				}
#endif /* CONTENT_FILTER */
			}
			if (so->so_flags & SOF_ENABLE_MSGS) {
				/*
				 * Make a copy of control mbuf,
				 * so that msg priority can be
				 * passed to subsequent mbufs.
				 */
				control_copy = m_dup(control, M_NOWAIT);
			}
			/* Hand the chain to the protocol; it consumes top/control */
			error = (*so->so_proto->pr_usrreqs->pru_send)
			    (so, sendflags, top, addr, control, p);

			if (flags & MSG_SEND)
				so->so_temp = NULL;

			if (dontroute)
				so->so_options &= ~SO_DONTROUTE;

			clen = 0;
			control = control_copy;
			control_copy = NULL;
			top = NULL;
			mp = &top;
			if (error)
				goto out_locked;
		} while (resid && space > 0);
	} while (resid);

out_locked:
	if (sblocked)
		sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
	else
		socket_unlock(so, 1);
	if (top != NULL)
		m_freem(top);
	if (control != NULL)
		m_freem(control);
	if (freelist != NULL)
		m_freem_list(freelist);
	if (control_copy != NULL)
		m_freem(control_copy);

	soclearfastopen(so);

	if (en_tracing) {
		/* resid passed here is the bytes left in uio */
		KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - resid));
	}
	KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, space, error);

	return (error);
}
2502
/*
 * Batch send of multiple datagrams described by an array of uios.
 *
 * Supported only connected sockets (no address) without ancillary data
 * (control mbuf) for atomic protocols.
 *
 * Parameters:
 *	so		SOCK_DGRAM socket whose protocol implements
 *			pru_send_list
 *	uioarray	Array of uios, one per datagram to send
 *	uiocnt		Number of entries in uioarray
 *	flags		Only MSG_DONTWAIT and MSG_NBIO are accepted
 *
 * Returns:	0 on success; EINVAL / EPROTONOSUPPORT for unsupported
 *		sockets or flags; ENOMEM on mbuf exhaustion; otherwise
 *		errors from sosendcheck(), uiomove(), the socket/content
 *		filters, or the protocol's pru_send_list handler.
 */
int
sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
{
	struct mbuf *m, *freelist = NULL;
	user_ssize_t len, resid;
	int error, dontroute, mlen;
	int atomic = sosendallatonce(so);
	int sblocked = 0;
	struct proc *p = current_proc();
	u_int uiofirst = 0;
	u_int uiolast = 0;
	struct mbuf *top = NULL;
	uint16_t headroom = 0;
	boolean_t bigcl;

	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
	    so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);

	/* Only atomic datagram sockets with a list-send handler qualify */
	if (so->so_type != SOCK_DGRAM) {
		error = EINVAL;
		goto out;
	}
	if (atomic == 0) {
		error = EINVAL;
		goto out;
	}
	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
		error = EINVAL;
		goto out;
	}
	resid = uio_array_resid(uioarray, uiocnt);

	/*
	 * In theory resid should be unsigned.
	 * However, space must be signed, as it might be less than 0
	 * if we over-committed, and we must use a signed comparison
	 * of space and resid. On the other hand, a negative resid
	 * causes us to loop sending 0-length segments to the protocol.
	 *
	 * Note: We limit resid to be a positive int value as we use
	 * imin() to set bytes_to_copy -- radr://14558484
	 */
	if (resid < 0 || resid > INT_MAX) {
		error = EINVAL;
		goto out;
	}

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	dontroute = (flags & MSG_DONTROUTE) &&
	    (so->so_options & SO_DONTROUTE) == 0 &&
	    (so->so_proto->pr_flags & PR_ATOMIC);
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

	error = sosendcheck(so, NULL, resid, 0, atomic, flags,
	    &sblocked, NULL);
	if (error)
		goto release;

	/*
	 * Use big 4 KB clusters when the outgoing interface does not prefer
	 * 2 KB clusters
	 */
	bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;

	if (soreserveheadroom != 0)
		headroom = so->so_pktheadroom;

	do {
		int i;
		int num_needed = 0;
		int chainlength;
		size_t maxpktlen = 0;
		int bytes_to_alloc;

		if (sosendminchain > 0)
			chainlength = 0;
		else
			chainlength = sosendmaxchain;

		/* Drop the socket lock while allocating and copying */
		socket_unlock(so, 0);

		/*
		 * Find a set of uio that fit in a reasonable number
		 * of mbuf packets
		 */
		for (i = uiofirst; i < uiocnt; i++) {
			struct uio *auio = uioarray[i];

			len = uio_resid(auio);

			/* Do nothing for empty messages */
			if (len == 0)
				continue;

			num_needed += 1;
			uiolast += 1;

			if (len > maxpktlen)
				maxpktlen = len;

			chainlength += len;
			if (chainlength > sosendmaxchain)
				break;
		}
		/*
		 * Nothing left to send
		 */
		if (num_needed == 0) {
			socket_lock(so, 0);
			break;
		}
		/*
		 * Allocate buffer large enough to include headroom space for
		 * network and link header
		 *
		 */
		bytes_to_alloc = maxpktlen + headroom;

		/*
		 * Allocate a single contiguous buffer of the smallest available
		 * size when possible
		 */
		if (bytes_to_alloc > MCLBYTES &&
		    bytes_to_alloc <= MBIGCLBYTES && bigcl) {
			freelist = m_getpackets_internal(
			    (unsigned int *)&num_needed,
			    num_needed, M_WAIT, 1,
			    MBIGCLBYTES);
		} else if (bytes_to_alloc > _MHLEN &&
		    bytes_to_alloc <= MCLBYTES) {
			freelist = m_getpackets_internal(
			    (unsigned int *)&num_needed,
			    num_needed, M_WAIT, 1,
			    MCLBYTES);
		} else {
			freelist = m_allocpacket_internal(
			    (unsigned int *)&num_needed,
			    bytes_to_alloc, NULL, M_WAIT, 1, 0);
		}

		if (freelist == NULL) {
			socket_lock(so, 0);
			error = ENOMEM;
			goto release;
		}
		/*
		 * Copy each uio of the set into its own mbuf packet
		 */
		for (i = uiofirst, m = freelist;
		    i < uiolast && m != NULL;
		    i++) {
			int bytes_to_copy;
			struct mbuf *n;
			struct uio *auio = uioarray[i];

			bytes_to_copy = uio_resid(auio);

			/* Do nothing for empty messages */
			if (bytes_to_copy == 0)
				continue;
			/*
			 * Leave headroom for protocol headers
			 * in the first mbuf of the chain
			 */
			m->m_data += headroom;

			for (n = m; n != NULL; n = n->m_next) {
				if ((m->m_flags & M_EXT))
					mlen = m->m_ext.ext_size -
					    m_leadingspace(m);
				else if ((m->m_flags & M_PKTHDR))
					mlen =
					    MHLEN - m_leadingspace(m);
				else
					mlen = MLEN - m_leadingspace(m);
				len = imin(mlen, bytes_to_copy);

				/*
				 * Note: uiomove() decrements the iovec
				 * length
				 */
				error = uiomove(mtod(n, caddr_t),
				    len, auio);
				if (error != 0)
					break;
				n->m_len = len;
				m->m_pkthdr.len += len;

				VERIFY(m->m_pkthdr.len <= maxpktlen);

				bytes_to_copy -= len;
				resid -= len;
			}
			if (m->m_pkthdr.len == 0) {
				printf(
				    "%s:%d so %llx pkt %llx type %u len null\n",
				    __func__, __LINE__,
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
				    m->m_type);
			}
			if (error != 0)
				break;
			m = m->m_nextpkt;
		}

		socket_lock(so, 0);

		if (error)
			goto release;
		top = freelist;
		freelist = NULL;

		if (dontroute)
			so->so_options |= SO_DONTROUTE;

		if ((flags & MSG_SKIPCFIL) == 0) {
			struct mbuf **prevnextp = NULL;

			for (i = uiofirst, m = top;
			    i < uiolast && m != NULL;
			    i++) {
				struct mbuf *nextpkt = m->m_nextpkt;

				/*
				 * Socket filter processing
				 */
				error = sflt_data_out(so, NULL, &m,
				    NULL, 0);
				if (error != 0 && error != EJUSTRETURN)
					goto release;

#if CONTENT_FILTER
				if (error == 0) {
					/*
					 * Content filter processing
					 */
					error = cfil_sock_data_out(so, NULL, m,
					    NULL, 0);
					if (error != 0 && error != EJUSTRETURN)
						goto release;
				}
#endif /* CONTENT_FILTER */
				/*
				 * Remove packet from the list when
				 * swallowed by a filter
				 */
				if (error == EJUSTRETURN) {
					error = 0;
					if (prevnextp != NULL)
						*prevnextp = nextpkt;
					else
						top = nextpkt;
				}

				m = nextpkt;
				if (m != NULL)
					prevnextp = &m->m_nextpkt;
			}
		}
		/* Hand the surviving packet list to the protocol */
		if (top != NULL)
			error = (*so->so_proto->pr_usrreqs->pru_send_list)
			    (so, 0, top, NULL, NULL, p);

		if (dontroute)
			so->so_options &= ~SO_DONTROUTE;

		top = NULL;
		uiofirst = uiolast;
	} while (resid > 0 && error == 0);
release:
	if (sblocked)
		sbunlock(&so->so_snd, FALSE);	/* will unlock socket */
	else
		socket_unlock(so, 1);
out:
	if (top != NULL)
		m_freem(top);
	if (freelist != NULL)
		m_freem_list(freelist);

	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
	    so->so_snd.sb_cc, 0, error);

	return (error);
}
2804
/*
 * Process the leading MT_SONAME mbuf of a record: run the MAC received
 * check (if configured), optionally duplicate the source address into
 * *psa, and either skip past (MSG_PEEK) or consume the address mbuf.
 *
 * May return ERESTART when packet is dropped by MAC policy check.
 *
 * On return, *mp points at the next mbuf to process and *nextrecordp
 * at the following record.
 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list.  Upon MAC policy failure, the record
		 * will be freed.  Otherwise, we'll add it back to
		 * the head of the list.  We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		socket_unlock(so, 0);

		if (mac_socket_check_received(proc_ucred(p), so,
		    mtod(m, struct sockaddr *)) != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available).  We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE);	/* stay locked */
			error = ERESTART;
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next)
			sballoc(&so->so_rcv, m);
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	/* Duplicate the source address for the caller if requested */
	if (psa != NULL) {
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			error = EWOULDBLOCK;
			goto done;
		}
	}
	if (flags & MSG_PEEK) {
		/* Peeking: leave the address mbuf in place, just step over it */
		m = m->m_next;
	} else {
		/* Consuming: unlink and free the address mbuf */
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		MFREE(m, so->so_rcv.sb_mb);
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return (error);
}
2923
/*
 * Process one or more MT_CONTROL mbufs present before any data mbufs
 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 * just copy the data; if !MSG_PEEK, we call into the protocol to
 * perform externalization.
 *
 * Parameters:
 *	so		socket being received from; caller holds the socket
 *			lock and SB_LOCK on so_rcv (the socket lock is
 *			dropped temporarily around dom_externalize below)
 *	controlp	if non-NULL, receives the chain of control mbufs
 *	flags		MSG_PEEK selects copy-in-place vs. unlink semantics
 *	mp		in/out: head of the current mbuf chain
 *	nextrecordp	in/out: next record in the receive buffer
 *
 * Returns:	0			Success
 *		ENOBUFS			m_copy() failed under MSG_PEEK
 *		<pr_domain->dom_externalize>:???
 */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	struct mbuf *cm = NULL, *cmn;
	struct mbuf **cme = &cm;	/* tail pointer of the unlinked control chain */
	struct sockbuf *sb_rcv = &so->so_rcv;
	struct mbuf **msgpcm = NULL;	/* first control mbuf copied under MSG_PEEK */
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below.  Once we re-acquire the
	 * lock, the mbuf chain might change.  In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			if (controlp != NULL) {
				if (*controlp == NULL) {
					/* remember the head for cleanup on failure */
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data.  Return
				 * an error.  Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}
				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/* unlink this control mbuf from so_rcv onto chain 'cm' */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	/* Re-stitch the receive buffer now that control mbufs are unlinked. */
	if (!(flags & MSG_PEEK)) {
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL)
			sb_rcv->sb_lastrecord = m;
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	while (cm != NULL) {
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success.  Otherwise, all other control messages are
		 * returned unmodified to the caller.  Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171.  This
			 * would also allow more records to be appended
			 * to the socket buffer.  We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		if (controlp != NULL && error == 0) {
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL)
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	else
		nextrecord = sb_rcv->sb_mb;

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return (error);
}
3054
1c79356b
A
/*
 * Implement receive operations on a socket.
 * We depend on the way that records are added to the sockbuf
 * by sbappend*.  In particular, each record (mbufs linked through m_next)
 * must begin with an address if the protocol so specifies,
 * followed by an optional mbuf or mbufs containing ancillary data,
 * and then zero or more mbufs of data.
 * In order to avoid blocking network interrupts for the entire time here,
 * we splx() while doing the actual copy to user space.
 * Although the sockbuf is locked, new data may still be appended,
 * and thus we must maintain consistency of the sockbuf during that time.
 *
 * The caller may receive the data as a single mbuf chain by supplying
 * an mbuf **mp0 for use in returning the chain.  The uio is then used
 * only for the count in uio_resid.
 *
 * Returns:	0			Success
 *		ENOBUFS
 *		ENOTCONN
 *		EWOULDBLOCK
 *	uiomove:EFAULT
 *	sblock:EWOULDBLOCK
 *	sblock:EINTR
 *	sbwait:EBADF
 *	sbwait:EINTR
 *	sodelayed_copy:EFAULT
 *	<pru_rcvoob>:EINVAL[TCP]
 *	<pru_rcvoob>:EWOULDBLOCK[TCP]
 *	<pru_rcvoob>:???
 *	<pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
 *	<pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
 *	<pr_domain->dom_externalize>:???
 *
 * Notes:	Additional return values from calls through <pru_rcvoob> and
 *		<pr_domain->dom_externalize> depend on protocols other than
 *		TCP or AF_UNIX, which are documented above.
 */
/*
 * Core receive path (see the block comment above for the full contract):
 * consumes address, control, and data mbufs from so_rcv while holding the
 * socket lock plus SB_LOCK, optionally deferring copyout of consumed mbufs
 * (sodelayed_copy) so uiomove can overlap incoming packet processing.
 */
int
soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
{
	struct mbuf *m, **mp, *ml = NULL;
	struct mbuf *nextrecord, *free_list;
	int flags, error, offset;
	user_ssize_t len;
	struct protosw *pr = so->so_proto;
	int moff, type = 0;
	user_ssize_t orig_resid = uio_resid(uio);
	user_ssize_t delayed_copy_len;
	int can_delay;
	int need_event;
	struct proc *p = current_proc();
	boolean_t en_tracing = FALSE;

	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	if (orig_resid < 0 || orig_resid > INT_MAX)
		return (EINVAL);

	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
	    uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
	    so->so_rcv.sb_hiwat);

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount == 1) {
		panic("%s: so=%x no other reference on socket\n", __func__, so);
		/* NOTREACHED */
	}
#endif
	mp = mp0;
	if (psa != NULL)
		*psa = NULL;
	if (controlp != NULL)
		*controlp = NULL;
	if (flagsp != NULL)
		flags = *flagsp &~ MSG_EOR;
	else
		flags = 0;

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT)
			sb_empty_assert(sb, __func__);
		socket_unlock(so, 1);
		return (error);
	}

	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
	    pr->pr_usrreqs->pru_preconnect) {
		/*
		 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
		 * calling write() right after this.  *If* the app calls a read
		 * we do not want to block this read indefinitely.  Thus,
		 * we trigger a connect so that the session gets initiated.
		 */
		error = (*pr->pr_usrreqs->pru_preconnect)(so);

		if (error) {
			socket_unlock(so, 1);
			return (error);
		}
	}

	if (ENTR_SHOULDTRACE &&
	    (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
		/*
		 * enable energy tracing for inet sockets that go over
		 * non-loopback interfaces only.
		 */
		struct inpcb *inp = sotoinpcb(so);
		if (inp->inp_last_outifp != NULL &&
		    !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
			en_tracing = TRUE;
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
			    VM_KERNEL_ADDRPERM(so),
			    ((so->so_state & SS_NBIO) ?
			    kEnTrFlagNonBlocking : 0),
			    (int64_t)orig_resid);
		}
	}

	/*
	 * When SO_WANTOOBFLAG is set we try to get out-of-band data
	 * regardless of the flags argument.  Here is the case where
	 * out-of-band data is not inline.
	 */
	if ((flags & MSG_OOB) ||
	    ((so->so_options & SO_WANTOOBFLAG) != 0 &&
	    (so->so_options & SO_OOBINLINE) == 0 &&
	    (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
		m = m_get(M_WAIT, MT_DATA);
		if (m == NULL) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
			    ENOBUFS, 0, 0, 0, 0);
			return (ENOBUFS);
		}
		error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
		if (error)
			goto bad;
		/* drop the socket lock across the copy to user space */
		socket_unlock(so, 0);
		do {
			error = uiomove(mtod(m, caddr_t),
			    imin(uio_resid(uio), m->m_len), uio);
			m = m_free(m);
		} while (uio_resid(uio) && error == 0 && m != NULL);
		socket_lock(so, 0);
bad:
		if (m != NULL)
			m_freem(m);

		if ((so->so_options & SO_WANTOOBFLAG) != 0) {
			if (error == EWOULDBLOCK || error == EINVAL) {
				/*
				 * Let's try to get normal data:
				 * EWOULDBLOCK: out-of-band data not
				 * received yet.  EINVAL: out-of-band data
				 * already read.
				 */
				error = 0;
				goto nooob;
			} else if (error == 0 && flagsp != NULL) {
				*flagsp |= MSG_OOB;
			}
		}
		socket_unlock(so, 1);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);

		return (error);
	}
nooob:
	if (mp != NULL)
		*mp = NULL;

	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
	}

	free_list = NULL;
	delayed_copy_len = 0;
restart:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1)
		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
#endif
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 *
	 * A multipath subflow socket would have its SS_NOFDREF set by
	 * default, so check for SOF_MP_SUBFLOW socket flag; when the
	 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
		socket_unlock(so, 1);
		return (0);
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		socket_unlock(so, 1);
		KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
		    0, 0, 0, 0);
		if (en_tracing) {
			KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
			    VM_KERNEL_ADDRPERM(so), 0,
			    (int64_t)(orig_resid - uio_resid(uio)));
		}
		return (error);
	}

	m = so->so_rcv.sb_mb;
	/*
	 * If we have less data than requested, block awaiting more
	 * (subject to any timeout) if:
	 *   1. the current count is less than the low water mark, or
	 *   2. MSG_WAITALL is set, and it is possible to do the entire
	 *	receive operation at once if we block (resid <= hiwat).
	 *   3. MSG_DONTWAIT is not set
	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
	 * we have to do the receive in sections, and thus risk returning
	 * a short count if a timeout or signal occurs after we start.
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    so->so_rcv.sb_cc < uio_resid(uio)) &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
	    m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			if (m != NULL)
				goto dontblock;
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
#if CONTENT_FILTER
			/*
			 * Deal with half closed connections
			 */
			if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
			    cfil_sock_data_pending(&so->so_rcv) != 0)
				CFIL_LOG(LOG_INFO,
				    "so %llx ignore SS_CANTRCVMORE",
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
			else
#endif /* CONTENT_FILTER */
			if (m != NULL)
				goto dontblock;
			else
				goto release;
		}
		for (; m != NULL; m = m->m_next)
			if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
				m = so->so_rcv.sb_mb;
				goto dontblock;
			}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if (uio_resid(uio) == 0)
			goto release;

		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug)
			printf("Waiting for socket data\n");
#endif

		error = sbwait(&so->so_rcv);
#if EVEN_MORE_LOCKING_DEBUG
		if (socket_debug)
			printf("SORECEIVE - sbwait returned %d\n", error);
#endif
		if (so->so_usecount < 1) {
			panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
			    __func__, so, so->so_usecount);
			/* NOTREACHED */
		}
		if (error) {
			socket_unlock(so, 1);
			KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
			    0, 0, 0, 0);
			if (en_tracing) {
				KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
				    VM_KERNEL_ADDRPERM(so), 0,
				    (int64_t)(orig_resid - uio_resid(uio)));
			}
			return (error);
		}
		goto restart;
	}
dontblock:
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
	nextrecord = m->m_nextpkt;

	/* Leading MT_SONAME mbuf: extract the peer address. */
	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
		    mp0 == NULL);
		if (error == ERESTART)
			goto restart;
		else if (error != 0)
			goto release;
		orig_resid = 0;
	}

	/*
	 * Process one or more MT_CONTROL mbufs present before any data mbufs
	 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
	 * just copy the data; if !MSG_PEEK, we call into the protocol to
	 * perform externalization.
	 */
	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0)
			goto release;
		orig_resid = 0;
	}

	/*
	 * If the socket is a TCP socket with message delivery
	 * enabled, then create a control msg to deliver the
	 * relative TCP sequence number for this data.  Waiting
	 * until this point will protect against failures to
	 * allocate an mbuf for control msgs.
	 */
	if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
	    (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
		struct mbuf *seq_cm;

		seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
		    sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
		if (seq_cm == NULL) {
			/* unable to allocate a control mbuf */
			error = ENOBUFS;
			goto release;
		}
		*controlp = seq_cm;
		controlp = &seq_cm->m_next;
	}

	if (m != NULL) {
		if (!(flags & MSG_PEEK)) {
			/*
			 * We get here because m points to an mbuf following
			 * any MT_SONAME or MT_CONTROL mbufs which have been
			 * processed above.  In any case, m should be pointing
			 * to the head of the mbuf chain, and the nextrecord
			 * should be either NULL or equal to m->m_nextpkt.
			 * See comments above about SB_LOCK.
			 */
			if (m != so->so_rcv.sb_mb ||
			    m->m_nextpkt != nextrecord) {
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
				/* NOTREACHED */
			}
			if (nextrecord == NULL)
				so->so_rcv.sb_lastrecord = m;
		}
		type = m->m_type;
		if (type == MT_OOBDATA)
			flags |= MSG_OOB;
	} else {
		if (!(flags & MSG_PEEK)) {
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");

	moff = 0;
	offset = 0;

	/* Delayed copyout only pays off for consuming reads above the threshold. */
	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
		can_delay = 1;
	else
		can_delay = 0;

	need_event = 0;

	while (m != NULL &&
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
		if (m->m_type == MT_OOBDATA) {
			if (type != MT_OOBDATA)
				break;
		} else if (type == MT_OOBDATA) {
			break;
		}
		/*
		 * Make sure to always set MSG_OOB event when getting
		 * out of band data inline.
		 */
		if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
		    (so->so_options & SO_OOBINLINE) != 0 &&
		    (so->so_state & SS_RCVATMARK) != 0) {
			flags |= MSG_OOB;
		}
		so->so_state &= ~SS_RCVATMARK;
		len = uio_resid(uio) - delayed_copy_len;
		if (so->so_oobmark && len > so->so_oobmark - offset)
			len = so->so_oobmark - offset;
		if (len > m->m_len - moff)
			len = m->m_len - moff;
		/*
		 * If mp is set, just pass back the mbufs.
		 * Otherwise copy them out via the uio, then free.
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (mp == NULL) {
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
			if (can_delay && len == m->m_len) {
				/*
				 * only delay the copy if we're consuming the
				 * mbuf and we're NOT in MSG_PEEK mode
				 * and we have enough data to make it worthwhile
				 * to drop and retake the lock... can_delay
				 * reflects the state of the 2 latter
				 * constraints moff should always be zero
				 * in these cases
				 */
				delayed_copy_len += len;
			} else {
				if (delayed_copy_len) {
					error = sodelayed_copy(so, uio,
					    &free_list, &delayed_copy_len);

					if (error) {
						goto release;
					}
					/*
					 * can only get here if MSG_PEEK is not
					 * set therefore, m should point at the
					 * head of the rcv queue; if it doesn't,
					 * it means something drastically
					 * changed while we were out from behind
					 * the lock in sodelayed_copy.  perhaps
					 * a RST on the stream.  in any event,
					 * the stream has been interrupted.  it's
					 * probably best just to return whatever
					 * data we've moved and let the caller
					 * sort it out...
					 */
					if (m != so->so_rcv.sb_mb) {
						break;
					}
				}
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
				    (int)len, uio);
				socket_lock(so, 0);

				if (error)
					goto release;
			}
		} else {
			uio_setresid(uio, (uio_resid(uio) - len));
		}
		if (len == m->m_len - moff) {
			/* whole mbuf consumed: unlink it (or advance, if peeking) */
			if (m->m_flags & M_EOR)
				flags |= MSG_EOR;
			if (flags & MSG_PEEK) {
				m = m->m_next;
				moff = 0;
			} else {
				nextrecord = m->m_nextpkt;
				sbfree(&so->so_rcv, m);
				m->m_nextpkt = NULL;

				/*
				 * If this packet is an unordered packet
				 * (indicated by M_UNORDERED_DATA flag), remove
				 * the additional bytes added to the
				 * receive socket buffer size.
				 */
				if ((so->so_flags & SOF_ENABLE_MSGS) &&
				    m->m_len &&
				    (m->m_flags & M_UNORDERED_DATA) &&
				    sbreserve(&so->so_rcv,
				    so->so_rcv.sb_hiwat - m->m_len)) {
					if (so->so_msg_state->msg_uno_bytes >
					    m->m_len) {
						so->so_msg_state->
						    msg_uno_bytes -= m->m_len;
					} else {
						so->so_msg_state->
						    msg_uno_bytes = 0;
					}
					m->m_flags &= ~M_UNORDERED_DATA;
				}

				if (mp != NULL) {
					*mp = m;
					mp = &m->m_next;
					so->so_rcv.sb_mb = m = m->m_next;
					*mp = NULL;
				} else {
					/* batch consumed mbufs for a single free later */
					if (free_list == NULL)
						free_list = m;
					else
						ml->m_next = m;
					ml = m;
					so->so_rcv.sb_mb = m = m->m_next;
					ml->m_next = NULL;
				}
				if (m != NULL) {
					m->m_nextpkt = nextrecord;
					if (nextrecord == NULL)
						so->so_rcv.sb_lastrecord = m;
				} else {
					so->so_rcv.sb_mb = nextrecord;
					SB_EMPTY_FIXUP(&so->so_rcv);
				}
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
			}
		} else {
			/* partial mbuf consumed */
			if (flags & MSG_PEEK) {
				moff += len;
			} else {
				if (mp != NULL) {
					int copy_flag;

					if (flags & MSG_DONTWAIT)
						copy_flag = M_DONTWAIT;
					else
						copy_flag = M_WAIT;
					*mp = m_copym(m, 0, len, copy_flag);
					/*
					 * Failed to allocate an mbuf?
					 * Adjust uio_resid back, it was
					 * adjusted down by len bytes which
					 * we didn't copy over.
					 */
					if (*mp == NULL) {
						uio_setresid(uio,
						    (uio_resid(uio) + len));
						break;
					}
				}
				m->m_data += len;
				m->m_len -= len;
				so->so_rcv.sb_cc -= len;
			}
		}
		if (so->so_oobmark) {
			if ((flags & MSG_PEEK) == 0) {
				so->so_oobmark -= len;
				if (so->so_oobmark == 0) {
					so->so_state |= SS_RCVATMARK;
					/*
					 * delay posting the actual event until
					 * after any delayed copy processing
					 * has finished
					 */
					need_event = 1;
					break;
				}
			} else {
				offset += len;
				if (offset == so->so_oobmark)
					break;
			}
		}
		if (flags & MSG_EOR)
			break;
		/*
		 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
		 * (for non-atomic socket), we must not quit until
		 * "uio->uio_resid == 0" or an error termination.
		 * If a signal/timeout occurs, return with a short
		 * count but without error.  Keep sockbuf locked
		 * against other readers.
		 */
		while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
			    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
			    ))
				goto release;

			/*
			 * Depending on the protocol (e.g. TCP), the following
			 * might cause the socket lock to be dropped and later
			 * be reacquired, and more data could have arrived and
			 * have been appended to the receive socket buffer by
			 * the time it returns.  Therefore, we only sleep in
			 * sbwait() below if and only if the socket buffer is
			 * empty, in order to avoid a false sleep.
			 */
			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
			    (((struct inpcb *)so->so_pcb)->inp_state !=
			    INPCB_STATE_DEAD))
				(*pr->pr_usrreqs->pru_rcvd)(so, flags);

			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");

			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
				error = 0;
				goto release;
			}
			/*
			 * have to wait until after we get back from the sbwait
			 * to do the copy because we will drop the lock if we
			 * have enough data that has been delayed... by dropping
			 * the lock we open up a window allowing the netisr
			 * thread to process the incoming packets and to change
			 * the state of this socket... we're issuing the sbwait
			 * because the socket is empty and we're expecting the
			 * netisr thread to wake us up when more packets arrive;
			 * if we allow that processing to happen and then sbwait
			 * we could stall forever with packets sitting in the
			 * socket if no further packets arrive from the remote
			 * side.
			 *
			 * we want to copy before we've collected all the data
			 * to satisfy this request to allow the copy to overlap
			 * the incoming packet processing on an MP system
			 */
			if (delayed_copy_len > sorecvmincopy &&
			    (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
				error = sodelayed_copy(so, uio,
				    &free_list, &delayed_copy_len);

				if (error)
					goto release;
			}
			m = so->so_rcv.sb_mb;
			if (m != NULL) {
				nextrecord = m->m_nextpkt;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%p ref=%d on socket\n",
		    __func__, so, so->so_usecount);
		/* NOTREACHED */
	}
#endif

	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
		if (so->so_options & SO_DONTTRUNC) {
			flags |= MSG_RCVMORE;
		} else {
			flags |= MSG_TRUNC;
			if ((flags & MSG_PEEK) == 0)
				(void) sbdroprecord(&so->so_rcv);
		}
	}

	/*
	 * pru_rcvd below (for TCP) may cause more data to be received
	 * if the socket lock is dropped prior to sending the ACK; some
	 * legacy OpenTransport applications don't handle this well
	 * (if it receives less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
	 * prior to calling pru_rcvd.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
		flags |= MSG_HAVEMORE;

	if ((flags & MSG_PEEK) == 0) {
		if (m == NULL) {
			so->so_rcv.sb_mb = nextrecord;
			/*
			 * First part is an inline SB_EMPTY_FIXUP().  Second
			 * part makes sure sb_lastrecord is up-to-date if
			 * there is still data in the socket buffer.
			 */
			if (so->so_rcv.sb_mb == NULL) {
				so->so_rcv.sb_mbtail = NULL;
				so->so_rcv.sb_lastrecord = NULL;
			} else if (nextrecord->m_nextpkt == NULL) {
				so->so_rcv.sb_lastrecord = nextrecord;
			}
			SB_MB_CHECK(&so->so_rcv);
		}
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
		if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
	}

	if (delayed_copy_len) {
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
		if (error)
			goto release;
	}
	if (free_list != NULL) {
		m_freem_list(free_list);
		free_list = NULL;
	}
	if (need_event)
		postevent(so, 0, EV_OOB);

	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
		goto restart;
	}

	if (flagsp != NULL)
		*flagsp |= flags;
release:
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: release so=%p ref=%d on socket\n", __func__,
		    so, so->so_usecount);
		/* NOTREACHED */
	}
#endif
	/* flush any copy still pending before giving back SB_LOCK */
	if (delayed_copy_len)
		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);

	if (free_list != NULL)
		m_freem_list(free_list);

	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */

	if (en_tracing) {
		KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
		    VM_KERNEL_ADDRPERM(so),
		    ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
		    (int64_t)(orig_resid - uio_resid(uio)));
	}
	KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
	    so->so_rcv.sb_cc, 0, error);

	return (error);
}
3855
2d21ac55
A
3856/*
3857 * Returns: 0 Success
3858 * uiomove:EFAULT
3859 */
3860static int
3861sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
39236c6e 3862 user_ssize_t *resid)
55e303ae 3863{
2d21ac55 3864 int error = 0;
55e303ae
A
3865 struct mbuf *m;
3866
3867 m = *free_list;
3868
91447636 3869 socket_unlock(so, 0);
55e303ae 3870
39236c6e 3871 while (m != NULL && error == 0) {
2d21ac55 3872 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2d21ac55
A
3873 m = m->m_next;
3874 }
3875 m_freem_list(*free_list);
3876
39236c6e 3877 *free_list = NULL;
2d21ac55
A
3878 *resid = 0;
3879
3880 socket_lock(so, 0);
55e303ae 3881
2d21ac55
A
3882 return (error);
3883}
3884
3e170ce0
A
3885static int
3886sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3887 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3888{
3889#pragma unused(so)
3890 int error = 0;
3891 struct mbuf *ml, *m;
3892 int i = 0;
3893 struct uio *auio;
3894
3895 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3896 ml = ml->m_nextpkt, i++) {
3897 auio = msgarray[i].uio;
3898 for (m = ml; m != NULL; m = m->m_next) {
3899 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3900 if (error != 0)
3901 goto out;
3902 }
3903 }
3904out:
3905 m_freem_list(*free_list);
3906
3907 *free_list = NULL;
3908 *resid = 0;
3909
3910 return (error);
3911}
3912
2d21ac55 3913int
3e170ce0
A
3914soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3915 int *flagsp)
2d21ac55 3916{
3e170ce0 3917 struct mbuf *m;
fe8ab488 3918 struct mbuf *nextrecord;
3e170ce0
A
3919 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3920 int error;
3921 user_ssize_t len, pktlen, delayed_copy_len = 0;
fe8ab488 3922 struct protosw *pr = so->so_proto;
3e170ce0 3923 user_ssize_t resid;
fe8ab488
A
3924 struct proc *p = current_proc();
3925 struct uio *auio = NULL;
3e170ce0 3926 int npkts = 0;
fe8ab488 3927 int sblocked = 0;
3e170ce0
A
3928 struct sockaddr **psa = NULL;
3929 struct mbuf **controlp = NULL;
3930 int can_delay;
3931 int flags;
3932 struct mbuf *free_others = NULL;
55e303ae 3933
fe8ab488
A
3934 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3935 so, uiocnt,
3936 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3937
fe8ab488
A
3938 /*
3939 * Sanity checks:
3940 * - Only supports don't wait flags
3941 * - Only support datagram sockets (could be extended to raw)
3942 * - Must be atomic
3943 * - Protocol must support packet chains
3944 * - The uio array is NULL (should we panic?)
3945 */
3e170ce0
A
3946 if (flagsp != NULL)
3947 flags = *flagsp;
3948 else
3949 flags = 0;
3950 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3951 MSG_NBIO)) {
3952 printf("%s invalid flags 0x%x\n", __func__, flags);
3953 error = EINVAL;
fe8ab488
A
3954 goto out;
3955 }
3956 if (so->so_type != SOCK_DGRAM) {
3957 error = EINVAL;
3958 goto out;
3959 }
3960 if (sosendallatonce(so) == 0) {
3961 error = EINVAL;
3962 goto out;
3963 }
3964 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3965 error = EPROTONOSUPPORT;
3966 goto out;
3967 }
3e170ce0 3968 if (msgarray == NULL) {
fe8ab488
A
3969 printf("%s uioarray is NULL\n", __func__);
3970 error = EINVAL;
3971 goto out;
3972 }
3973 if (uiocnt == 0) {
3974 printf("%s uiocnt is 0\n", __func__);
3975 error = EINVAL;
3976 goto out;
3977 }
3978 /*
3979 * Sanity check on the length passed by caller as we are making 'int'
3980 * comparisons
3981 */
3e170ce0
A
3982 resid = recv_msg_array_resid(msgarray, uiocnt);
3983 if (resid < 0 || resid > INT_MAX) {
fe8ab488
A
3984 error = EINVAL;
3985 goto out;
3986 }
3987
3e170ce0
A
3988 if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3989 can_delay = 1;
3990 else
3991 can_delay = 0;
3992
fe8ab488
A
3993 socket_lock(so, 1);
3994 so_update_last_owner_locked(so, p);
3995 so_update_policy(so);
3996
3997#if NECP
3998 so_update_necp_policy(so, NULL, NULL);
3999#endif /* NECP */
3e170ce0 4000
fe8ab488
A
4001 /*
4002 * If a recv attempt is made on a previously-accepted socket
4003 * that has been marked as inactive (disconnected), reject
4004 * the request.
4005 */
4006 if (so->so_flags & SOF_DEFUNCT) {
4007 struct sockbuf *sb = &so->so_rcv;
4008
4009 error = ENOTCONN;
39037602
A
4010 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4011 __func__, proc_pid(p), proc_best_name(p),
4012 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4013 SOCK_DOM(so), SOCK_TYPE(so), error);
fe8ab488
A
4014 /*
4015 * This socket should have been disconnected and flushed
4016 * prior to being returned from sodefunct(); there should
4017 * be no data on its receive list, so panic otherwise.
4018 */
4019 if (so->so_state & SS_DEFUNCT)
4020 sb_empty_assert(sb, __func__);
4021 goto release;
4022 }
3e170ce0
A
4023
4024next:
4025 /*
4026 * The uio may be empty
4027 */
4028 if (npkts >= uiocnt) {
4029 error = 0;
4030 goto release;
4031 }
fe8ab488
A
4032restart:
4033 /*
4034 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4035 * and if so just return to the caller. This could happen when
4036 * soreceive() is called by a socket upcall function during the
4037 * time the socket is freed. The socket buffer would have been
4038 * locked across the upcall, therefore we cannot put this thread
4039 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4040 * we may livelock), because the lock on the socket buffer will
4041 * only be released when the upcall routine returns to its caller.
4042 * Because the socket has been officially closed, there can be
4043 * no further read on it.
4044 */
4045 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4046 (SS_NOFDREF | SS_CANTRCVMORE)) {
4047 error = 0;
4048 goto release;
4049 }
4050
4051 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4052 if (error) {
4053 goto release;
4054 }
4055 sblocked = 1;
4056
fe8ab488
A
4057 m = so->so_rcv.sb_mb;
4058 /*
4059 * Block awaiting more datagram if needed
4060 */
3e170ce0
A
4061 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4062 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4063 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
fe8ab488
A
4064 /*
4065 * Panic if we notice inconsistencies in the socket's
4066 * receive list; both sb_mb and sb_cc should correctly
4067 * reflect the contents of the list, otherwise we may
4068 * end up with false positives during select() or poll()
4069 * which could put the application in a bad state.
4070 */
4071 SB_MB_CHECK(&so->so_rcv);
4072
4073 if (so->so_error) {
4074 error = so->so_error;
3e170ce0
A
4075 if ((flags & MSG_PEEK) == 0)
4076 so->so_error = 0;
fe8ab488
A
4077 goto release;
4078 }
4079 if (so->so_state & SS_CANTRCVMORE) {
4080 goto release;
4081 }
4082 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
4083 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4084 error = ENOTCONN;
4085 goto release;
4086 }
4087 if ((so->so_state & SS_NBIO) ||
4088 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
4089 error = EWOULDBLOCK;
4090 goto release;
4091 }
4092 /*
4093 * Do not block if we got some data
fe8ab488 4094 */
3e170ce0 4095 if (free_list != NULL) {
fe8ab488
A
4096 error = 0;
4097 goto release;
4098 }
3e170ce0 4099
fe8ab488
A
4100 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4101 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4102
4103 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4104 sblocked = 0;
4105
4106 error = sbwait(&so->so_rcv);
4107 if (error) {
4108 goto release;
4109 }
4110 goto restart;
4111 }
4112
fe8ab488
A
4113 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4114 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4115 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4116
4117 /*
4118 * Consume the current uio index as we have a datagram
4119 */
3e170ce0
A
4120 auio = msgarray[npkts].uio;
4121 resid = uio_resid(auio);
4122 msgarray[npkts].which |= SOCK_MSG_DATA;
4123 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4124 &msgarray[npkts].psa : NULL;
4125 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4126 &msgarray[npkts].controlp : NULL;
4127 npkts += 1;
fe8ab488
A
4128 nextrecord = m->m_nextpkt;
4129
fe8ab488 4130 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3e170ce0
A
4131 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4132 if (error == ERESTART)
4133 goto restart;
4134 else if (error != 0)
4135 goto release;
fe8ab488 4136 }
fe8ab488 4137
fe8ab488 4138 if (m != NULL && m->m_type == MT_CONTROL) {
3e170ce0
A
4139 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4140 if (error != 0)
4141 goto release;
fe8ab488 4142 }
fe8ab488 4143
3e170ce0
A
4144 if (m->m_pkthdr.len == 0) {
4145 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4146 __func__, __LINE__,
4147 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4148 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4149 m->m_type);
4150 }
fe8ab488
A
4151
4152 /*
3e170ce0
A
4153 * Loop to copy the mbufs of the current record
4154 * Support zero length packets
fe8ab488 4155 */
3e170ce0
A
4156 ml = NULL;
4157 pktlen = 0;
4158 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
fe8ab488 4159 if (m->m_len == 0)
3e170ce0
A
4160 panic("%p m_len zero", m);
4161 if (m->m_type == 0)
4162 panic("%p m_type zero", m);
fe8ab488
A
4163 /*
4164 * Clip to the residual length
4165 */
4166 if (len > m->m_len)
4167 len = m->m_len;
3e170ce0 4168 pktlen += len;
fe8ab488 4169 /*
3e170ce0 4170 * Copy the mbufs via the uio or delay the copy
fe8ab488
A
4171 * Sockbuf must be consistent here (points to current mbuf,
4172 * it points to next record) when we drop priority;
4173 * we must note any additions to the sockbuf when we
4174 * block interrupts again.
4175 */
3e170ce0 4176 if (len > 0 && can_delay == 0) {
fe8ab488
A
4177 socket_unlock(so, 0);
4178 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4179 socket_lock(so, 0);
fe8ab488
A
4180 if (error)
4181 goto release;
3e170ce0
A
4182 } else {
4183 delayed_copy_len += len;
fe8ab488 4184 }
3e170ce0 4185
fe8ab488
A
4186 if (len == m->m_len) {
4187 /*
3e170ce0 4188 * m was entirely copied
fe8ab488 4189 */
fe8ab488 4190 sbfree(&so->so_rcv, m);
3e170ce0 4191 nextrecord = m->m_nextpkt;
fe8ab488
A
4192 m->m_nextpkt = NULL;
4193
4194 /*
3e170ce0 4195 * Set the first packet to the head of the free list
fe8ab488 4196 */
3e170ce0
A
4197 if (free_list == NULL)
4198 free_list = m;
4199 /*
4200 * Link current packet to tail of free list
4201 */
4202 if (ml == NULL) {
4203 if (free_tail != NULL)
4204 free_tail->m_nextpkt = m;
4205 free_tail = m;
fe8ab488 4206 }
3e170ce0
A
4207 /*
4208 * Link current mbuf to last mbuf of current packet
4209 */
4210 if (ml != NULL)
4211 ml->m_next = m;
4212 ml = m;
4213
4214 /*
4215 * Move next buf to head of socket buffer
4216 */
4217 so->so_rcv.sb_mb = m = ml->m_next;
4218 ml->m_next = NULL;
4219
fe8ab488
A
4220 if (m != NULL) {
4221 m->m_nextpkt = nextrecord;
4222 if (nextrecord == NULL)
4223 so->so_rcv.sb_lastrecord = m;
4224 } else {
4225 so->so_rcv.sb_mb = nextrecord;
4226 SB_EMPTY_FIXUP(&so->so_rcv);
4227 }
4228 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4229 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4230 } else {
4231 /*
4232 * Stop the loop on partial copy
4233 */
fe8ab488
A
4234 break;
4235 }
4236 }
4237#ifdef MORE_LOCKING_DEBUG
4238 if (so->so_usecount <= 1) {
4239 panic("%s: after big while so=%llx ref=%d on socket\n",
4240 __func__,
3e170ce0 4241 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
fe8ab488
A
4242 /* NOTREACHED */
4243 }
4244#endif
4245 /*
4246 * Tell the caller we made a partial copy
4247 */
4248 if (m != NULL) {
4249 if (so->so_options & SO_DONTTRUNC) {
3e170ce0
A
4250 /*
4251 * Copyout first the freelist then the partial mbuf
4252 */
4253 socket_unlock(so, 0);
4254 if (delayed_copy_len)
4255 error = sodelayed_copy_list(so, msgarray,
4256 uiocnt, &free_list, &delayed_copy_len);
4257
4258 if (error == 0) {
4259 error = uiomove(mtod(m, caddr_t), (int)len,
4260 auio);
4261 }
4262 socket_lock(so, 0);
4263 if (error)
4264 goto release;
4265
fe8ab488
A
4266 m->m_data += len;
4267 m->m_len -= len;
4268 so->so_rcv.sb_cc -= len;
4269 flags |= MSG_RCVMORE;
4270 } else {
4271 (void) sbdroprecord(&so->so_rcv);
4272 nextrecord = so->so_rcv.sb_mb;
4273 m = NULL;
4274 flags |= MSG_TRUNC;
4275 }
4276 }
4277
4278 if (m == NULL) {
4279 so->so_rcv.sb_mb = nextrecord;
4280 /*
4281 * First part is an inline SB_EMPTY_FIXUP(). Second
4282 * part makes sure sb_lastrecord is up-to-date if
4283 * there is still data in the socket buffer.
4284 */
4285 if (so->so_rcv.sb_mb == NULL) {
4286 so->so_rcv.sb_mbtail = NULL;
4287 so->so_rcv.sb_lastrecord = NULL;
4288 } else if (nextrecord->m_nextpkt == NULL) {
4289 so->so_rcv.sb_lastrecord = nextrecord;
4290 }
4291 SB_MB_CHECK(&so->so_rcv);
4292 }
4293 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4294 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4295
4296 /*
4297 * We can continue to the next packet as long as:
4298 * - We haven't exhausted the uio array
4299 * - There was no error
4300 * - A packet was not truncated
4301 * - We can still receive more data
3e170ce0
A
4302 */
4303 if (npkts < uiocnt && error == 0 &&
4304 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4305 (so->so_state & SS_CANTRCVMORE) == 0) {
fe8ab488
A
4306 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4307 sblocked = 0;
4308
3e170ce0 4309 goto next;
fe8ab488 4310 }
3e170ce0
A
4311 if (flagsp != NULL)
4312 *flagsp |= flags;
fe8ab488
A
4313
4314release:
4315 /*
4316 * pru_rcvd may cause more data to be received if the socket lock
4317 * is dropped so we set MSG_HAVEMORE now based on what we know.
3e170ce0
A
4318 * That way the caller won't be surprised if it receives less data
4319 * than requested.
fe8ab488
A
4320 */
4321 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4322 flags |= MSG_HAVEMORE;
4323
4324 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4325 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4326
fe8ab488
A
4327 if (sblocked)
4328 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4329 else
4330 socket_unlock(so, 1);
3e170ce0
A
4331
4332 if (delayed_copy_len)
4333 error = sodelayed_copy_list(so, msgarray, uiocnt,
4334 &free_list, &delayed_copy_len);
fe8ab488
A
4335out:
4336 /*
3e170ce0 4337 * Amortize the cost of freeing the mbufs
fe8ab488
A
4338 */
4339 if (free_list != NULL)
4340 m_freem_list(free_list);
3e170ce0
A
4341 if (free_others != NULL)
4342 m_freem_list(free_others);
fe8ab488
A
4343
4344 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4345 0, 0, 0, 0);
4346 return (error);
4347}
4348
4349/*
4350 * Returns: 0 Success
4351 * EINVAL
4352 * ENOTCONN
4353 * <pru_shutdown>:EINVAL
4354 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4355 * <pru_shutdown>:ENOBUFS[TCP]
4356 * <pru_shutdown>:EMSGSIZE[TCP]
4357 * <pru_shutdown>:EHOSTUNREACH[TCP]
4358 * <pru_shutdown>:ENETUNREACH[TCP]
4359 * <pru_shutdown>:ENETDOWN[TCP]
4360 * <pru_shutdown>:ENOMEM[TCP]
4361 * <pru_shutdown>:EACCES[TCP]
4362 * <pru_shutdown>:EMSGSIZE[TCP]
4363 * <pru_shutdown>:ENOBUFS[TCP]
4364 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4365 * <pru_shutdown>:??? [other protocol families]
4366 */
4367int
4368soshutdown(struct socket *so, int how)
4369{
4370 int error;
4371
4372 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4373
4374 switch (how) {
4375 case SHUT_RD:
4376 case SHUT_WR:
4377 case SHUT_RDWR:
4378 socket_lock(so, 1);
4379 if ((so->so_state &
4380 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4381 error = ENOTCONN;
2d21ac55
A
4382 } else {
4383 error = soshutdownlock(so, how);
4384 }
4385 socket_unlock(so, 1);
4386 break;
4387 default:
4388 error = EINVAL;
4389 break;
55e303ae 4390 }
55e303ae 4391
fe8ab488
A
4392 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4393
55e303ae
A
4394 return (error);
4395}
4396
1c79356b 4397int
fe8ab488 4398soshutdownlock_final(struct socket *so, int how)
1c79356b 4399{
2d21ac55
A
4400 struct protosw *pr = so->so_proto;
4401 int error = 0;
1c79356b 4402
91447636 4403 sflt_notify(so, sock_evt_shutdown, &how);
1c79356b 4404
9bccf70c 4405 if (how != SHUT_WR) {
2d21ac55
A
4406 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4407 /* read already shut down */
4408 error = ENOTCONN;
4409 goto done;
4410 }
1c79356b
A
4411 sorflush(so);
4412 postevent(so, 0, EV_RCLOSED);
4413 }
9bccf70c 4414 if (how != SHUT_RD) {
2d21ac55
A
4415 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4416 /* write already shut down */
4417 error = ENOTCONN;
4418 goto done;
4419 }
4420 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4421 postevent(so, 0, EV_WCLOSED);
1c79356b 4422 }
2d21ac55 4423done:
fe8ab488
A
4424 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4425 return (error);
4426}
4427
/*
 * Locked shutdown entry point: give an attached content filter a
 * chance to defer or veto the shutdown, then fall through to the
 * final shutdown logic.
 */
int
soshutdownlock(struct socket *so, int how)
{
	int error = 0;

#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		error = cfil_sock_shutdown(so, &how);
		if (error == EJUSTRETURN) {
			/* filter will complete the shutdown later */
			error = 0;
			goto done;
		} else if (error != 0) {
			goto done;
		}
	}
#endif /* CONTENT_FILTER */

	error = soshutdownlock_final(so, how);

done:
	return (error);
}
4454
39236c6e
A
4455void
4456sowflush(struct socket *so)
4457{
4458 struct sockbuf *sb = &so->so_snd;
39236c6e
A
4459
4460 /*
4461 * Obtain lock on the socket buffer (SB_LOCK). This is required
4462 * to prevent the socket buffer from being unexpectedly altered
4463 * while it is used by another thread in socket send/receive.
4464 *
4465 * sblock() must not fail here, hence the assertion.
4466 */
4467 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4468 VERIFY(sb->sb_flags & SB_LOCK);
4469
4470 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4471 sb->sb_flags |= SB_DROP;
4472 sb->sb_upcall = NULL;
4473 sb->sb_upcallarg = NULL;
4474
4475 sbunlock(sb, TRUE); /* keep socket locked */
4476
4477 selthreadclear(&sb->sb_sel);
4478 sbrelease(sb);
4479}
4480
1c79356b 4481void
2d21ac55 4482sorflush(struct socket *so)
1c79356b 4483{
39236c6e
A
4484 struct sockbuf *sb = &so->so_rcv;
4485 struct protosw *pr = so->so_proto;
1c79356b 4486 struct sockbuf asb;
39236c6e 4487#ifdef notyet
2d21ac55 4488 lck_mtx_t *mutex_held;
39236c6e
A
4489 /*
4490 * XXX: This code is currently commented out, because we may get here
4491 * as part of sofreelastref(), and at that time, pr_getlock() may no
4492 * longer be able to return us the lock; this will be fixed in future.
4493 */
2d21ac55 4494 if (so->so_proto->pr_getlock != NULL)
91447636 4495 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2d21ac55 4496 else
91447636 4497 mutex_held = so->so_proto->pr_domain->dom_mtx;
39236c6e 4498
5ba3f43e 4499 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
39236c6e 4500#endif /* notyet */
91447636
A
4501
4502 sflt_notify(so, sock_evt_flush_read, NULL);
1c79356b 4503
1c79356b 4504 socantrcvmore(so);
39236c6e
A
4505
4506 /*
4507 * Obtain lock on the socket buffer (SB_LOCK). This is required
4508 * to prevent the socket buffer from being unexpectedly altered
4509 * while it is used by another thread in socket send/receive.
4510 *
4511 * sblock() must not fail here, hence the assertion.
4512 */
4513 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4514 VERIFY(sb->sb_flags & SB_LOCK);
4515
4516 /*
4517 * Copy only the relevant fields from "sb" to "asb" which we
4518 * need for sbrelease() to function. In particular, skip
4519 * sb_sel as it contains the wait queue linkage, which would
4520 * wreak havoc if we were to issue selthreadclear() on "asb".
4521 * Make sure to not carry over SB_LOCK in "asb", as we need
4522 * to acquire it later as part of sbrelease().
4523 */
4524 bzero(&asb, sizeof (asb));
4525 asb.sb_cc = sb->sb_cc;
4526 asb.sb_hiwat = sb->sb_hiwat;
4527 asb.sb_mbcnt = sb->sb_mbcnt;
4528 asb.sb_mbmax = sb->sb_mbmax;
4529 asb.sb_ctl = sb->sb_ctl;
4530 asb.sb_lowat = sb->sb_lowat;
4531 asb.sb_mb = sb->sb_mb;
4532 asb.sb_mbtail = sb->sb_mbtail;
4533 asb.sb_lastrecord = sb->sb_lastrecord;
4534 asb.sb_so = sb->sb_so;
4535 asb.sb_flags = sb->sb_flags;
4536 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4537 asb.sb_flags |= SB_DROP;
4538
4539 /*
4540 * Ideally we'd bzero() these and preserve the ones we need;
4541 * but to do that we'd need to shuffle things around in the
4542 * sockbuf, and we can't do it now because there are KEXTS
4543 * that are directly referring to the socket structure.
4544 *
4545 * Setting SB_DROP acts as a barrier to prevent further appends.
4546 * Clearing SB_SEL is done for selthreadclear() below.
4547 */
4548 sb->sb_cc = 0;
4549 sb->sb_hiwat = 0;
4550 sb->sb_mbcnt = 0;
4551 sb->sb_mbmax = 0;
4552 sb->sb_ctl = 0;
4553 sb->sb_lowat = 0;
4554 sb->sb_mb = NULL;
4555 sb->sb_mbtail = NULL;
4556 sb->sb_lastrecord = NULL;
4557 sb->sb_timeo.tv_sec = 0;
4558 sb->sb_timeo.tv_usec = 0;
4559 sb->sb_upcall = NULL;
4560 sb->sb_upcallarg = NULL;
4561 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4562 sb->sb_flags |= SB_DROP;
4563
4564 sbunlock(sb, TRUE); /* keep socket locked */
4565
4566 /*
4567 * Note that selthreadclear() is called on the original "sb" and
4568 * not the local "asb" because of the way wait queue linkage is
4569 * implemented. Given that selwakeup() may be triggered, SB_SEL
4570 * should no longer be set (cleared above.)
4571 */
0b4e3aa0 4572 selthreadclear(&sb->sb_sel);
39236c6e
A
4573
4574 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
1c79356b 4575 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
39236c6e 4576
1c79356b
A
4577 sbrelease(&asb);
4578}
4579
4580/*
4581 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4582 * an additional variant to handle the case where the option value needs
4583 * to be some kind of integer, but not a specific size.
4584 * In addition to their use here, these functions are also called by the
4585 * protocol-level pr_ctloutput() routines.
2d21ac55
A
4586 *
4587 * Returns: 0 Success
4588 * EINVAL
4589 * copyin:EFAULT
1c79356b
A
4590 */
4591int
2d21ac55 4592sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
1c79356b
A
4593{
4594 size_t valsize;
4595
4596 /*
4597 * If the user gives us more than we wanted, we ignore it,
4598 * but if we don't get the minimum length the caller
4599 * wants, we return EINVAL. On success, sopt->sopt_valsize
4600 * is set to however much we actually retrieved.
4601 */
4602 if ((valsize = sopt->sopt_valsize) < minlen)
2d21ac55 4603 return (EINVAL);
1c79356b
A
4604 if (valsize > len)
4605 sopt->sopt_valsize = valsize = len;
4606
b0d623f7 4607 if (sopt->sopt_p != kernproc)
1c79356b
A
4608 return (copyin(sopt->sopt_val, buf, valsize));
4609
91447636 4610 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
2d21ac55
A
4611 return (0);
4612}
4613
4614/*
4615 * sooptcopyin_timeval
4616 * Copy in a timeval value into tv_p, and take into account whether the
4617 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4618 * code here so that we can verify the 64-bit tv_sec value before we lose
4619 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4620 */
4621static int
39236c6e 4622sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
2d21ac55
A
4623{
4624 int error;
b0d623f7 4625
2d21ac55 4626 if (proc_is64bit(sopt->sopt_p)) {
b0d623f7 4627 struct user64_timeval tv64;
2d21ac55 4628
39236c6e 4629 if (sopt->sopt_valsize < sizeof (tv64))
2d21ac55 4630 return (EINVAL);
39236c6e
A
4631
4632 sopt->sopt_valsize = sizeof (tv64);
b0d623f7 4633 if (sopt->sopt_p != kernproc) {
39236c6e 4634 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
b0d623f7
A
4635 if (error != 0)
4636 return (error);
4637 } else {
4638 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
39236c6e 4639 sizeof (tv64));
2d21ac55 4640 }
39236c6e
A
4641 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4642 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
2d21ac55 4643 return (EDOM);
39236c6e 4644
2d21ac55
A
4645 tv_p->tv_sec = tv64.tv_sec;
4646 tv_p->tv_usec = tv64.tv_usec;
4647 } else {
b0d623f7
A
4648 struct user32_timeval tv32;
4649
39236c6e 4650 if (sopt->sopt_valsize < sizeof (tv32))
2d21ac55 4651 return (EINVAL);
39236c6e
A
4652
4653 sopt->sopt_valsize = sizeof (tv32);
b0d623f7 4654 if (sopt->sopt_p != kernproc) {
39236c6e 4655 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
2d21ac55
A
4656 if (error != 0) {
4657 return (error);
4658 }
4659 } else {
b0d623f7 4660 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
39236c6e 4661 sizeof (tv32));
2d21ac55 4662 }
39236c6e
A
4663#ifndef __LP64__
4664 /*
4665 * K64todo "comparison is always false due to
4666 * limited range of data type"
4667 */
4668 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4669 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
2d21ac55 4670 return (EDOM);
b0d623f7
A
4671#endif
4672 tv_p->tv_sec = tv32.tv_sec;
4673 tv_p->tv_usec = tv32.tv_usec;
2d21ac55
A
4674 }
4675 return (0);
1c79356b
A
4676}
4677
5ba3f43e
A
4678int
4679soopt_cred_check(struct socket *so, int priv, boolean_t allow_root)
39037602
A
4680{
4681 kauth_cred_t cred = NULL;
4682 proc_t ep = PROC_NULL;
5ba3f43e
A
4683 uid_t uid;
4684 int error = 0;
39037602
A
4685
4686 if (so->so_flags & SOF_DELEGATED) {
4687 ep = proc_find(so->e_pid);
4688 if (ep)
4689 cred = kauth_cred_proc_ref(ep);
4690 }
5ba3f43e
A
4691
4692 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4693
4694 /* uid is 0 for root */
4695 if (uid != 0 || !allow_root)
4696 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
39037602
A
4697 if (cred)
4698 kauth_cred_unref(&cred);
4699 if (ep != PROC_NULL)
4700 proc_rele(ep);
4701
4702 return (error);
4703}
4704
2d21ac55
A
4705/*
4706 * Returns: 0 Success
4707 * EINVAL
4708 * ENOPROTOOPT
4709 * ENOBUFS
4710 * EDOM
4711 * sooptcopyin:EINVAL
4712 * sooptcopyin:EFAULT
4713 * sooptcopyin_timeval:EINVAL
4714 * sooptcopyin_timeval:EFAULT
4715 * sooptcopyin_timeval:EDOM
4716 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4717 * <pr_ctloutput>:???w
4718 * sflt_attach_private:??? [whatever a filter author chooses]
4719 * <sf_setoption>:??? [whatever a filter author chooses]
4720 *
4721 * Notes: Other <pru_listen> returns depend on the protocol family; all
4722 * <sf_listen> returns depend on what the filter author causes
4723 * their filter to return.
4724 */
1c79356b 4725int
39236c6e 4726sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
1c79356b
A
4727{
4728 int error, optval;
4729 struct linger l;
4730 struct timeval tv;
2d21ac55
A
4731#if CONFIG_MACF_SOCKET
4732 struct mac extmac;
4733#endif /* MAC_SOCKET */
91447636 4734
39236c6e
A
4735 if (sopt->sopt_dir != SOPT_SET)
4736 sopt->sopt_dir = SOPT_SET;
4737
4738 if (dolock)
4739 socket_lock(so, 1);
4740
4741 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4742 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
b0d623f7 4743 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
2d21ac55
A
4744 /* the socket has been shutdown, no more sockopt's */
4745 error = EINVAL;
39236c6e 4746 goto out;
9bccf70c
A
4747 }
4748
6d2010ae 4749 error = sflt_setsockopt(so, sopt);
39236c6e 4750 if (error != 0) {
6d2010ae
A
4751 if (error == EJUSTRETURN)
4752 error = 0;
39236c6e 4753 goto out;
1c79356b
A
4754 }
4755
1c79356b 4756 if (sopt->sopt_level != SOL_SOCKET) {
39236c6e
A
4757 if (so->so_proto != NULL &&
4758 so->so_proto->pr_ctloutput != NULL) {
2d21ac55 4759 error = (*so->so_proto->pr_ctloutput)(so, sopt);
39236c6e 4760 goto out;
91447636 4761 }
1c79356b
A
4762 error = ENOPROTOOPT;
4763 } else {
39236c6e
A
4764 /*
4765 * Allow socket-level (SOL_SOCKET) options to be filtered by
4766 * the protocol layer, if needed. A zero value returned from
4767 * the handler means use default socket-level processing as
4768 * done by the rest of this routine. Otherwise, any other
4769 * return value indicates that the option is unsupported.
4770 */
4771 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4772 pru_socheckopt(so, sopt)) != 0)
4773 goto out;
4774
4775 error = 0;
1c79356b
A
4776 switch (sopt->sopt_name) {
4777 case SO_LINGER:
91447636 4778 case SO_LINGER_SEC:
2d21ac55 4779 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
39236c6e
A
4780 if (error != 0)
4781 goto out;
1c79356b 4782
2d21ac55
A
4783 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4784 l.l_linger : l.l_linger * hz;
39236c6e 4785 if (l.l_onoff != 0)
1c79356b
A
4786 so->so_options |= SO_LINGER;
4787 else
4788 so->so_options &= ~SO_LINGER;
4789 break;
4790
4791 case SO_DEBUG:
4792 case SO_KEEPALIVE:
4793 case SO_DONTROUTE:
4794 case SO_USELOOPBACK:
4795 case SO_BROADCAST:
4796 case SO_REUSEADDR:
4797 case SO_REUSEPORT:
4798 case SO_OOBINLINE:
4799 case SO_TIMESTAMP:
6d2010ae 4800 case SO_TIMESTAMP_MONOTONIC:
1c79356b
A
4801 case SO_DONTTRUNC:
4802 case SO_WANTMORE:
9bccf70c 4803 case SO_WANTOOBFLAG:
fe8ab488 4804 case SO_NOWAKEFROMSLEEP:
39037602 4805 case SO_NOAPNFALLBK:
2d21ac55
A
4806 error = sooptcopyin(sopt, &optval, sizeof (optval),
4807 sizeof (optval));
39236c6e
A
4808 if (error != 0)
4809 goto out;
1c79356b
A
4810 if (optval)
4811 so->so_options |= sopt->sopt_name;
4812 else
4813 so->so_options &= ~sopt->sopt_name;
4814 break;
4815
4816 case SO_SNDBUF:
4817 case SO_RCVBUF:
4818 case SO_SNDLOWAT:
4819 case SO_RCVLOWAT:
2d21ac55
A
4820 error = sooptcopyin(sopt, &optval, sizeof (optval),
4821 sizeof (optval));
39236c6e
A
4822 if (error != 0)
4823 goto out;
1c79356b
A
4824
4825 /*
4826 * Values < 1 make no sense for any of these
4827 * options, so disallow them.
4828 */
4829 if (optval < 1) {
4830 error = EINVAL;
39236c6e 4831 goto out;
1c79356b
A
4832 }
4833
4834 switch (sopt->sopt_name) {
4835 case SO_SNDBUF:
39236c6e
A
4836 case SO_RCVBUF: {
4837 struct sockbuf *sb =
4838 (sopt->sopt_name == SO_SNDBUF) ?
4839 &so->so_snd : &so->so_rcv;
4840 if (sbreserve(sb, (u_int32_t)optval) == 0) {
1c79356b 4841 error = ENOBUFS;
39236c6e 4842 goto out;
1c79356b 4843 }
316670eb
A
4844 sb->sb_flags |= SB_USRSIZE;
4845 sb->sb_flags &= ~SB_AUTOSIZE;
4846 sb->sb_idealsize = (u_int32_t)optval;
1c79356b 4847 break;
316670eb 4848 }
1c79356b
A
4849 /*
4850 * Make sure the low-water is never greater than
4851 * the high-water.
4852 */
fe8ab488
A
4853 case SO_SNDLOWAT: {
4854 int space = sbspace(&so->so_snd);
4855 u_int32_t hiwat = so->so_snd.sb_hiwat;
4856
4857 if (so->so_snd.sb_flags & SB_UNIX) {
4858 struct unpcb *unp =
4859 (struct unpcb *)(so->so_pcb);
3e170ce0
A
4860 if (unp != NULL &&
4861 unp->unp_conn != NULL) {
fe8ab488
A
4862 hiwat += unp->unp_conn->unp_cc;
4863 }
4864 }
4865
1c79356b 4866 so->so_snd.sb_lowat =
fe8ab488
A
4867 (optval > hiwat) ?
4868 hiwat : optval;
4869
4870 if (space >= so->so_snd.sb_lowat) {
4871 sowwakeup(so);
4872 }
1c79356b 4873 break;
3e170ce0 4874 }
fe8ab488
A
4875 case SO_RCVLOWAT: {
4876 int64_t data_len;
1c79356b
A
4877 so->so_rcv.sb_lowat =
4878 (optval > so->so_rcv.sb_hiwat) ?
4879 so->so_rcv.sb_hiwat : optval;
3e170ce0 4880 data_len = so->so_rcv.sb_cc
fe8ab488
A
4881 - so->so_rcv.sb_ctl;
4882 if (data_len >= so->so_rcv.sb_lowat)
4883 sorwakeup(so);
1c79356b
A
4884 break;
4885 }
fe8ab488 4886 }
1c79356b
A
4887 break;
4888
4889 case SO_SNDTIMEO:
4890 case SO_RCVTIMEO:
2d21ac55 4891 error = sooptcopyin_timeval(sopt, &tv);
39236c6e
A
4892 if (error != 0)
4893 goto out;
1c79356b 4894
1c79356b
A
4895 switch (sopt->sopt_name) {
4896 case SO_SNDTIMEO:
91447636 4897 so->so_snd.sb_timeo = tv;
1c79356b
A
4898 break;
4899 case SO_RCVTIMEO:
91447636 4900 so->so_rcv.sb_timeo = tv;
1c79356b
A
4901 break;
4902 }
4903 break;
4904
39236c6e 4905 case SO_NKE: {
9bccf70c 4906 struct so_nke nke;
1c79356b 4907
2d21ac55
A
4908 error = sooptcopyin(sopt, &nke, sizeof (nke),
4909 sizeof (nke));
39236c6e
A
4910 if (error != 0)
4911 goto out;
1c79356b 4912
6d2010ae 4913 error = sflt_attach_internal(so, nke.nke_handle);
1c79356b
A
4914 break;
4915 }
4916
9bccf70c 4917 case SO_NOSIGPIPE:
2d21ac55
A
4918 error = sooptcopyin(sopt, &optval, sizeof (optval),
4919 sizeof (optval));
39236c6e
A
4920 if (error != 0)
4921 goto out;
4922 if (optval != 0)
2d21ac55
A
4923 so->so_flags |= SOF_NOSIGPIPE;
4924 else
4925 so->so_flags &= ~SOF_NOSIGPIPE;
9bccf70c
A
4926 break;
4927
55e303ae 4928 case SO_NOADDRERR:
2d21ac55
A
4929 error = sooptcopyin(sopt, &optval, sizeof (optval),
4930 sizeof (optval));
39236c6e
A
4931 if (error != 0)
4932 goto out;
4933 if (optval != 0)
2d21ac55
A
4934 so->so_flags |= SOF_NOADDRAVAIL;
4935 else
4936 so->so_flags &= ~SOF_NOADDRAVAIL;
2d21ac55
A
4937 break;
4938
4939 case SO_REUSESHAREUID:
4940 error = sooptcopyin(sopt, &optval, sizeof (optval),
4941 sizeof (optval));
39236c6e
A
4942 if (error != 0)
4943 goto out;
4944 if (optval != 0)
2d21ac55
A
4945 so->so_flags |= SOF_REUSESHAREUID;
4946 else
4947 so->so_flags &= ~SOF_REUSESHAREUID;
4948 break;
39236c6e 4949
2d21ac55
A
4950 case SO_NOTIFYCONFLICT:
4951 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4952 error = EPERM;
39236c6e 4953 goto out;
2d21ac55
A
4954 }
4955 error = sooptcopyin(sopt, &optval, sizeof (optval),
4956 sizeof (optval));
39236c6e
A
4957 if (error != 0)
4958 goto out;
4959 if (optval != 0)
2d21ac55
A
4960 so->so_flags |= SOF_NOTIFYCONFLICT;
4961 else
4962 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4963 break;
39236c6e 4964
2d21ac55 4965 case SO_RESTRICTIONS:
2d21ac55
A
4966 error = sooptcopyin(sopt, &optval, sizeof (optval),
4967 sizeof (optval));
39236c6e
A
4968 if (error != 0)
4969 goto out;
4970
4971 error = so_set_restrictions(so, optval);
2d21ac55
A
4972 break;
4973
fe8ab488
A
4974 case SO_AWDL_UNRESTRICTED:
4975 if (SOCK_DOM(so) != PF_INET &&
4976 SOCK_DOM(so) != PF_INET6) {
4977 error = EOPNOTSUPP;
4978 goto out;
4979 }
4980 error = sooptcopyin(sopt, &optval, sizeof(optval),
4981 sizeof(optval));
4982 if (error != 0)
4983 goto out;
4984 if (optval != 0) {
39037602 4985 error = soopt_cred_check(so,
5ba3f43e 4986 PRIV_NET_RESTRICTED_AWDL, false);
fe8ab488
A
4987 if (error == 0)
4988 inp_set_awdl_unrestricted(
4989 sotoinpcb(so));
fe8ab488
A
4990 } else
4991 inp_clear_awdl_unrestricted(sotoinpcb(so));
4992 break;
39037602
A
4993 case SO_INTCOPROC_ALLOW:
4994 if (SOCK_DOM(so) != PF_INET6) {
4995 error = EOPNOTSUPP;
4996 goto out;
4997 }
4998 error = sooptcopyin(sopt, &optval, sizeof(optval),
4999 sizeof(optval));
5000 if (error != 0)
5001 goto out;
743345f9
A
5002 if (optval != 0 &&
5003 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
39037602 5004 error = soopt_cred_check(so,
5ba3f43e 5005 PRIV_NET_RESTRICTED_INTCOPROC, false);
39037602
A
5006 if (error == 0)
5007 inp_set_intcoproc_allowed(
5008 sotoinpcb(so));
743345f9 5009 } else if (optval == 0)
39037602
A
5010 inp_clear_intcoproc_allowed(sotoinpcb(so));
5011 break;
fe8ab488 5012
2d21ac55
A
5013 case SO_LABEL:
5014#if CONFIG_MACF_SOCKET
5015 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5016 sizeof (extmac))) != 0)
39236c6e 5017 goto out;
2d21ac55
A
5018
5019 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5020 so, &extmac);
5021#else
5022 error = EOPNOTSUPP;
5023#endif /* MAC_SOCKET */
55e303ae
A
5024 break;
5025
4a3eedf9
A
5026 case SO_UPCALLCLOSEWAIT:
5027 error = sooptcopyin(sopt, &optval, sizeof (optval),
5028 sizeof (optval));
39236c6e
A
5029 if (error != 0)
5030 goto out;
5031 if (optval != 0)
4a3eedf9
A
5032 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5033 else
5034 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5035 break;
4a3eedf9 5036
b0d623f7
A
5037 case SO_RANDOMPORT:
5038 error = sooptcopyin(sopt, &optval, sizeof (optval),
5039 sizeof (optval));
39236c6e
A
5040 if (error != 0)
5041 goto out;
5042 if (optval != 0)
b0d623f7
A
5043 so->so_flags |= SOF_BINDRANDOMPORT;
5044 else
5045 so->so_flags &= ~SOF_BINDRANDOMPORT;
5046 break;
5047
5048 case SO_NP_EXTENSIONS: {
5049 struct so_np_extensions sonpx;
5050
39236c6e
A
5051 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
5052 sizeof (sonpx));
5053 if (error != 0)
5054 goto out;
b0d623f7
A
5055 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5056 error = EINVAL;
39236c6e 5057 goto out;
b0d623f7
A
5058 }
5059 /*
5060 * Only one bit defined for now
5061 */
5062 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5063 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
5064 so->so_flags |= SOF_NPX_SETOPTSHUT;
5065 else
5066 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5067 }
5068 break;
5069 }
5070
d41d1dae
A
5071 case SO_TRAFFIC_CLASS: {
5072 error = sooptcopyin(sopt, &optval, sizeof (optval),
39236c6e
A
5073 sizeof (optval));
5074 if (error != 0)
5075 goto out;
39037602
A
5076 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5077 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5078 error = so_set_net_service_type(so, netsvc);
5079 goto out;
5080 }
6d2010ae 5081 error = so_set_traffic_class(so, optval);
39236c6e
A
5082 if (error != 0)
5083 goto out;
39037602
A
5084 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5085 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
6d2010ae 5086 break;
d41d1dae 5087 }
6d2010ae
A
5088
5089 case SO_RECV_TRAFFIC_CLASS: {
5090 error = sooptcopyin(sopt, &optval, sizeof (optval),
39236c6e
A
5091 sizeof (optval));
5092 if (error != 0)
5093 goto out;
6d2010ae
A
5094 if (optval == 0)
5095 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5096 else
5097 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5098 break;
5099 }
316670eb 5100
39037602 5101#if (DEVELOPMENT || DEBUG)
6d2010ae
A
5102 case SO_TRAFFIC_CLASS_DBG: {
5103 struct so_tcdbg so_tcdbg;
316670eb
A
5104
5105 error = sooptcopyin(sopt, &so_tcdbg,
5106 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
39236c6e
A
5107 if (error != 0)
5108 goto out;
6d2010ae 5109 error = so_set_tcdbg(so, &so_tcdbg);
39236c6e
A
5110 if (error != 0)
5111 goto out;
6d2010ae
A
5112 break;
5113 }
39037602 5114#endif /* (DEVELOPMENT || DEBUG) */
316670eb
A
5115
5116 case SO_PRIVILEGED_TRAFFIC_CLASS:
5117 error = priv_check_cred(kauth_cred_get(),
5118 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
39236c6e
A
5119 if (error != 0)
5120 goto out;
316670eb 5121 error = sooptcopyin(sopt, &optval, sizeof (optval),
39236c6e
A
5122 sizeof (optval));
5123 if (error != 0)
5124 goto out;
316670eb
A
5125 if (optval == 0)
5126 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5127 else
5128 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5129 break;
5130
6d2010ae
A
5131 case SO_DEFUNCTOK:
5132 error = sooptcopyin(sopt, &optval, sizeof (optval),
5133 sizeof (optval));
5134 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5135 if (error == 0)
5136 error = EBADF;
39236c6e 5137 goto out;
6d2010ae
A
5138 }
5139 /*
5140 * Any process can set SO_DEFUNCTOK (clear
5141 * SOF_NODEFUNCT), but only root can clear
5142 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5143 */
5144 if (optval == 0 &&
5145 kauth_cred_issuser(kauth_cred_get()) == 0) {
5146 error = EPERM;
39236c6e 5147 goto out;
6d2010ae
A
5148 }
5149 if (optval)
5150 so->so_flags &= ~SOF_NODEFUNCT;
5151 else
5152 so->so_flags |= SOF_NODEFUNCT;
5153
39236c6e
A
5154 if (SOCK_DOM(so) == PF_INET ||
5155 SOCK_DOM(so) == PF_INET6) {
5156 char s[MAX_IPv6_STR_LEN];
5157 char d[MAX_IPv6_STR_LEN];
5158 struct inpcb *inp = sotoinpcb(so);
5159
39037602
A
5160 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5161 "[%s %s:%d -> %s:%d] is now marked "
5162 "as %seligible for "
39236c6e 5163 "defunct\n", __func__, proc_selfpid(),
39037602 5164 proc_best_name(current_proc()),
3e170ce0 5165 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39236c6e
A
5166 (SOCK_TYPE(so) == SOCK_STREAM) ?
5167 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5168 ((SOCK_DOM(so) == PF_INET) ?
5169 (void *)&inp->inp_laddr.s_addr :
5170 (void *)&inp->in6p_laddr), s, sizeof (s)),
5171 ntohs(inp->in6p_lport),
5172 inet_ntop(SOCK_DOM(so),
5173 (SOCK_DOM(so) == PF_INET) ?
5174 (void *)&inp->inp_faddr.s_addr :
5175 (void *)&inp->in6p_faddr, d, sizeof (d)),
5176 ntohs(inp->in6p_fport),
5177 (so->so_flags & SOF_NODEFUNCT) ?
39037602 5178 "not " : "");
39236c6e 5179 } else {
39037602
A
5180 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5181 "is now marked as %seligible for "
5182 "defunct\n",
39236c6e 5183 __func__, proc_selfpid(),
39037602 5184 proc_best_name(current_proc()),
3e170ce0 5185 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39236c6e
A
5186 SOCK_DOM(so), SOCK_TYPE(so),
5187 (so->so_flags & SOF_NODEFUNCT) ?
39037602 5188 "not " : "");
39236c6e 5189 }
6d2010ae
A
5190 break;
5191
5192 case SO_ISDEFUNCT:
5193 /* This option is not settable */
5194 error = EINVAL;
5195 break;
d41d1dae 5196
316670eb
A
5197 case SO_OPPORTUNISTIC:
5198 error = sooptcopyin(sopt, &optval, sizeof (optval),
5199 sizeof (optval));
5200 if (error == 0)
5201 error = so_set_opportunistic(so, optval);
5202 break;
5203
5204 case SO_FLUSH:
5205 /* This option is handled by lower layer(s) */
5206 error = 0;
5207 break;
5208
5209 case SO_RECV_ANYIF:
5210 error = sooptcopyin(sopt, &optval, sizeof (optval),
5211 sizeof (optval));
5212 if (error == 0)
5213 error = so_set_recv_anyif(so, optval);
5214 break;
5215
39236c6e
A
5216 case SO_TRAFFIC_MGT_BACKGROUND: {
5217 /* This option is handled by lower layer(s) */
5218 error = 0;
5219 break;
5220 }
5221
5222#if FLOW_DIVERT
5223 case SO_FLOW_DIVERT_TOKEN:
5224 error = flow_divert_token_set(so, sopt);
5225 break;
5226#endif /* FLOW_DIVERT */
5227
5228
5229 case SO_DELEGATED:
5230 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5231 sizeof (optval))) != 0)
5232 break;
5233
5234 error = so_set_effective_pid(so, optval, sopt->sopt_p);
5235 break;
5236
5237 case SO_DELEGATED_UUID: {
5238 uuid_t euuid;
5239
5240 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5241 sizeof (euuid))) != 0)
5242 break;
5243
5244 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5245 break;
5246 }
3e170ce0 5247
fe8ab488
A
5248#if NECP
5249 case SO_NECP_ATTRIBUTES:
5250 error = necp_set_socket_attributes(so, sopt);
5251 break;
fe8ab488 5252
5ba3f43e
A
5253 case SO_NECP_CLIENTUUID:
5254 if (SOCK_DOM(so) == PF_MULTIPATH) {
5255 /* Handled by MPTCP itself */
fe8ab488
A
5256 break;
5257 }
5258
5ba3f43e
A
5259 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5260 error = EINVAL;
fe8ab488 5261 goto out;
5ba3f43e
A
5262 }
5263
5264 struct inpcb *inp = sotoinpcb(so);
5265 if (!uuid_is_null(inp->necp_client_uuid)) {
5266 // Clear out the old client UUID if present
5267 necp_inpcb_remove_cb(inp);
5268 }
5269
5270 error = sooptcopyin(sopt, &inp->necp_client_uuid,
5271 sizeof(uuid_t), sizeof(uuid_t));
5272 if (error != 0) {
5273 goto out;
5274 }
5275
5276 if (uuid_is_null(inp->necp_client_uuid)) {
5277 error = EINVAL;
5278 goto out;
5279 }
5280
5281 error = necp_client_register_socket_flow(so->last_pid,
5282 inp->necp_client_uuid, inp);
5283 if (error != 0) {
5284 uuid_clear(inp->necp_client_uuid);
5285 goto out;
5286 }
5287
5288 if (inp->inp_lport != 0) {
5289 // There is bound local port, so this is not
5290 // a fresh socket. Assign to the client.
5291 necp_client_assign_from_socket(so->last_pid, inp->necp_client_uuid, inp);
5292 }
5293
fe8ab488 5294 break;
5ba3f43e 5295#endif /* NECP */
39236c6e 5296
3e170ce0
A
5297 case SO_EXTENDED_BK_IDLE:
5298 error = sooptcopyin(sopt, &optval, sizeof (optval),
5299 sizeof (optval));
5300 if (error == 0)
5301 error = so_set_extended_bk_idle(so, optval);
5302 break;
5303
490019cf
A
5304 case SO_MARK_CELLFALLBACK:
5305 error = sooptcopyin(sopt, &optval, sizeof(optval),
5306 sizeof(optval));
5307 if (error != 0)
5308 goto out;
5309 if (optval < 0) {
5310 error = EINVAL;
5311 goto out;
5312 }
5313 if (optval == 0)
5314 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5315 else
5316 so->so_flags1 |= SOF1_CELLFALLBACK;
5317 break;
39037602
A
5318
5319 case SO_NET_SERVICE_TYPE: {
5320 error = sooptcopyin(sopt, &optval, sizeof(optval),
5321 sizeof(optval));
5322 if (error != 0)
5323 goto out;
5324 error = so_set_net_service_type(so, optval);
5325 break;
5326 }
5327
5328 case SO_QOSMARKING_POLICY_OVERRIDE:
5329 error = priv_check_cred(kauth_cred_get(),
5330 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
5331 if (error != 0)
5332 goto out;
5333 error = sooptcopyin(sopt, &optval, sizeof(optval),
5334 sizeof(optval));
5335 if (error != 0)
5336 goto out;
5337 if (optval == 0)
5338 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5339 else
5340 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5341 break;
5342
1c79356b
A
5343 default:
5344 error = ENOPROTOOPT;
5345 break;
5346 }
39236c6e
A
5347 if (error == 0 && so->so_proto != NULL &&
5348 so->so_proto->pr_ctloutput != NULL) {
5349 (void) so->so_proto->pr_ctloutput(so, sopt);
1c79356b
A
5350 }
5351 }
39236c6e
A
5352out:
5353 if (dolock)
5354 socket_unlock(so, 1);
1c79356b
A
5355 return (error);
5356}
5357
2d21ac55 5358/* Helper routines for getsockopt */
1c79356b 5359int
2d21ac55 5360sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
1c79356b
A
5361{
5362 int error;
5363 size_t valsize;
5364
5365 error = 0;
5366
5367 /*
5368 * Documented get behavior is that we always return a value,
5369 * possibly truncated to fit in the user's buffer.
5370 * Traditional behavior is that we always tell the user
5371 * precisely how much we copied, rather than something useful
5372 * like the total amount we had available for her.
5373 * Note that this interface is not idempotent; the entire answer must
5374 * generated ahead of time.
5375 */
5376 valsize = min(len, sopt->sopt_valsize);
5377 sopt->sopt_valsize = valsize;
91447636 5378 if (sopt->sopt_val != USER_ADDR_NULL) {
b0d623f7 5379 if (sopt->sopt_p != kernproc)
1c79356b
A
5380 error = copyout(buf, sopt->sopt_val, valsize);
5381 else
91447636 5382 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
1c79356b 5383 }
2d21ac55
A
5384 return (error);
5385}
5386
5387static int
39236c6e 5388sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
2d21ac55
A
5389{
5390 int error;
5391 size_t len;
5ba3f43e
A
5392 struct user64_timeval tv64 = {};
5393 struct user32_timeval tv32 = {};
2d21ac55
A
5394 const void * val;
5395 size_t valsize;
b0d623f7 5396
2d21ac55
A
5397 error = 0;
5398 if (proc_is64bit(sopt->sopt_p)) {
39236c6e 5399 len = sizeof (tv64);
2d21ac55
A
5400 tv64.tv_sec = tv_p->tv_sec;
5401 tv64.tv_usec = tv_p->tv_usec;
5402 val = &tv64;
5403 } else {
39236c6e 5404 len = sizeof (tv32);
b0d623f7
A
5405 tv32.tv_sec = tv_p->tv_sec;
5406 tv32.tv_usec = tv_p->tv_usec;
5407 val = &tv32;
2d21ac55
A
5408 }
5409 valsize = min(len, sopt->sopt_valsize);
5410 sopt->sopt_valsize = valsize;
5411 if (sopt->sopt_val != USER_ADDR_NULL) {
b0d623f7 5412 if (sopt->sopt_p != kernproc)
2d21ac55
A
5413 error = copyout(val, sopt->sopt_val, valsize);
5414 else
5415 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5416 }
5417 return (error);
1c79356b
A
5418}
5419
2d21ac55
A
5420/*
5421 * Return: 0 Success
5422 * ENOPROTOOPT
5423 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5424 * <pr_ctloutput>:???
5425 * <sf_getoption>:???
5426 */
1c79356b 5427int
39236c6e 5428sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
1c79356b
A
5429{
5430 int error, optval;
5431 struct linger l;
5432 struct timeval tv;
2d21ac55
A
5433#if CONFIG_MACF_SOCKET
5434 struct mac extmac;
5435#endif /* MAC_SOCKET */
1c79356b 5436
39236c6e 5437 if (sopt->sopt_dir != SOPT_GET)
2d21ac55 5438 sopt->sopt_dir = SOPT_GET;
9bccf70c 5439
39236c6e
A
5440 if (dolock)
5441 socket_lock(so, 1);
2d21ac55 5442
6d2010ae 5443 error = sflt_getsockopt(so, sopt);
39236c6e 5444 if (error != 0) {
6d2010ae
A
5445 if (error == EJUSTRETURN)
5446 error = 0;
39236c6e 5447 goto out;
1c79356b 5448 }
39236c6e 5449
1c79356b 5450 if (sopt->sopt_level != SOL_SOCKET) {
39236c6e
A
5451 if (so->so_proto != NULL &&
5452 so->so_proto->pr_ctloutput != NULL) {
2d21ac55 5453 error = (*so->so_proto->pr_ctloutput)(so, sopt);
39236c6e 5454 goto out;
91447636 5455 }
39236c6e 5456 error = ENOPROTOOPT;
1c79356b 5457 } else {
39236c6e
A
5458 /*
5459 * Allow socket-level (SOL_SOCKET) options to be filtered by
5460 * the protocol layer, if needed. A zero value returned from
5461 * the handler means use default socket-level processing as
5462 * done by the rest of this routine. Otherwise, any other
5463 * return value indicates that the option is unsupported.
5464 */
5465 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5466 pru_socheckopt(so, sopt)) != 0)
5467 goto out;
5468
5469 error = 0;
1c79356b
A
5470 switch (sopt->sopt_name) {
5471 case SO_LINGER:
91447636 5472 case SO_LINGER_SEC:
39236c6e 5473 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
2d21ac55
A
5474 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5475 so->so_linger : so->so_linger / hz;
5476 error = sooptcopyout(sopt, &l, sizeof (l));
1c79356b
A
5477 break;
5478
5479 case SO_USELOOPBACK:
5480 case SO_DONTROUTE:
5481 case SO_DEBUG:
5482 case SO_KEEPALIVE:
5483 case SO_REUSEADDR:
5484 case SO_REUSEPORT:
5485 case SO_BROADCAST:
5486 case SO_OOBINLINE:
5487 case SO_TIMESTAMP:
6d2010ae 5488 case SO_TIMESTAMP_MONOTONIC:
1c79356b
A
5489 case SO_DONTTRUNC:
5490 case SO_WANTMORE:
9bccf70c 5491 case SO_WANTOOBFLAG:
fe8ab488 5492 case SO_NOWAKEFROMSLEEP:
39037602 5493 case SO_NOAPNFALLBK:
1c79356b
A
5494 optval = so->so_options & sopt->sopt_name;
5495integer:
2d21ac55 5496 error = sooptcopyout(sopt, &optval, sizeof (optval));
1c79356b
A
5497 break;
5498
5499 case SO_TYPE:
5500 optval = so->so_type;
5501 goto integer;
5502
5503 case SO_NREAD:
2d21ac55
A
5504 if (so->so_proto->pr_flags & PR_ATOMIC) {
5505 int pkt_total;
5506 struct mbuf *m1;
1c79356b 5507
2d21ac55
A
5508 pkt_total = 0;
5509 m1 = so->so_rcv.sb_mb;
39236c6e
A
5510 while (m1 != NULL) {
5511 if (m1->m_type == MT_DATA ||
5512 m1->m_type == MT_HEADER ||
5513 m1->m_type == MT_OOBDATA)
1c79356b 5514 pkt_total += m1->m_len;
1c79356b
A
5515 m1 = m1->m_next;
5516 }
5517 optval = pkt_total;
2d21ac55
A
5518 } else {
5519 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5520 }
1c79356b 5521 goto integer;
39236c6e 5522
fe8ab488
A
5523 case SO_NUMRCVPKT:
5524 if (so->so_proto->pr_flags & PR_ATOMIC) {
5525 int cnt = 0;
5526 struct mbuf *m1;
5527
5528 m1 = so->so_rcv.sb_mb;
5529 while (m1 != NULL) {
5530 if (m1->m_type == MT_DATA ||
5531 m1->m_type == MT_HEADER ||
5532 m1->m_type == MT_OOBDATA)
5533 cnt += 1;
5534 m1 = m1->m_nextpkt;
5535 }
5536 optval = cnt;
5537 goto integer;
5538 } else {
5539 error = EINVAL;
5540 break;
5541 }
5542
91447636
A
5543 case SO_NWRITE:
5544 optval = so->so_snd.sb_cc;
2d21ac55 5545 goto integer;
39236c6e 5546
1c79356b
A
5547 case SO_ERROR:
5548 optval = so->so_error;
5549 so->so_error = 0;
5550 goto integer;
5551
fe8ab488
A
5552 case SO_SNDBUF: {
5553 u_int32_t hiwat = so->so_snd.sb_hiwat;
1c79356b 5554
fe8ab488
A
5555 if (so->so_snd.sb_flags & SB_UNIX) {
5556 struct unpcb *unp =
5557 (struct unpcb *)(so->so_pcb);
5558 if (unp != NULL && unp->unp_conn != NULL) {
5559 hiwat += unp->unp_conn->unp_cc;
5560 }
5561 }
5562
5563 optval = hiwat;
5564 goto integer;
5565 }
1c79356b
A
5566 case SO_RCVBUF:
5567 optval = so->so_rcv.sb_hiwat;
5568 goto integer;
5569
5570 case SO_SNDLOWAT:
5571 optval = so->so_snd.sb_lowat;
5572 goto integer;
5573
5574 case SO_RCVLOWAT:
5575 optval = so->so_rcv.sb_lowat;
5576 goto integer;
5577
5578 case SO_SNDTIMEO:
5579 case SO_RCVTIMEO:
91447636 5580 tv = (sopt->sopt_name == SO_SNDTIMEO ?
2d21ac55 5581 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1c79356b 5582
2d21ac55
A
5583 error = sooptcopyout_timeval(sopt, &tv);
5584 break;
1c79356b 5585
91447636
A
5586 case SO_NOSIGPIPE:
5587 optval = (so->so_flags & SOF_NOSIGPIPE);
5588 goto integer;
9bccf70c 5589
55e303ae 5590 case SO_NOADDRERR:
91447636
A
5591 optval = (so->so_flags & SOF_NOADDRAVAIL);
5592 goto integer;
55e303ae 5593
2d21ac55
A
5594 case SO_REUSESHAREUID:
5595 optval = (so->so_flags & SOF_REUSESHAREUID);
5596 goto integer;
5597
39236c6e 5598
2d21ac55
A
5599 case SO_NOTIFYCONFLICT:
5600 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5601 goto integer;
39236c6e 5602
2d21ac55 5603 case SO_RESTRICTIONS:
39236c6e 5604 optval = so_get_restrictions(so);
2d21ac55
A
5605 goto integer;
5606
fe8ab488 5607 case SO_AWDL_UNRESTRICTED:
3e170ce0 5608 if (SOCK_DOM(so) == PF_INET ||
fe8ab488
A
5609 SOCK_DOM(so) == PF_INET6) {
5610 optval = inp_get_awdl_unrestricted(
5611 sotoinpcb(so));
5612 goto integer;
5613 } else
5614 error = EOPNOTSUPP;
5615 break;
5616
39037602
A
5617 case SO_INTCOPROC_ALLOW:
5618 if (SOCK_DOM(so) == PF_INET6) {
5619 optval = inp_get_intcoproc_allowed(
5620 sotoinpcb(so));
5621 goto integer;
5622 } else
5623 error = EOPNOTSUPP;
5624 break;
5625
2d21ac55
A
5626 case SO_LABEL:
5627#if CONFIG_MACF_SOCKET
5628 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5629 sizeof (extmac))) != 0 ||
5630 (error = mac_socket_label_get(proc_ucred(
5631 sopt->sopt_p), so, &extmac)) != 0)
5632 break;
5633
5634 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5635#else
5636 error = EOPNOTSUPP;
5637#endif /* MAC_SOCKET */
5638 break;
5639
5640 case SO_PEERLABEL:
5641#if CONFIG_MACF_SOCKET
5642 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5643 sizeof (extmac))) != 0 ||
5644 (error = mac_socketpeer_label_get(proc_ucred(
5645 sopt->sopt_p), so, &extmac)) != 0)
5646 break;
5647
5648 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5649#else
5650 error = EOPNOTSUPP;
5651#endif /* MAC_SOCKET */
5652 break;
5653
4a3eedf9
A
5654#ifdef __APPLE_API_PRIVATE
5655 case SO_UPCALLCLOSEWAIT:
5656 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5657 goto integer;
5658#endif
b0d623f7
A
5659 case SO_RANDOMPORT:
5660 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5661 goto integer;
5662
5663 case SO_NP_EXTENSIONS: {
527f9951 5664 struct so_np_extensions sonpx = {};
b0d623f7 5665
39236c6e
A
5666 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5667 SONPX_SETOPTSHUT : 0;
b0d623f7 5668 sonpx.npx_mask = SONPX_MASK_VALID;
4a3eedf9 5669
39236c6e
A
5670 error = sooptcopyout(sopt, &sonpx,
5671 sizeof (struct so_np_extensions));
5672 break;
b0d623f7 5673 }
6d2010ae 5674
d41d1dae
A
5675 case SO_TRAFFIC_CLASS:
5676 optval = so->so_traffic_class;
5677 goto integer;
316670eb 5678
6d2010ae
A
5679 case SO_RECV_TRAFFIC_CLASS:
5680 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5681 goto integer;
5682
5683 case SO_TRAFFIC_CLASS_STATS:
39236c6e
A
5684 error = sooptcopyout(sopt, &so->so_tc_stats,
5685 sizeof (so->so_tc_stats));
316670eb 5686 break;
6d2010ae 5687
39037602 5688#if (DEVELOPMENT || DEBUG)
39236c6e 5689 case SO_TRAFFIC_CLASS_DBG:
6d2010ae
A
5690 error = sogetopt_tcdbg(so, sopt);
5691 break;
39037602 5692#endif /* (DEVELOPMENT || DEBUG) */
316670eb
A
5693
5694 case SO_PRIVILEGED_TRAFFIC_CLASS:
5695 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5696 goto integer;
5697
6d2010ae
A
5698 case SO_DEFUNCTOK:
5699 optval = !(so->so_flags & SOF_NODEFUNCT);
5700 goto integer;
5701
5702 case SO_ISDEFUNCT:
5703 optval = (so->so_flags & SOF_DEFUNCT);
5704 goto integer;
d41d1dae 5705
316670eb
A
5706 case SO_OPPORTUNISTIC:
5707 optval = so_get_opportunistic(so);
5708 goto integer;
5709
5710 case SO_FLUSH:
5711 /* This option is not gettable */
5712 error = EINVAL;
5713 break;
5714
5715 case SO_RECV_ANYIF:
5716 optval = so_get_recv_anyif(so);
5717 goto integer;
5718
39236c6e
A
5719 case SO_TRAFFIC_MGT_BACKGROUND:
5720 /* This option is handled by lower layer(s) */
5721 if (so->so_proto != NULL &&
5722 so->so_proto->pr_ctloutput != NULL) {
5723 (void) so->so_proto->pr_ctloutput(so, sopt);
5724 }
5725 break;
5726
5727#if FLOW_DIVERT
5728 case SO_FLOW_DIVERT_TOKEN:
5729 error = flow_divert_token_get(so, sopt);
5730 break;
5731#endif /* FLOW_DIVERT */
3e170ce0 5732
fe8ab488
A
5733#if NECP
5734 case SO_NECP_ATTRIBUTES:
5735 error = necp_get_socket_attributes(so, sopt);
5736 break;
5ba3f43e
A
5737
5738 case SO_NECP_CLIENTUUID:
5739 {
5740 uuid_t *ncu;
5741
5742 if (SOCK_DOM(so) == PF_MULTIPATH) {
5743 ncu = &mpsotomppcb(so)->necp_client_uuid;
5744 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5745 ncu = &sotoinpcb(so)->necp_client_uuid;
5746 } else {
5747 error = EINVAL;
5748 goto out;
5749 }
5750
5751 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
5752 break;
5753 }
fe8ab488
A
5754#endif /* NECP */
5755
5756#if CONTENT_FILTER
5757 case SO_CFIL_SOCK_ID: {
5758 cfil_sock_id_t sock_id;
5759
5760 sock_id = cfil_sock_id_from_socket(so);
5761
3e170ce0 5762 error = sooptcopyout(sopt, &sock_id,
fe8ab488
A
5763 sizeof(cfil_sock_id_t));
5764 break;
5765 }
5766#endif /* CONTENT_FILTER */
5767
3e170ce0
A
5768 case SO_EXTENDED_BK_IDLE:
5769 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5770 goto integer;
490019cf
A
5771 case SO_MARK_CELLFALLBACK:
5772 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
5773 ? 1 : 0;
5774 goto integer;
39037602
A
5775 case SO_NET_SERVICE_TYPE: {
5776 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
5777 optval = so->so_netsvctype;
5778 else
5779 optval = NET_SERVICE_TYPE_BE;
5780 goto integer;
5781 }
5782 case SO_NETSVC_MARKING_LEVEL:
5783 optval = so_get_netsvc_marking_level(so);
5784 goto integer;
5785
1c79356b
A
5786 default:
5787 error = ENOPROTOOPT;
5788 break;
5789 }
1c79356b 5790 }
39236c6e
A
5791out:
5792 if (dolock)
5793 socket_unlock(so, 1);
5794 return (error);
1c79356b 5795}
39236c6e
A
5796
5797/*
5798 * The size limits on our soopt_getm is different from that on FreeBSD.
6d2010ae
A
5799 * We limit the size of options to MCLBYTES. This will have to change
5800 * if we need to define options that need more space than MCLBYTES.
5801 */
1c79356b 5802int
9bccf70c 5803soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1c79356b
A
5804{
5805 struct mbuf *m, *m_prev;
5806 int sopt_size = sopt->sopt_valsize;
b0d623f7 5807 int how;
1c79356b 5808
6d2010ae 5809 if (sopt_size <= 0 || sopt_size > MCLBYTES)
2d21ac55 5810 return (EMSGSIZE);
a3d08fcd 5811
b0d623f7
A
5812 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5813 MGET(m, how, MT_DATA);
39236c6e 5814 if (m == NULL)
2d21ac55 5815 return (ENOBUFS);
1c79356b 5816 if (sopt_size > MLEN) {
b0d623f7 5817 MCLGET(m, how);
1c79356b
A
5818 if ((m->m_flags & M_EXT) == 0) {
5819 m_free(m);
2d21ac55 5820 return (ENOBUFS);
1c79356b
A
5821 }
5822 m->m_len = min(MCLBYTES, sopt_size);
5823 } else {
5824 m->m_len = min(MLEN, sopt_size);
5825 }
5826 sopt_size -= m->m_len;
5827 *mp = m;
5828 m_prev = m;
5829
6d2010ae 5830 while (sopt_size > 0) {
b0d623f7 5831 MGET(m, how, MT_DATA);
39236c6e 5832 if (m == NULL) {
1c79356b 5833 m_freem(*mp);
2d21ac55 5834 return (ENOBUFS);
1c79356b
A
5835 }
5836 if (sopt_size > MLEN) {
b0d623f7 5837 MCLGET(m, how);
1c79356b
A
5838 if ((m->m_flags & M_EXT) == 0) {
5839 m_freem(*mp);
6d2010ae 5840 m_freem(m);
2d21ac55 5841 return (ENOBUFS);
1c79356b
A
5842 }
5843 m->m_len = min(MCLBYTES, sopt_size);
5844 } else {
5845 m->m_len = min(MLEN, sopt_size);
5846 }
5847 sopt_size -= m->m_len;
5848 m_prev->m_next = m;
5849 m_prev = m;
5850 }
2d21ac55 5851 return (0);
1c79356b
A
5852}
5853
6d2010ae 5854/* copyin sopt data into mbuf chain */
1c79356b 5855int
9bccf70c 5856soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
1c79356b
A
5857{
5858 struct mbuf *m0 = m;
5859
91447636 5860 if (sopt->sopt_val == USER_ADDR_NULL)
2d21ac55 5861 return (0);
1c79356b 5862 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
b0d623f7 5863 if (sopt->sopt_p != kernproc) {
1c79356b
A
5864 int error;
5865
2d21ac55
A
5866 error = copyin(sopt->sopt_val, mtod(m, char *),
5867 m->m_len);
1c79356b
A
5868 if (error != 0) {
5869 m_freem(m0);
2d21ac55 5870 return (error);
1c79356b 5871 }
2d21ac55
A
5872 } else {
5873 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5874 mtod(m, char *), m->m_len);
5875 }
1c79356b 5876 sopt->sopt_valsize -= m->m_len;
2d21ac55 5877 sopt->sopt_val += m->m_len;
1c79356b
A
5878 m = m->m_next;
5879 }
39236c6e
A
5880 /* should be allocated enoughly at ip6_sooptmcopyin() */
5881 if (m != NULL) {
9bccf70c 5882 panic("soopt_mcopyin");
39236c6e
A
5883 /* NOTREACHED */
5884 }
2d21ac55 5885 return (0);
1c79356b
A
5886}
5887
6d2010ae 5888/* copyout mbuf chain data into soopt */
1c79356b 5889int
9bccf70c 5890soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
1c79356b
A
5891{
5892 struct mbuf *m0 = m;
5893 size_t valsize = 0;
5894
91447636 5895 if (sopt->sopt_val == USER_ADDR_NULL)
2d21ac55 5896 return (0);
1c79356b 5897 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
b0d623f7 5898 if (sopt->sopt_p != kernproc) {
1c79356b
A
5899 int error;
5900
2d21ac55
A
5901 error = copyout(mtod(m, char *), sopt->sopt_val,
5902 m->m_len);
1c79356b
A
5903 if (error != 0) {
5904 m_freem(m0);
2d21ac55 5905 return (error);
1c79356b 5906 }
2d21ac55
A
5907 } else {
5908 bcopy(mtod(m, char *),
5909 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5910 }
5911 sopt->sopt_valsize -= m->m_len;
5912 sopt->sopt_val += m->m_len;
5913 valsize += m->m_len;
5914 m = m->m_next;
1c79356b
A
5915 }
5916 if (m != NULL) {
5917 /* enough soopt buffer should be given from user-land */
5918 m_freem(m0);
2d21ac55 5919 return (EINVAL);
1c79356b
A
5920 }
5921 sopt->sopt_valsize = valsize;
2d21ac55 5922 return (0);
1c79356b
A
5923}
5924
9bccf70c 5925void
2d21ac55 5926sohasoutofband(struct socket *so)
9bccf70c 5927{
9bccf70c
A
5928 if (so->so_pgid < 0)
5929 gsignal(-so->so_pgid, SIGURG);
2d21ac55
A
5930 else if (so->so_pgid > 0)
5931 proc_signal(so->so_pgid, SIGURG);
9bccf70c 5932 selwakeup(&so->so_rcv.sb_sel);
39037602
A
5933 if (so->so_rcv.sb_flags & SB_KNOTE) {
5934 KNOTE(&so->so_rcv.sb_sel.si_note,
5935 (NOTE_OOB | SO_FILT_HINT_LOCKED));
5936 }
9bccf70c
A
5937}
5938
5939int
39236c6e 5940sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
9bccf70c 5941{
39236c6e 5942#pragma unused(cred)
9bccf70c
A
5943 struct proc *p = current_proc();
5944 int revents = 0;
91447636
A
5945
5946 socket_lock(so, 1);
39236c6e
A
5947 so_update_last_owner_locked(so, PROC_NULL);
5948 so_update_policy(so);
9bccf70c
A
5949
5950 if (events & (POLLIN | POLLRDNORM))
5951 if (soreadable(so))
5952 revents |= events & (POLLIN | POLLRDNORM);
5953
5954 if (events & (POLLOUT | POLLWRNORM))
5955 if (sowriteable(so))
5956 revents |= events & (POLLOUT | POLLWRNORM);
5957
5958 if (events & (POLLPRI | POLLRDBAND))
5959 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5960 revents |= events & (POLLPRI | POLLRDBAND);
5961
5962 if (revents == 0) {
5963 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2d21ac55
A
5964 /*
5965 * Darwin sets the flag first,
5966 * BSD calls selrecord first
5967 */
9bccf70c
A
5968 so->so_rcv.sb_flags |= SB_SEL;
5969 selrecord(p, &so->so_rcv.sb_sel, wql);
5970 }
5971
5972 if (events & (POLLOUT | POLLWRNORM)) {
2d21ac55
A
5973 /*
5974 * Darwin sets the flag first,
5975 * BSD calls selrecord first
5976 */
9bccf70c
A
5977 so->so_snd.sb_flags |= SB_SEL;
5978 selrecord(p, &so->so_snd.sb_sel, wql);
5979 }
5980 }
5981
91447636 5982 socket_unlock(so, 1);
9bccf70c
A
5983 return (revents);
5984}
55e303ae 5985
55e303ae 5986int
5ba3f43e
A
5987soo_kqfilter(struct fileproc *fp, struct knote *kn,
5988 struct kevent_internal_s *kev, vfs_context_t ctx)
55e303ae 5989{
39236c6e
A
5990#pragma unused(fp)
5991#if !CONFIG_MACF_SOCKET
5992#pragma unused(ctx)
5993#endif /* MAC_SOCKET */
91447636 5994 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
39037602 5995 int result;
2d21ac55 5996
91447636 5997 socket_lock(so, 1);
39236c6e
A
5998 so_update_last_owner_locked(so, PROC_NULL);
5999 so_update_policy(so);
55e303ae 6000
2d21ac55 6001#if CONFIG_MACF_SOCKET
39236c6e
A
6002 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
6003 kn, so) != 0) {
2d21ac55 6004 socket_unlock(so, 1);
39037602
A
6005 kn->kn_flags = EV_ERROR;
6006 kn->kn_data = EPERM;
6007 return 0;
2d21ac55
A
6008 }
6009#endif /* MAC_SOCKET */
6010
55e303ae
A
6011 switch (kn->kn_filter) {
6012 case EVFILT_READ:
39037602 6013 kn->kn_filtid = EVFILTID_SOREAD;
55e303ae
A
6014 break;
6015 case EVFILT_WRITE:
39037602 6016 kn->kn_filtid = EVFILTID_SOWRITE;
316670eb
A
6017 break;
6018 case EVFILT_SOCK:
39037602
A
6019 kn->kn_filtid = EVFILTID_SCK;
6020 break;
6021 case EVFILT_EXCEPT:
6022 kn->kn_filtid = EVFILTID_SOEXCEPT;
55e303ae
A
6023 break;
6024 default:
91447636 6025 socket_unlock(so, 1);
39037602
A
6026 kn->kn_flags = EV_ERROR;
6027 kn->kn_data = EINVAL;
6028 return 0;
316670eb 6029 }
55e303ae 6030
39037602
A
6031 /*
6032 * call the appropriate sub-filter attach
6033 * with the socket still locked
6034 */
5ba3f43e 6035 result = knote_fops(kn)->f_attach(kn, kev);
55e303ae 6036
91447636 6037 socket_unlock(so, 1);
39037602
A
6038
6039 return result;
55e303ae
A
6040}
6041
55e303ae 6042static int
39037602 6043filt_soread_common(struct knote *kn, struct socket *so)
55e303ae 6044{
b0d623f7 6045 if (so->so_options & SO_ACCEPTCONN) {
39037602 6046 int is_not_empty;
b0d623f7 6047
39236c6e
A
6048 /*
6049 * Radar 6615193 handle the listen case dynamically
6050 * for kqueue read filter. This allows to call listen()
6051 * after registering the kqueue EVFILT_READ.
b0d623f7
A
6052 */
6053
6054 kn->kn_data = so->so_qlen;
39037602 6055 is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
b0d623f7 6056
39037602 6057 return (is_not_empty);
b0d623f7
A
6058 }
6059
6060 /* socket isn't a listener */
3e170ce0
A
6061 /*
6062 * NOTE_LOWAT specifies new low water mark in data, i.e.
6063 * the bytes of protocol data. We therefore exclude any
6064 * control bytes.
6065 */
2d21ac55 6066 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3e170ce0 6067
39037602
A
6068 if (kn->kn_sfflags & NOTE_OOB) {
6069 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6070 kn->kn_fflags |= NOTE_OOB;
2d21ac55 6071 kn->kn_data -= so->so_oobmark;
91447636
A
6072 return (1);
6073 }
04b8595b 6074 }
3e170ce0 6075
04b8595b 6076 if ((so->so_state & SS_CANTRCVMORE)
fe8ab488 6077#if CONTENT_FILTER
04b8595b 6078 && cfil_sock_data_pending(&so->so_rcv) == 0
fe8ab488 6079#endif /* CONTENT_FILTER */
04b8595b
A
6080 ) {
6081 kn->kn_flags |= EV_EOF;
6082 kn->kn_fflags = so->so_error;
04b8595b 6083 return (1);
91447636
A
6084 }
6085
6086 if (so->so_error) { /* temporary udp error */
55e303ae 6087 return (1);
91447636
A
6088 }
6089
6d2010ae 6090 int64_t lowwat = so->so_rcv.sb_lowat;
3e170ce0
A
6091 /*
6092 * Ensure that when NOTE_LOWAT is used, the derived
6093 * low water mark is bounded by socket's rcv buf's
6094 * high and low water mark values.
6095 */
39236c6e 6096 if (kn->kn_sfflags & NOTE_LOWAT) {
6d2010ae
A
6097 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
6098 lowwat = so->so_rcv.sb_hiwat;
6099 else if (kn->kn_sdata > lowwat)
6100 lowwat = kn->kn_sdata;
6101 }
39236c6e 6102
3e170ce0
A
6103 /*
6104 * The order below is important. Since NOTE_LOWAT
6105 * overrides sb_lowat, check for NOTE_LOWAT case
6106 * first.
6107 */
6108 if (kn->kn_sfflags & NOTE_LOWAT)
6109 return (kn->kn_data >= lowwat);
6110
6111 return (so->so_rcv.sb_cc >= lowwat);
55e303ae
A
6112}
6113
/*
 * filt_sorattach: attach an EVFILT_READ knote to a socket.
 * Caller (soo_kqfilter) holds the socket lock.  Returns non-zero
 * if the event is already satisfied at attach time.
 */
static int
filt_sorattach(struct knote *kn, __unused struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	/* socket locked */

	/*
	 * If the caller explicitly asked for OOB results (e.g. poll())
	 * from EVFILT_READ, then save that off in the hookid field
	 * and reserve the kn_flags EV_OOBAND bit for output only.
	 */
	if (kn->kn_filter == EVFILT_READ &&
	    kn->kn_flags & EV_OOBAND) {
		kn->kn_flags &= ~EV_OOBAND;
		kn->kn_hookid = EV_OOBAND;
	} else {
		kn->kn_hookid = 0;
	}
	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
		so->so_rcv.sb_flags |= SB_KNOTE;

	/* indicate if event is already fired */
	return filt_soread_common(kn, so);
}
6139
55e303ae 6140static void
39037602 6141filt_sordetach(struct knote *kn)
55e303ae 6142{
91447636 6143 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
39037602 6144
91447636 6145 socket_lock(so, 1);
39037602
A
6146 if (so->so_rcv.sb_flags & SB_KNOTE)
6147 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
6148 so->so_rcv.sb_flags &= ~SB_KNOTE;
6149 socket_unlock(so, 1);
6150}
6151
6152/*ARGSUSED*/
6153static int
6154filt_soread(struct knote *kn, long hint)
6155{
6156 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6157 int retval;
6158
6159 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6160 socket_lock(so, 1);
6161
6162 retval = filt_soread_common(kn, so);
6163
6164 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6165 socket_unlock(so, 1);
6166
6167 return retval;
6168}
6169
/*
 * filt_sortouch: process an EV_ADD/kevent update against an existing
 * EVFILT_READ knote.  Stores the new saved fflags/data and re-evaluates
 * the read predicate under the socket lock.
 */
static int
filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int retval;

	socket_lock(so, 1);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;
	if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
		kn->kn_udata = kev->udata;

	/* determine if changes result in fired events */
	retval = filt_soread_common(kn, so);

	socket_unlock(so, 1);

	return retval;
}
6191
/*
 * filt_sorprocess: deliver an EVFILT_READ event.  Re-checks the
 * predicate under the socket lock; on fire, snapshots the kevent for
 * the caller and, for EV_CLEAR knotes, resets the latched state.
 */
static int
filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
{
#pragma unused(data)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int retval;

	socket_lock(so, 1);
	retval = filt_soread_common(kn, so);
	if (retval) {
		*kev = kn->kn_kevent;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_fflags = 0;
			kn->kn_data = 0;
		}
	}
	socket_unlock(so, 1);

	return retval;
}
6212
316670eb
A
6213int
6214so_wait_for_if_feedback(struct socket *so)
6215{
39236c6e 6216 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
316670eb
A
6217 (so->so_state & SS_ISCONNECTED)) {
6218 struct inpcb *inp = sotoinpcb(so);
6219 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
6220 return (1);
6221 }
6222 return (0);
6223}
6224
55e303ae 6225static int
39037602 6226filt_sowrite_common(struct knote *kn, struct socket *so)
55e303ae 6227{
316670eb 6228 int ret = 0;
91447636 6229
55e303ae
A
6230 kn->kn_data = sbspace(&so->so_snd);
6231 if (so->so_state & SS_CANTSENDMORE) {
2d21ac55 6232 kn->kn_flags |= EV_EOF;
55e303ae 6233 kn->kn_fflags = so->so_error;
39037602 6234 return 1;
55e303ae 6235 }
91447636 6236 if (so->so_error) { /* temporary udp error */
39037602 6237 return 1;
91447636 6238 }
3e170ce0 6239 if (!socanwrite(so)) {
39037602 6240 return 0;
91447636 6241 }
3e170ce0 6242 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
39037602 6243 return 1;
3e170ce0 6244 }
6d2010ae 6245 int64_t lowwat = so->so_snd.sb_lowat;
39236c6e 6246 if (kn->kn_sfflags & NOTE_LOWAT) {
6d2010ae
A
6247 if (kn->kn_sdata > so->so_snd.sb_hiwat)
6248 lowwat = so->so_snd.sb_hiwat;
6249 else if (kn->kn_sdata > lowwat)
6250 lowwat = kn->kn_sdata;
6251 }
316670eb 6252 if (kn->kn_data >= lowwat) {
39037602
A
6253 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6254#if (DEBUG || DEVELOPMENT)
6255 && so_notsent_lowat_check == 1
6256#endif /* DEBUG || DEVELOPMENT */
6257 ) {
6258 if ((SOCK_DOM(so) == PF_INET ||
6259 SOCK_DOM(so) == PF_INET6) &&
6260 so->so_type == SOCK_STREAM) {
fe8ab488
A
6261 ret = tcp_notsent_lowat_check(so);
6262 }
6263#if MPTCP
6264 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6265 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6266 ret = mptcp_notsent_lowat_check(so);
6267 }
6268#endif
6269 else {
39037602 6270 return 1;
fe8ab488 6271 }
316670eb
A
6272 } else {
6273 ret = 1;
6274 }
6275 }
6276 if (so_wait_for_if_feedback(so))
6277 ret = 0;
39236c6e 6278 return (ret);
316670eb
A
6279}
6280
/*
 * filt_sowattach: attach an EVFILT_WRITE knote to the send buffer.
 * Caller holds the socket lock.  Returns non-zero if already writable.
 */
static int
filt_sowattach(struct knote *kn, __unused struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	/* socket locked */
	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
		so->so_snd.sb_flags |= SB_KNOTE;

	/* determine if its already fired */
	return filt_sowrite_common(kn, so);
}
6293
316670eb 6294static void
39037602 6295filt_sowdetach(struct knote *kn)
316670eb
A
6296{
6297 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6298 socket_lock(so, 1);
39236c6e 6299
39037602
A
6300 if (so->so_snd.sb_flags & SB_KNOTE)
6301 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
6302 so->so_snd.sb_flags &= ~SB_KNOTE;
316670eb
A
6303 socket_unlock(so, 1);
6304}
6305
39037602 6306/*ARGSUSED*/
316670eb 6307static int
39037602 6308filt_sowrite(struct knote *kn, long hint)
316670eb 6309{
316670eb 6310 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
39037602 6311 int ret;
316670eb 6312
39037602 6313 if ((hint & SO_FILT_HINT_LOCKED) == 0)
316670eb 6314 socket_lock(so, 1);
39037602
A
6315
6316 ret = filt_sowrite_common(kn, so);
6317
6318 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6319 socket_unlock(so, 1);
6320
6321 return ret;
6322}
6323
/*
 * filt_sowtouch: process a kevent update against an existing
 * EVFILT_WRITE knote; re-evaluates the write predicate under the lock.
 */
static int
filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret;

	socket_lock(so, 1);

	/*save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;
	if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
		kn->kn_udata = kev->udata;

	/* determine if these changes result in a triggered event */
	ret = filt_sowrite_common(kn, so);

	socket_unlock(so, 1);

	return ret;
}
6345
/*
 * filt_sowprocess: deliver an EVFILT_WRITE event; snapshots the kevent
 * on fire and resets latched state for EV_CLEAR knotes.
 */
static int
filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
{
#pragma unused(data)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret;

	socket_lock(so, 1);
	ret = filt_sowrite_common(kn, so);
	if (ret) {
		*kev = kn->kn_kevent;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_fflags = 0;
			kn->kn_data = 0;
		}
	}
	socket_unlock(so, 1);
	return ret;
}
6365
/*
 * filt_sockev_common: shared EVFILT_SOCK predicate.
 *
 * Called with the socket locked.  Translates edge hints (ev_hint) and
 * current socket state into kn_fflags bits.  Level-triggered bits
 * (connected/disconnected/read-closed/write-closed/suspend/resume) are
 * tracked in kn_hookid so each is delivered at most once while active;
 * see filt_sockprocess which records delivery there.
 */
static int
filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
{
	int ret = 0;
	uint32_t level_trigger = 0;

	/* edge-triggered hints map 1:1 onto NOTE_* bits */
	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	/* level-triggered: hint OR current state can raise these */
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO))
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
	}

	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	/* suspend and resume are mutually exclusive; latest wins */
	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hookid &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hookid &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	if (so->so_error != 0) {
		/* pending error always fires and is reported in kn_data */
		ret = 1;
		kn->kn_data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level triggerred events that are already delivered */
	level_trigger &= kn->kn_hookid;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level triggerred events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0)
		ret = 1;

	return (ret);
}
6472
/*
 * filt_sockattach: attach an EVFILT_SOCK knote to the socket's klist.
 * Caller holds the socket lock.  kn_hookid (delivered level-trigger
 * state) starts empty.
 */
static int
filt_sockattach(struct knote *kn, __unused struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	/* socket locked */
	kn->kn_hookid = 0;
	if (KNOTE_ATTACH(&so->so_klist, kn))
		so->so_flags |= SOF_KNOTE;

	/* determine if event already fired */
	return filt_sockev_common(kn, so, 0);
}
6486
3e170ce0 6487static void
39037602 6488filt_sockdetach(struct knote *kn)
3e170ce0 6489{
39037602
A
6490 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6491 socket_lock(so, 1);
3e170ce0 6492
39037602
A
6493 if ((so->so_flags & SOF_KNOTE) != 0)
6494 if (KNOTE_DETACH(&so->so_klist, kn))
6495 so->so_flags &= ~SOF_KNOTE;
6496 socket_unlock(so, 1);
6497}
6498
6499static int
6500filt_sockev(struct knote *kn, long hint)
6501{
6502 int ret = 0, locked = 0;
6503 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6504 long ev_hint = (hint & SO_FILT_HINT_EV);
6505
6506 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6507 socket_lock(so, 1);
6508 locked = 1;
3e170ce0 6509 }
39037602
A
6510
6511 ret = filt_sockev_common(kn, so, ev_hint);
6512
6513 if (locked)
6514 socket_unlock(so, 1);
6515
6516 return ret;
6517}
6518
6519
6520
/*
 * filt_socktouch - update event state
 *
 * Applies a kevent update to an EVFILT_SOCK knote under the socket
 * lock.  Level-trigger delivery state (kn_hookid) is cleared for any
 * bits whose interest changed, so they can fire again.
 */
static int
filt_socktouch(
	struct knote *kn,
	struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	uint32_t changed_flags;
	int ret;

	socket_lock(so, 1);

	/* save off the [result] data and fflags */
	changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;
	if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
		kn->kn_udata = kev->udata;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kev->fflags;

	/*
	 * Since we keep track of events that are already
	 * delivered, if any of those events are not requested
	 * anymore the state related to them can be reset
	 */
	kn->kn_hookid &=
	    ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);

	/* determine if we have events to deliver */
	ret = filt_sockev_common(kn, so, 0);

	socket_unlock(so, 1);

	return ret;
}
6566
/*
 * filt_sockprocess - query event fired state and return data
 *
 * Evaluates the EVFILT_SOCK predicate under the socket lock; on fire,
 * snapshots the kevent and records which level-triggered bits were
 * delivered (in kn_hookid) so they are not re-delivered while active.
 */
static int
filt_sockprocess(
	struct knote *kn,
	struct filt_process_s *data,
	struct kevent_internal_s *kev)
{
#pragma unused(data)

	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret = 0;

	socket_lock(so, 1);

	ret = filt_sockev_common(kn, so, 0);
	if (ret) {
		*kev = kn->kn_kevent;

		/*
		 * Store the state of the events being delivered. This
		 * state can be used to deliver level triggered events
		 * ateast once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0)
			kn->kn_hookid |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
		 * only one of them and remember the last one that was
		 * delivered last
		 */
		if (kn->kn_fflags & NOTE_SUSPEND)
			kn->kn_hookid &= ~NOTE_RESUME;
		if (kn->kn_fflags & NOTE_RESUME)
			kn->kn_hookid &= ~NOTE_SUSPEND;

		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
	}

	socket_unlock(so, 1);

	return ret;
}
6617
316670eb 6618void
39236c6e
A
6619get_sockev_state(struct socket *so, u_int32_t *statep)
6620{
316670eb
A
6621 u_int32_t state = *(statep);
6622
39037602
A
6623 /*
6624 * If the state variable is already used by a previous event,
6625 * reset it.
6626 */
6627 if (state != 0)
6628 return;
6629
39236c6e 6630 if (so->so_state & SS_ISCONNECTED)
316670eb 6631 state |= SOCKEV_CONNECTED;
39236c6e 6632 else
316670eb 6633 state &= ~(SOCKEV_CONNECTED);
39236c6e 6634 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
316670eb 6635 *(statep) = state;
55e303ae
A
6636}
6637
39236c6e
A
/*
 * Worst-case length of the formatted lock history: SO_LCKDBG_MAX
 * "lockaddr:unlockaddr " pairs ("0x" + hex pointer, twice, plus
 * separators) and a trailing NUL.
 */
#define	SO_LOCK_HISTORY_STR_LEN \
	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

/*
 * solockhistory_nr: format the socket's recorded lock/unlock return
 * addresses (most recent first) for panic/debug messages.
 *
 * Returns a pointer to a static buffer; not reentrant and must not be
 * cached across calls.
 */
__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof (lock_history_str));
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		/* index relative to next_*_lr so entries come out oldest-last */
		n += snprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return (lock_history_str);
}
6657
5ba3f43e 6658void
2d21ac55 6659socket_lock(struct socket *so, int refcount)
91447636 6660{
b0d623f7 6661 void *lr_saved;
0c530ab8 6662
b0d623f7 6663 lr_saved = __builtin_return_address(0);
91447636
A
6664
6665 if (so->so_proto->pr_lock) {
5ba3f43e 6666 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
2d21ac55 6667 } else {
91447636 6668#ifdef MORE_LOCKING_DEBUG
5ba3f43e 6669 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
2d21ac55 6670 LCK_MTX_ASSERT_NOTOWNED);
91447636
A
6671#endif
6672 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6673 if (refcount)
6674 so->so_usecount++;
b0d623f7 6675 so->lock_lr[so->next_lock_lr] = lr_saved;
0c530ab8 6676 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
91447636 6677 }
5ba3f43e 6678}
91447636 6679
5ba3f43e
A
6680void
6681socket_lock_assert_owned(struct socket *so)
6682{
6683 lck_mtx_t *mutex_held;
6684
6685 if (so->so_proto->pr_getlock != NULL)
6686 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6687 else
6688 mutex_held = so->so_proto->pr_domain->dom_mtx;
6689
6690 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
91447636
A
6691}
6692
6693int
5ba3f43e
A
6694socket_try_lock(struct socket *so)
6695{
6696 lck_mtx_t *mtx;
6697
6698 if (so->so_proto->pr_getlock != NULL)
6699 mtx = (*so->so_proto->pr_getlock)(so, 0);
6700 else
6701 mtx = so->so_proto->pr_domain->dom_mtx;
6702
6703 return (lck_mtx_try_lock(mtx));
6704}
6705
6706void
2d21ac55 6707socket_unlock(struct socket *so, int refcount)
91447636 6708{
b0d623f7 6709 void *lr_saved;
2d21ac55 6710 lck_mtx_t *mutex_held;
91447636 6711
b0d623f7 6712 lr_saved = __builtin_return_address(0);
91447636 6713
39236c6e
A
6714 if (so->so_proto == NULL) {
6715 panic("%s: null so_proto so=%p\n", __func__, so);
6716 /* NOTREACHED */
6717 }
91447636 6718
2d21ac55 6719 if (so && so->so_proto->pr_unlock) {
5ba3f43e 6720 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
2d21ac55 6721 } else {
91447636
A
6722 mutex_held = so->so_proto->pr_domain->dom_mtx;
6723#ifdef MORE_LOCKING_DEBUG
5ba3f43e 6724 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
91447636 6725#endif
b0d623f7 6726 so->unlock_lr[so->next_unlock_lr] = lr_saved;
0c530ab8
A
6727 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6728
91447636 6729 if (refcount) {
39236c6e
A
6730 if (so->so_usecount <= 0) {
6731 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6732 "lrh=%s", __func__, so->so_usecount, so,
6733 SOCK_DOM(so), so->so_type,
6734 SOCK_PROTO(so), solockhistory_nr(so));
6735 /* NOTREACHED */
6736 }
6737
91447636 6738 so->so_usecount--;
39236c6e 6739 if (so->so_usecount == 0)
91447636 6740 sofreelastref(so, 1);
91447636
A
6741 }
6742 lck_mtx_unlock(mutex_held);
6743 }
91447636 6744}
2d21ac55
A
6745
/* Called with socket locked, will unlock socket */
/*
 * sofree: release the last reference on a socket.  Asserts that the
 * caller holds the socket's mutex, then hands off to sofreelastref()
 * (dealloc=0), which performs the actual teardown and unlock.
 */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}
6760
/*
 * soreference: take one use-count reference on the socket.
 * Implemented as lock-with-refcount followed by unlock-without, so the
 * net effect is +1 on so_usecount.
 */
void
soreference(struct socket *so)
{
	socket_lock(so, 1);	/* locks & take one reference on socket */
	socket_unlock(so, 0);	/* unlock only */
}
6767
/*
 * sodereference: drop one use-count reference on the socket (inverse
 * of soreference); lock without taking a reference, unlock dropping one.
 */
void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}
2d21ac55
A
6774
6775/*
6776 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6777 * possibility of using jumbo clusters. Caller must ensure to hold
6778 * the socket lock.
6779 */
6780void
6781somultipages(struct socket *so, boolean_t set)
6782{
6783 if (set)
6784 so->so_flags |= SOF_MULTIPAGES;
6785 else
6786 so->so_flags &= ~SOF_MULTIPAGES;
6787}
b0d623f7 6788
fe8ab488
A
6789void
6790soif2kcl(struct socket *so, boolean_t set)
6791{
6792 if (set)
6793 so->so_flags1 |= SOF1_IF_2KCL;
6794 else
6795 so->so_flags1 &= ~SOF1_IF_2KCL;
6796}
6797
b0d623f7
A
6798int
6799so_isdstlocal(struct socket *so) {
6800
6801 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6802
39236c6e
A
6803 if (SOCK_DOM(so) == PF_INET)
6804 return (inaddr_local(inp->inp_faddr));
6805 else if (SOCK_DOM(so) == PF_INET6)
6806 return (in6addr_local(&inp->in6p_faddr));
6807
6808 return (0);
b0d623f7 6809}
6d2010ae
A
6810
/*
 * sosetdefunct: mark a socket defunct (first phase of resource
 * reclamation).  Sets SOF_DEFUNCT and SB_DROP on both socket buffers
 * and flushes buffered data.  Honors SOF_NODEFUNCT (refused unless
 * forced) and the extended-background-idle opt-in, which defers the
 * defunct via the lazy inpcb timer and returns EOPNOTSUPP.
 * Caller holds the socket lock.
 */
int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	/* already defunct: both buffers must already be dropping data */
	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "is not eligible for defunct "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return (err);
		}
		/* forced: strip the opt-out and proceed */
		so->so_flags &= ~SOF_NODEFUNCT;
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] defunct by force\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		/* extended bk-idle denied on cellular/delegated/untimed */
		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			/* defer the defunct; lazy timer will finish it */
			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
			    "level %d) extend bk idle so 0x%llx rcv hw %d "
			    "cc %d\n",
			    __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
			return (err);
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
	    "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
	    proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
	    level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
	    SOCK_TYPE(so), defunct ? "is already" : "marked as",
	    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");

	return (err);
}
6911
/*
 * sodefunct: second phase of defuncting — must follow a successful
 * sosetdefunct() (panics if SOF_DEFUNCT is not set).  Wakes blocked
 * threads, shuts down both directions, disconnects, flushes both
 * buffers, and marks the socket SS_DEFUNCT.  Idempotent once
 * SS_DEFUNCT is set.  Caller holds the socket lock.
 */
int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT)
		goto done;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
		    "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
		    inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
		    (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
		    s, sizeof (s)), ntohs(inp->in6p_lport),
		    inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
		    (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
		    d, sizeof (d)), ntohs(inp->in6p_fport),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags,
		    rcv->sb_flags, snd->sb_flags);
	} else {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK)
		sbunlock(rcv, TRUE);	/* keep socket locked */
	if (snd->sb_flags & SB_LOCK)
		sbunlock(snd, TRUE);	/* keep socket locked */

	/*
	 * Flush the buffers and disconnect. We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket. This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_state & SS_ISDISCONNECTED))
		(void) soisdisconnected(so);

	if (so->so_error == 0)
		so->so_error = EBADF;

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return (0);
}
316670eb 7009
3e170ce0
A
/*
 * soresume: take a socket out of extended-background-idle state
 * (clears SOF1_EXTEND_BK_IDLE_INPROG, the per-proc P_LXBKIDLEINPROG
 * bit, and updates the global counters).  'locked' non-zero means the
 * caller already holds the socket lock.  Always returns 0.
 */
int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0)
		socket_lock(so, 1);

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0)
		socket_unlock(so, 1);

	return (0);
}
7037
7038/*
7039 * Does not attempt to account for sockets that are delegated from
7040 * the current process
7041 */
7042int
7043so_set_extended_bk_idle(struct socket *so, int optval)
7044{
7045 int error = 0;
7046
7047 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7048 SOCK_PROTO(so) != IPPROTO_TCP) {
7049 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7050 error = EOPNOTSUPP;
7051 } else if (optval == 0) {
7052 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7053
7054 soresume(current_proc(), so, 1);
7055 } else {
7056 struct proc *p = current_proc();
7057 int i;
7058 struct filedesc *fdp;
7059 int count = 0;
7060
5ba3f43e
A
7061 /*
7062 * Unlock socket to avoid lock ordering issue with
7063 * the proc fd table lock
7064 */
7065 socket_unlock(so, 0);
7066
3e170ce0
A
7067 proc_fdlock(p);
7068
7069 fdp = p->p_fd;
7070 for (i = 0; i < fdp->fd_nfiles; i++) {
7071 struct fileproc *fp = fdp->fd_ofiles[i];
7072 struct socket *so2;
7073
7074 if (fp == NULL ||
7075 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7076 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7077 continue;
7078
7079 so2 = (struct socket *)fp->f_fglob->fg_data;
7080 if (so != so2 &&
7081 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
7082 count++;
7083 if (count >= soextbkidlestat.so_xbkidle_maxperproc)
7084 break;
7085 }
5ba3f43e
A
7086 proc_fdunlock(p);
7087
7088 socket_lock(so, 0);
7089
3e170ce0
A
7090 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7091 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7092 error = EBUSY;
7093 } else if (so->so_flags & SOF_DELEGATED) {
7094 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7095 error = EBUSY;
7096 } else {
7097 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7098 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7099 }
39037602 7100 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
3e170ce0 7101 "%s marked for extended bk idle\n",
39037602 7102 __func__, proc_selfpid(), proc_best_name(current_proc()),
3e170ce0
A
7103 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7104 SOCK_DOM(so), SOCK_TYPE(so),
7105 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
39037602 7106 "is" : "not");
3e170ce0
A
7107 }
7108
7109 return (error);
7110}
7111
/*
 * Terminate a socket's extended background idle period and force the
 * socket to be defuncted.  Called with the socket lock held.
 */
static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	/* sosetdefunct() may decline; only defunct if the flag stuck */
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}
7130
7131void
7132so_drain_extended_bk_idle(struct socket *so)
7133{
7134 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7135 /*
7136 * Only penalize sockets that have outstanding data
7137 */
7138 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7139 so_stop_extended_bk_idle(so);
7140
7141 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7142 }
7143 }
7144}
7145
/*
 * Check whether a socket's extended background idle grace period has
 * elapsed; if so the socket is force-defuncted, otherwise a lazy INPCB
 * timer re-check is scheduled.
 *
 * Return values tells if socket is still in extended background idle:
 * 1 if still extended (or not in that state), 0 if the period expired.
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			/* Grace period over: defunct and report expiry */
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			/* Not yet expired; arrange another check later */
			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return (ret);
}
7176
/*
 * Resume every socket of process 'p' that is in extended background
 * idle state, then clear the process's P_LXBKIDLEINPROG advisory flag.
 * No-op if the flag is not set.
 */
void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct filedesc *fdp;
		int i;

		/* Walk the file table looking for socket descriptors */
		proc_fdlock(p);
		fdp = p->p_fd;
		for (i = 0; i < fdp->fd_nfiles; i++) {
			struct fileproc *fp;
			struct socket *so;

			fp = fdp->fd_ofiles[i];
			if (fp == NULL ||
			    (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
			    FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
				continue;

			so = (struct socket *)fp->f_fglob->fg_data;
			/* locked == 0: soresume() takes the socket lock */
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}
7204
316670eb
A
7205__private_extern__ int
7206so_set_recv_anyif(struct socket *so, int optval)
7207{
7208 int ret = 0;
7209
7210#if INET6
39236c6e 7211 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
316670eb 7212#else
39236c6e 7213 if (SOCK_DOM(so) == PF_INET) {
316670eb
A
7214#endif /* !INET6 */
7215 if (optval)
7216 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7217 else
7218 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
316670eb
A
7219 }
7220
5ba3f43e 7221
316670eb
A
7222 return (ret);
7223}
7224
7225__private_extern__ int
7226so_get_recv_anyif(struct socket *so)
7227{
7228 int ret = 0;
7229
7230#if INET6
39236c6e 7231 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
316670eb 7232#else
39236c6e 7233 if (SOCK_DOM(so) == PF_INET) {
316670eb
A
7234#endif /* !INET6 */
7235 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7236 }
7237
7238 return (ret);
7239}
39236c6e
A
7240
/*
 * Apply deny-type restrictions (in/out/cellular/expensive) from 'vals'
 * to the socket, propagating cellular/expensive denials down to the
 * INPCB (and to MPTCP for multipath sockets).  Always returns 0.
 */
int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions overrides any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precendence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0)
		return (0);
#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
	}

	/* Let the MPTCP layer pick up the new restrictions as well */
	if (SOCK_DOM(so) == PF_MULTIPATH)
		mptcp_set_restrictions(so);

	return (0);
}
7294
7295uint32_t
7296so_get_restrictions(struct socket *so)
7297{
7298 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
3e170ce0 7299 SO_RESTRICT_DENY_OUT |
fe8ab488 7300 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
39236c6e
A
7301}
7302
39236c6e
A
/*
 * Associate the socket with the effective (delegate) process 'epid'.
 * On success the socket records the delegate's pid/upid/uuid and sets
 * SOF_DELEGATED; delegating to the issuing process itself clears the
 * association.  'p' is the process issuing the socket option, which
 * may be kernproc.  The socket's NECP/IO policy is refreshed on success.
 *
 * Returns: 0 on success; EINVAL, EACCES or ESRCH on failure.
 */
int
so_set_effective_pid(struct socket *so, int epid, struct proc *p)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the pid is the same as
	 * the process's own pid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): with '||' the privilege check is skipped only
	 * when epid matches BOTH last_pid and the issuer's own pid,
	 * which is stricter than the comment above implies — confirm
	 * this is intentional before changing it.
	 */
	if (epid != so->last_pid || epid != proc_pid(p)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
	}

done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL)
		proc_rele(ep);

	return (error);
}
7400
/*
 * Associate the socket with an effective (delegate) process identified
 * only by its executable UUID.  Since the delegate pid is unknown, the
 * socket inherits its real {pid,upid} as the effective pair.  Passing
 * the issuer's own UUID clears the delegation.  'p' is the process
 * issuing the socket option (possibly kernproc).
 *
 * Returns: 0 on success; EINVAL or EACCES on failure.
 */
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof (uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 *
	 * NOTE(review): as with so_set_effective_pid(), the '||' means
	 * the privilege check is skipped only when euuid matches BOTH
	 * last_uuid and the issuer's own uuid — stricter than the
	 * comment implies; confirm intent before changing.
	 */
	if (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}

done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return (error);
}
7497
7498void
7499netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7500 uint32_t ev_datalen)
7501{
7502 struct kev_msg ev_msg;
7503
7504 /*
7505 * A netpolicy event always starts with a netpolicy_event_data
7506 * structure, but the caller can provide for a longer event
7507 * structure to post, depending on the event code.
7508 */
7509 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7510
7511 bzero(&ev_msg, sizeof (ev_msg));
7512 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7513 ev_msg.kev_class = KEV_NETWORK_CLASS;
7514 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7515 ev_msg.event_code = ev_code;
7516
7517 ev_msg.dv[0].data_ptr = ev_data;
7518 ev_msg.dv[0].data_length = ev_datalen;
7519
7520 kev_post_msg(&ev_msg);
7521}
fe8ab488
A
7522
7523void
3e170ce0 7524socket_post_kev_msg(uint32_t ev_code,
fe8ab488
A
7525 struct kev_socket_event_data *ev_data,
7526 uint32_t ev_datalen)
7527{
7528 struct kev_msg ev_msg;
7529
7530 bzero(&ev_msg, sizeof(ev_msg));
7531 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7532 ev_msg.kev_class = KEV_NETWORK_CLASS;
7533 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7534 ev_msg.event_code = ev_code;
7535
7536 ev_msg.dv[0].data_ptr = ev_data;
7537 ev_msg.dv[0]. data_length = ev_datalen;
7538
7539 kev_post_msg(&ev_msg);
7540}
7541
7542void
7543socket_post_kev_msg_closed(struct socket *so)
7544{
7545 struct kev_socket_closed ev;
7546 struct sockaddr *socksa = NULL, *peersa = NULL;
7547 int err;
7548 bzero(&ev, sizeof(ev));
7549 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7550 if (err == 0) {
7551 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7552 &peersa);
7553 if (err == 0) {
7554 memcpy(&ev.ev_data.kev_sockname, socksa,
7555 min(socksa->sa_len,
7556 sizeof (ev.ev_data.kev_sockname)));
7557 memcpy(&ev.ev_data.kev_peername, peersa,
7558 min(peersa->sa_len,
7559 sizeof (ev.ev_data.kev_peername)));
7560 socket_post_kev_msg(KEV_SOCKET_CLOSED,
3e170ce0 7561 &ev.ev_data, sizeof (ev));
fe8ab488
A
7562 }
7563 }
7564 if (socksa != NULL)
7565 FREE(socksa, M_SONAME);
7566 if (peersa != NULL)
7567 FREE(peersa, M_SONAME);
7568}