/*
 * Copyright (c) 1998-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/uio_internal.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/ntstat.h>
#include <net/content_filter.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/tcp_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <kern/policy_internal.h>

#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>
#include <sys/unpcb.h>

#if CONFIG_MACF
#include <security/mac.h>
#include <security/mac_framework.h>
#endif /* MAC */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#include <netinet/mptcp_var.h>
#endif /* MULTIPATH */

#define	ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
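/*
 * Note: ROUNDUP() only works when "b" is a power of two.  For example,
 * ROUNDUP(5, 4) == (5 + 3) & ~3 == 8, while ROUNDUP(8, 4) == 8.
 */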

#if DEBUG || DEVELOPMENT
#define	DEBUG_KERNEL_ADDRPERM(_v) (_v)
#else
#define	DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
#endif

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);
extern char *proc_best_name(proc_t);

static u_int32_t so_cache_hw;		/* High water mark for socache */
static u_int32_t so_cache_timeouts;	/* number of timeouts */
static u_int32_t so_cache_max_freed;	/* max freed per timeout */
static u_int32_t cached_sock_count = 0;
STAILQ_HEAD(, socket) so_cache_head;
int max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t so_cache_time;
static int socketinit_done;
static struct zone *so_cache_zone;

static lck_grp_t *so_cache_mtx_grp;
static lck_attr_t *so_cache_mtx_attr;
static lck_grp_attr_t *so_cache_mtx_grp_attr;
static lck_mtx_t *so_cache_mtx;

#include <machine/limits.h>

static int filt_sorattach(struct knote *kn);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
static int filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);

static int filt_sowattach(struct knote *kn);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
static int filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);

static int filt_sockattach(struct knote *kn);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
static int filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sowattach,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
	.f_touch = filt_sowtouch,
	.f_process = filt_sowprocess,
};

struct filterops sock_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sockattach,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
	.f_touch = filt_socktouch,
	.f_process = filt_sockprocess,
};

struct filterops soexcept_filtops = {
	.f_isfd = 1,
	.f_attach = filt_sorattach,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
	.f_touch = filt_sortouch,
	.f_process = filt_sorprocess,
};

SYSCTL_DECL(_kern_ipc);

#define	EVEN_MORE_LOCKING_DEBUG	0

int socket_debug = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
	CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");

static unsigned long sodefunct_calls = 0;
SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
	&sodefunct_calls, "");

static int socket_zone = M_SOCKET;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define	DBG_FNC_SOSEND_LIST	NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define	DBG_FNC_SORECEIVE_LIST	NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
	CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
	CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending down packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

/*
 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
 * writes on the socket for all protocols on any network interfaces.
 * Be extra careful when setting this to 1, because sending down packets
 * with clusters larger than 2 KB might lead to system panics or data
 * corruption.  When set to 0, the system will respect SOF1_IF_2KCL,
 * which is set on the outgoing interface.
 * Set this to 1 for testing/debugging purposes only.
 */
int sosendbigcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
	CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sorestrictrecv, 0, "Enable inbound interface restrictions");

int sorestrictsend = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sorestrictsend, 0, "Enable outbound interface restrictions");

int soreserveheadroom = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
	&soreserveheadroom, 0, "To allocate contiguous datagram buffers");

#if (DEBUG || DEVELOPMENT)
int so_notsent_lowat_check = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED,
	&so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
#endif /* DEBUG || DEVELOPMENT */

extern struct inpcbinfo tcbinfo;

/* TODO: these should be in header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

static unsigned int sl_zone_size;	/* size of sockaddr_list */
static struct zone *sl_zone;		/* zone for sockaddr_list */

static unsigned int se_zone_size;	/* size of sockaddr_entry */
static struct zone *se_zone;		/* zone for sockaddr_entry */

vm_size_t so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);

/*
 * Maximum of extended background idle sockets per process
 * Set to zero to disable further setting of the option
 */

#define	SO_IDLE_BK_IDLE_MAX_PER_PROC	1
#define	SO_IDLE_BK_IDLE_TIME		600
#define	SO_IDLE_BK_IDLE_RCV_HIWAT	131072

struct soextbkidlestat soextbkidlestat;

SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
	CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
	"Maximum of extended background idle sockets per process");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
	&soextbkidlestat.so_xbkidle_time, 0,
	"Time in seconds to keep extended background idle sockets");

SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
	&soextbkidlestat.so_xbkidle_rcvhiwat, 0,
	"High water mark for extended background idle sockets");

SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
	&soextbkidlestat, soextbkidlestat, "");

int so_set_extended_bk_idle(struct socket *, int);

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
	&sotcdb, 0, "");

void
socketinit(void)
{
	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));

#ifdef __LP64__
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
#else
	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
#endif

	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof (socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
	if (so_cache_mtx == NULL) {
		panic("%s: unable to allocate so_cache_mtx\n", __func__);
		/* NOTREACHED */
	}
	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zinit(so_cache_zone_element_size,
	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

	sl_zone_size = sizeof (struct sockaddr_list);
	if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
	    "sockaddr_list")) == NULL) {
		panic("%s: unable to allocate sockaddr_list zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(sl_zone, Z_CALLERACCT, FALSE);
	zone_change(sl_zone, Z_EXPAND, TRUE);

	se_zone_size = sizeof (struct sockaddr_entry);
	if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
	    "sockaddr_entry")) == NULL) {
		panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(se_zone, Z_CALLERACCT, FALSE);
	zone_change(se_zone, Z_EXPAND, TRUE);

	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;

	in_pcbinit();
	sflt_init();
	socket_tclass_init();
#if MULTIPATH
	mp_pcbinit();
#endif /* MULTIPATH */
}

static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t temp;
	uintptr_t offset;

	lck_mtx_lock(so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof (struct socket));

		(*so)->so_saved_pcb = temp;
	} else {

		lck_mtx_unlock(so_cache_mtx);

		if (waitok)
			*so = (struct socket *)zalloc(so_cache_zone);
		else
			*so = (struct socket *)zalloc_noblock(so_cache_zone);

		if (*so == NULL)
			return;

		bzero((caddr_t)*so, sizeof (struct socket));

		/*
		 * Define offsets for extra structures into our
		 * single block of memory.  Align extra structures
		 * on longword boundaries.
		 */
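		/*
		 * Illustrative layout of the resulting cache element
		 * (sizes follow so_cache_zone_element_size computed in
		 * socketinit(); each pad is at most 4 bytes of alignment):
		 *
		 *	+---------------+-----+---------------+-----+---------------+
		 *	| struct socket | pad | inpcb storage | pad | tcpcb storage |
		 *	+---------------+-----+---------------+-----+---------------+
		 *	^ *so                 ^ so_saved_pcb        ^ inp_saved_ppcb
		 */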

		offset = (uintptr_t)*so;
		offset += sizeof (struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
}

static void
cached_sock_free(struct socket *so)
{

	lck_mtx_lock(so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count)
			so_cache_hw = cached_sock_count;

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(so_cache_mtx);
	}
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket.  The check above achieves that
		 */
		if (self == PROC_NULL)
			self = current_proc();

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof (so->last_uuid));
		}
		proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
	}
}

void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
		(void) inp_update_policy(sotoinpcb(so));
}

#if NECP
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
}
#endif /* NECP */

boolean_t
so_cache_timer(void)
{
	struct socket *p;
	int n_freed = 0;
	boolean_t rc = FALSE;

	lck_mtx_lock(so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT)
			break;

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to cleanup */
	if (!STAILQ_EMPTY(&so_cache_head))
		rc = TRUE;

	lck_mtx_unlock(so_cache_mtx);
	return (rc);
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
		    M_WAITOK);
		if (so != NULL)
			bzero(so, sizeof (*so));
	}
	if (so != NULL) {
		so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
		so->so_zone = socket_zone;
#if CONFIG_MACF_SOCKET
		/* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return (NULL);
		}
#endif /* MAC_SOCKET */
	}

	return (so);
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL)
				return (EPROTOTYPE);
		}
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(1, dom, type);
	if (so == NULL)
		return (ENOBUFS);

	if (flags & SOCF_ASYNC)
		so->so_state |= SS_NBIO;
#if MULTIPATH
	if (flags & SOCF_MP_SUBFLOW) {
		/*
		 * A multipath subflow socket is used internally in the kernel,
		 * therefore it does not have a file descriptor associated by
		 * default.
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_MP_SUBFLOW;
	}
#endif /* MULTIPATH */

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL))
		so->so_state |= SS_PRIV;

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully
		 */
		so->so_state |= SS_NOFDREF;
		so->so_usecount--;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return (error);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2)
		so->so_options |= SO_DEBUG;
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (proc_get_effective_thread_policy(current_thread(),
	    TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain, system or multipath sockets as
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
	case PF_MULTIPATH:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	/*
	 * Entitlements can't be checked at socket creation time except if the
	 * application requested a feature guarded by a privilege (c.f., socket
	 * delegation).
	 * The priv(9) and the Sandboxing APIs are designed with the idea that
	 * a privilege check should only be triggered by a userland request.
	 * A privilege check at socket creation time is time consuming and
	 * could trigger many authorisation error messages from the security
	 * APIs.
	 */

	*aso = so;

	return (0);
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
	    PROC_NULL));
}
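
/*
 * Illustrative usage sketch: an in-kernel caller creating a TCP socket
 * and releasing it again would do roughly the following, checking the
 * error from each step:
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		... bind/connect and transfer data ...
 *		soclose(so);
 *	}
 */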

int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK
	 */
done:
	if (ep != PROC_NULL)
		proc_rele(ep);

	return (error);
}

/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock)
		socket_lock(so, 1);
	VERIFY(so->so_usecount > 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, nam, NULL);
#endif /* NECP */

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
out:
	if (dolock)
		socket_unlock(so, 1);

	if (error == EJUSTRETURN)
		error = 0;

	return (error);
}

void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

#if CONTENT_FILTER
	cfil_sock_detach(so);
#endif /* CONTENT_FILTER */

	/* Delete the state allocated for msg queues on a socket */
	if (so->so_flags & SOF_ENABLE_MSGS) {
		FREE(so->so_msg_state, M_TEMP);
		so->so_msg_state = NULL;
	}
	VERIFY(so->so_msg_state == NULL);

	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);

#if CONFIG_MACF_SOCKET
	mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */

	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
		cached_sock_free(so);
	} else {
		FREE_ZONE(so, sizeof (*so), so->so_zone);
	}
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);

	if (error) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue-either global or per accepting socket. If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn)
		backlog = somaxconn;

	so->so_qlimit = backlog;
out:
	socket_unlock(so, 1);
	return (error);
}

void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
		so->so_event = sonullevent;
		return;
	}
	if (head != NULL) {
		socket_lock(head, 1);
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
			so->so_event = sonullevent;
			socket_unlock(head, 1);
			return;
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~SS_INCOMP;
		so->so_head = NULL;
		socket_unlock(head, 1);
	}
	sowflush(so);
	sorflush(so);

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif /* FLOW_DIVERT */

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
	so->so_event = sonullevent;

	if (dealloc)
		sodealloc(so);
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
		return;
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	lck_mtx_t *mutex_held;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0\n", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount)
		soclose_wait_locked(so);

#if CONTENT_FILTER
	/*
	 * We have to wait until the content filters are done
	 */
	if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
		cfil_sock_close_wait(so);
		cfil_sock_is_closed(so);
		cfil_sock_detach(so);
	}
#endif /* CONTENT_FILTER */

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		soresume(current_proc(), so, 1);
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
	}

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int socklock = 0;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		for (sp = TAILQ_FIRST(&so->so_incomp);
		    sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);

			/*
			 * Radar 5350314
			 * skip sockets thrown away by tcpdropdropblreq;
			 * they will get cleaned up by the garbage collection.
			 * otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW)
				continue;

			if (so->so_proto->pr_getlock != NULL) {
				/*
				 * For lock ordering consistency with the
				 * rest of the stack, we lock the socket
				 * first and then grab the head.
				 */
				socket_unlock(so, 0);
				socket_lock(sp, 1);
				socket_lock(so, 0);
				socklock = 1;
			}

			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;

			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;

				(void) soabort(sp);
			}

			if (socklock)
				socket_unlock(sp, 1);
		}

		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			/* Dequeue from so_comp since sofree() won't do it */
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(so, 0);
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;

				(void) soabort(sp);
			}

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(sp, 1);
				socket_lock(so, 0);
			}
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			if (so->so_proto->pr_getlock != NULL)
				mutex_held = (*so->so_proto->pr_getlock)(so, 0);
			else
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			while (so->so_state & SS_ISCONNECTED) {
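				/*
				 * so_linger is kept here in units of
				 * 1/100 second: the quotient below
				 * becomes whole seconds, and the
				 * remainder is scaled to nanoseconds
				 * (remainder * 10^7 ns).
				 */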
				ts.tv_sec = (so->so_linger/100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the timer fires,
					 * don't report an error
					 */
					if (error == EWOULDBLOCK)
						error = 0;
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if (so->so_flags & SOF_MP_SUBFLOW)
		so->so_flags &= ~SOF_MP_SUBFLOW;

	if ((so->so_flags & SOF_KNOTE) != 0)
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);

	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
	evsofree(so);

	so->so_usecount--;
	sofree(so);
	return (error);
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * if the FD is going away, but socket is
		 * retained in kernel remove its reference
		 */
		so->so_usecount--;
		if (so->so_usecount < 2)
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
	}
	socket_unlock(so, 1);
	return (error);
}

/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return (error);
		}
	}
	return (0);
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock)
		socket_lock(so, 1);

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);
#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock)
		socket_unlock(so, 1);
	return (error);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return (soacceptlock(so, nam, 1));
}

int
soacceptfilter(struct socket *so)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;
	struct socket *head = so->so_head;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~(SS_NOFDREF | SS_COMP);
		so->so_head = NULL;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication of the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~(SS_NOFDREF | SS_COMP);
		so->so_head = NULL;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return (error);
}

/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();

	if (dolock)
		socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, nam);
#endif /* NECP */

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		if (dolock)
			socket_unlock(so, 1);
		return (error);
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock)
			socket_unlock(so, 1);
		return (EPERM);
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			if (error == EJUSTRETURN)
				error = 0;
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
		}
	}
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return (soconnectlock(so, nam, 1));
}

/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_lock(so2, 1);

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_unlock(so2, 1);
	return (error);
}

int
soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
    sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
{
	int error;

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    proc_best_name(p),
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error);
		}
		return (error);
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
		return (EPERM);

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set.  Otherwise, if connected,
	 * try to disconnect first.  This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 *  - non-blocking connect returns before completion;
		 */
		error = sflt_connectxout(so, dst_sl);
		if (error != 0) {
			/*
			 * Disable PRECONNECT_DATA, as we don't need to
			 * send a SYN anymore.
			 */
			so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
			if (error == EJUSTRETURN)
				error = 0;
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src_sl, dst_sl, p, ifscope, aid, pcid,
			    flags, arg, arglen, auio, bytes_written);
		}
	}

	return (error);
}

int
sodisconnectlocked(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}

	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	if (error == 0)
		sflt_notify(so, sock_evt_disconnected, NULL);

bad:
	return (error);
}

/* Locking version */
int
sodisconnect(struct socket *so)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectlocked(so);
	socket_unlock(so, 1);
	return (error);
}

int
sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
{
	int error;

	/*
	 * Call the protocol disconnectx handler; let it handle all
	 * matters related to the connection state of this session.
	 */
	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
	if (error == 0) {
		/*
		 * The event applies only for the session, not for
		 * the disconnection of individual subflows.
		 */
		if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
			sflt_notify(so, sock_evt_disconnected, NULL);
	}
	return (error);
}

int
sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectxlocked(so, aid, cid);
	socket_unlock(so, 1);
	return (error);
}

int
sopeelofflocked(struct socket *so, sae_associd_t aid, struct socket **psop)
{
	return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * Returns:	0			Success
 *		EPIPE
 *	sblock:EWOULDBLOCK
 *	sblock:EINTR
 *	sbwait:EBADF
 *	sbwait:EINTR
 *	[so_error]:???
 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked,
    struct mbuf *control)
{
39236c6e 1732 int error = 0;
b0d623f7 1733 int32_t space;
3a60a9f5 1734 int assumelock = 0;
91447636
A
1735
1736restart:
1737 if (*sblocked == 0) {
3a60a9f5 1738 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
2d21ac55
A
1739 so->so_send_filt_thread != 0 &&
1740 so->so_send_filt_thread == current_thread()) {
3a60a9f5
A
1741 /*
1742 * We're being called recursively from a filter,
1743 * allow this to continue. Radar 4150520.
1744 * Don't set sblocked because we don't want
1745 * to perform an unlock later.
1746 */
1747 assumelock = 1;
2d21ac55 1748 } else {
3a60a9f5
A
1749 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1750 if (error) {
6d2010ae
A
1751 if (so->so_flags & SOF_DEFUNCT)
1752 goto defunct;
2d21ac55 1753 return (error);
3a60a9f5
A
1754 }
1755 *sblocked = 1;
1756 }
91447636 1757 }
2d21ac55
A
1758
1759 /*
6d2010ae
A
1760 * If a send attempt is made on a socket that has been marked
1761 * as inactive (disconnected), reject the request.
2d21ac55 1762 */
6d2010ae
A
1763 if (so->so_flags & SOF_DEFUNCT) {
1764defunct:
1765 error = EPIPE;
39037602
A
1766 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1767 __func__, proc_selfpid(), proc_best_name(current_proc()),
3e170ce0 1768 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
39037602 1769 SOCK_DOM(so), SOCK_TYPE(so), error);
6d2010ae
A
1770 return (error);
1771 }
2d21ac55 1772
fe8ab488
A
1773 if (so->so_state & SS_CANTSENDMORE) {
1774#if CONTENT_FILTER
1775 /*
 1777	 * Can re-inject data of half-closed connections
1777 */
1778 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1779 so->so_snd.sb_cfil_thread == current_thread() &&
1780 cfil_sock_data_pending(&so->so_snd) != 0)
1781 CFIL_LOG(LOG_INFO,
1782 "so %llx ignore SS_CANTSENDMORE",
3e170ce0 1783 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
fe8ab488
A
1784 else
1785#endif /* CONTENT_FILTER */
1786 return (EPIPE);
1787 }
91447636
A
1788 if (so->so_error) {
1789 error = so->so_error;
1790 so->so_error = 0;
2d21ac55 1791 return (error);
91447636 1792 }
2d21ac55 1793
91447636 1794 if ((so->so_state & SS_ISCONNECTED) == 0) {
2d21ac55 1795 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
fe8ab488 1796 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
3e170ce0
A
1797 (resid != 0 || clen == 0) &&
1798 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
fe8ab488 1799#if MPTCP
3e170ce0
A
1800 /*
1801 * MPTCP Fast Join sends data before the
fe8ab488
A
1802 * socket is truly connected.
1803 */
1804 if ((so->so_flags & (SOF_MP_SUBFLOW |
1805 SOF_MPTCP_FASTJOIN)) !=
1806 (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
3e170ce0 1807#endif /* MPTCP */
2d21ac55 1808 return (ENOTCONN);
fe8ab488 1809 }
2d21ac55
A
1810 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1811 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1812 ENOTCONN : EDESTADDRREQ);
1813 }
91447636 1814 }
3e170ce0 1815
39236c6e
A
1816 if (so->so_flags & SOF_ENABLE_MSGS)
1817 space = msgq_sbspace(so, control);
1818 else
1819 space = sbspace(&so->so_snd);
1820
91447636
A
1821 if (flags & MSG_OOB)
1822 space += 1024;
1823 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2d21ac55
A
1824 clen > so->so_snd.sb_hiwat)
1825 return (EMSGSIZE);
39236c6e 1826
316670eb 1827 if ((space < resid + clen &&
3e170ce0
A
1828 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1829 space < clen)) ||
316670eb 1830 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
3e170ce0
A
1831 /*
 1832	 * Don't block the connectx call when there's more data
1833 * than can be copied.
1834 */
1835 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1836 if (space == 0) {
1837 return (EWOULDBLOCK);
1838 }
1839 if (space < (int32_t)so->so_snd.sb_lowat) {
1840 return (0);
1841 }
1842 }
2d21ac55
A
1843 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1844 assumelock) {
1845 return (EWOULDBLOCK);
3a60a9f5 1846 }
39236c6e 1847 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
6d2010ae 1848 *sblocked = 0;
91447636
A
1849 error = sbwait(&so->so_snd);
1850 if (error) {
6d2010ae
A
1851 if (so->so_flags & SOF_DEFUNCT)
1852 goto defunct;
2d21ac55 1853 return (error);
91447636
A
1854 }
1855 goto restart;
1856 }
2d21ac55 1857 return (0);
91447636
A
1858}
1859
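/*
 * Editor's illustrative sketch (hypothetical, not compiled): the
 * minimal shape of a sosendcheck() consumer, mirroring how sosend()
 * below uses it -- acquire the send buffer, hand the data to the
 * protocol, then release via sbunlock() only if sosendcheck()
 * actually took the lock.
 */
#if 0
static int
example_send_one(struct socket *so, struct mbuf *m)
{
	int sblocked = 0;
	int error;

	socket_lock(so, 1);
	error = sosendcheck(so, NULL, m->m_pkthdr.len, 0, 1, 0,
	    &sblocked, NULL);
	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0,
		    m, NULL, NULL, current_proc());
	else
		m_freem(m);
	if (sblocked)
		sbunlock(&so->so_snd, FALSE);	/* unlocks the socket */
	else
		socket_unlock(so, 1);
	return (error);
}
#endif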
1c79356b
A
1860/*
1861 * Send on a socket.
1862 * If send must go all at once and message is larger than
1863 * send buffering, then hard error.
1864 * Lock against other senders.
1865 * If must go all at once and not enough room now, then
1866 * inform user that this would block and do nothing.
1867 * Otherwise, if nonblocking, send as much as possible.
1868 * The data to be sent is described by "uio" if nonzero,
1869 * otherwise by the mbuf chain "top" (which must be null
1870 * if uio is not). Data provided in mbuf chain must be small
1871 * enough to send all at once.
1872 *
1873 * Returns nonzero on error, timeout or signal; callers
1874 * must check for short counts if EINTR/ERESTART are returned.
1875 * Data and control buffers are freed on return.
1876 * Experiment:
1877 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1878 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1879 * point at the mbuf chain being constructed and go from there.
2d21ac55
A
1880 *
1881 * Returns: 0 Success
1882 * EOPNOTSUPP
1883 * EINVAL
1884 * ENOBUFS
1885 * uiomove:EFAULT
1886 * sosendcheck:EPIPE
1887 * sosendcheck:EWOULDBLOCK
1888 * sosendcheck:EINTR
1889 * sosendcheck:EBADF
1890 * sosendcheck:EINTR
1891 * sosendcheck:??? [value from so_error]
1892 * <pru_send>:ECONNRESET[TCP]
1893 * <pru_send>:EINVAL[TCP]
1894 * <pru_send>:ENOBUFS[TCP]
1895 * <pru_send>:EADDRINUSE[TCP]
1896 * <pru_send>:EADDRNOTAVAIL[TCP]
1897 * <pru_send>:EAFNOSUPPORT[TCP]
1898 * <pru_send>:EACCES[TCP]
1899 * <pru_send>:EAGAIN[TCP]
1900 * <pru_send>:EPERM[TCP]
1901 * <pru_send>:EMSGSIZE[TCP]
1902 * <pru_send>:EHOSTUNREACH[TCP]
1903 * <pru_send>:ENETUNREACH[TCP]
1904 * <pru_send>:ENETDOWN[TCP]
1905 * <pru_send>:ENOMEM[TCP]
1906 * <pru_send>:ENOBUFS[TCP]
1907 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1908 * <pru_send>:EINVAL[AF_UNIX]
1909 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1910 * <pru_send>:EPIPE[AF_UNIX]
1911 * <pru_send>:ENOTCONN[AF_UNIX]
1912 * <pru_send>:EISCONN[AF_UNIX]
1913 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1914 * <sf_data_out>:??? [whatever a filter author chooses]
1915 *
1916 * Notes: Other <pru_send> returns depend on the protocol family; all
1917 * <sf_data_out> returns depend on what the filter author causes
1918 * their filter to return.
1c79356b
A
1919 */
1920int
2d21ac55
A
1921sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1922 struct mbuf *top, struct mbuf *control, int flags)
1c79356b
A
1923{
1924 struct mbuf **mp;
39236c6e 1925 struct mbuf *m, *freelist = NULL;
3e170ce0 1926 user_ssize_t space, len, resid, orig_resid;
91447636 1927 int clen = 0, error, dontroute, mlen, sendflags;
1c79356b 1928 int atomic = sosendallatonce(so) || top;
91447636 1929 int sblocked = 0;
1c79356b 1930 struct proc *p = current_proc();
39236c6e 1931 struct mbuf *control_copy = NULL;
3e170ce0
A
1932 uint16_t headroom = 0;
1933 boolean_t en_tracing = FALSE;
1c79356b 1934
39236c6e 1935 if (uio != NULL)
91447636 1936 resid = uio_resid(uio);
39236c6e 1937 else
1c79356b 1938 resid = top->m_pkthdr.len;
39236c6e 1939
2d21ac55
A
1940 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1941 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1c79356b 1942
91447636 1943 socket_lock(so, 1);
fe8ab488 1944
3e170ce0
A
1945 /*
 1946	 * trace if tracing & network (vs. unix) sockets &
 1947	 * non-loopback
1948 */
1949 if (ENTR_SHOULDTRACE &&
1950 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1951 struct inpcb *inp = sotoinpcb(so);
1952 if (inp->inp_last_outifp != NULL &&
1953 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1954 en_tracing = TRUE;
1955 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1956 VM_KERNEL_ADDRPERM(so),
1957 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1958 (int64_t)resid);
1959 orig_resid = resid;
1960 }
1961 }
1962
fe8ab488
A
1963 /*
1964 * Re-injection should not affect process accounting
1965 */
1966 if ((flags & MSG_SKIPCFIL) == 0) {
3e170ce0
A
1967 so_update_last_owner_locked(so, p);
1968 so_update_policy(so);
1969
fe8ab488 1970#if NECP
3e170ce0 1971 so_update_necp_policy(so, NULL, addr);
fe8ab488
A
1972#endif /* NECP */
1973 }
3e170ce0 1974
2d21ac55
A
1975 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1976 error = EOPNOTSUPP;
1977 socket_unlock(so, 1);
1978 goto out;
1979 }
91447636 1980
1c79356b
A
1981 /*
1982 * In theory resid should be unsigned.
1983 * However, space must be signed, as it might be less than 0
1984 * if we over-committed, and we must use a signed comparison
1985 * of space and resid. On the other hand, a negative resid
1986 * causes us to loop sending 0-length segments to the protocol.
1987 *
39236c6e
A
1988 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1989 * But it will be used by sockets doing message delivery.
1990 *
fe8ab488 1991 * Note: We limit resid to be a positive int value as we use
39236c6e 1992 * imin() to set bytes_to_copy -- radr://14558484
1c79356b 1993 */
fe8ab488 1994 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
39236c6e 1995 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1c79356b 1996 error = EINVAL;
91447636 1997 socket_unlock(so, 1);
1c79356b
A
1998 goto out;
1999 }
2000
39236c6e
A
2001 dontroute = (flags & MSG_DONTROUTE) &&
2002 (so->so_options & SO_DONTROUTE) == 0 &&
1c79356b 2003 (so->so_proto->pr_flags & PR_ATOMIC);
b0d623f7 2004 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
39236c6e
A
2005
2006 if (control != NULL)
1c79356b 2007 clen = control->m_len;
1c79356b 2008
3e170ce0
A
2009 if (soreserveheadroom != 0)
2010 headroom = so->so_pktheadroom;
2011
1c79356b 2012 do {
2d21ac55 2013 error = sosendcheck(so, addr, resid, clen, atomic, flags,
39236c6e
A
2014 &sblocked, control);
2015 if (error)
3a60a9f5 2016 goto release;
39236c6e 2017
1c79356b 2018 mp = &top;
39236c6e
A
2019 if (so->so_flags & SOF_ENABLE_MSGS)
2020 space = msgq_sbspace(so, control);
2021 else
2022 space = sbspace(&so->so_snd) - clen;
2023 space += ((flags & MSG_OOB) ? 1024 : 0);
fa4905b1 2024
1c79356b 2025 do {
2d21ac55 2026 if (uio == NULL) {
91447636
A
2027 /*
2028 * Data is prepackaged in "top".
2029 */
2030 resid = 0;
1c79356b
A
2031 if (flags & MSG_EOR)
2032 top->m_flags |= M_EOR;
91447636 2033 } else {
2d21ac55
A
2034 int chainlength;
2035 int bytes_to_copy;
2036 boolean_t jumbocl;
fe8ab488 2037 boolean_t bigcl;
3e170ce0 2038 int bytes_to_alloc;
2d21ac55 2039
b0d623f7 2040 bytes_to_copy = imin(resid, space);
2d21ac55 2041
3e170ce0
A
2042 bytes_to_alloc = bytes_to_copy;
2043 if (top == NULL)
2044 bytes_to_alloc += headroom;
2045
39236c6e 2046 if (sosendminchain > 0)
91447636 2047 chainlength = 0;
39236c6e 2048 else
91447636 2049 chainlength = sosendmaxchain;
2d21ac55 2050
fe8ab488 2051 /*
3e170ce0
A
 2052	 * Use a big 4 KB cluster when the outgoing interface
2053 * does not prefer 2 KB clusters
fe8ab488 2054 */
3e170ce0 2055 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
fe8ab488 2056 sosendbigcl_ignore_capab;
3e170ce0 2057
2d21ac55
A
2058 /*
2059 * Attempt to use larger than system page-size
2060 * clusters for large writes only if there is
2061 * a jumbo cluster pool and if the socket is
2062 * marked accordingly.
2063 */
2064 jumbocl = sosendjcl && njcl > 0 &&
2065 ((so->so_flags & SOF_MULTIPAGES) ||
fe8ab488
A
2066 sosendjcl_ignore_capab) &&
2067 bigcl;
2d21ac55 2068
91447636 2069 socket_unlock(so, 0);
2d21ac55 2070
91447636
A
2071 do {
2072 int num_needed;
39236c6e 2073 int hdrs_needed = (top == NULL) ? 1 : 0;
2d21ac55 2074
91447636 2075 /*
2d21ac55
A
 2076	 * Try to maintain a local cache of mbuf
 2077	 * clusters needed to complete this
 2078	 * write. The list is further limited to
 2079	 * the number currently needed to fill
 2080	 * the socket. This mechanism allows a
 2081	 * large number of mbufs/clusters to be
 2082	 * grabbed under a single mbuf lock. If
 2083	 * we can't get any clusters, then fall
 2084	 * back to trying for mbufs. If we fail
 2085	 * early (or miscalculate the number
 2086	 * needed), make sure to release any
 2087	 * clusters we haven't yet
 2088	 * consumed.
91447636 2089 */
2d21ac55 2090 if (freelist == NULL &&
3e170ce0 2091 bytes_to_alloc > MBIGCLBYTES &&
6d2010ae 2092 jumbocl) {
2d21ac55 2093 num_needed =
3e170ce0 2094 bytes_to_alloc / M16KCLBYTES;
2d21ac55 2095
3e170ce0 2096 if ((bytes_to_alloc -
2d21ac55
A
2097 (num_needed * M16KCLBYTES))
2098 >= MINCLSIZE)
2099 num_needed++;
91447636 2100
2d21ac55
A
2101 freelist =
2102 m_getpackets_internal(
2103 (unsigned int *)&num_needed,
2104 hdrs_needed, M_WAIT, 0,
2105 M16KCLBYTES);
2106 /*
2107 * Fall back to 4K cluster size
2108 * if allocation failed
2109 */
2110 }
2111
2112 if (freelist == NULL &&
3e170ce0 2113 bytes_to_alloc > MCLBYTES &&
fe8ab488 2114 bigcl) {
2d21ac55 2115 num_needed =
3e170ce0 2116 bytes_to_alloc / MBIGCLBYTES;
2d21ac55 2117
3e170ce0 2118 if ((bytes_to_alloc -
6d2010ae 2119 (num_needed * MBIGCLBYTES)) >=
2d21ac55 2120 MINCLSIZE)
91447636 2121 num_needed++;
2d21ac55
A
2122
2123 freelist =
2124 m_getpackets_internal(
2125 (unsigned int *)&num_needed,
2126 hdrs_needed, M_WAIT, 0,
6d2010ae 2127 MBIGCLBYTES);
2d21ac55
A
2128 /*
2129 * Fall back to cluster size
2130 * if allocation failed
2131 */
91447636 2132 }
2d21ac55 2133
3e170ce0
A
2134 /*
 2135	 * Allocate a cluster as we want to
 2136	 * avoid splitting the data into more
 2137	 * than one segment; using MINCLSIZE
 2138	 * would lead us to allocate two mbufs
2139 */
2140 if (soreserveheadroom != 0 &&
2141 freelist == NULL &&
2142 ((top == NULL &&
2143 bytes_to_alloc > _MHLEN) ||
2144 bytes_to_alloc > _MLEN)) {
2145 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2146 MCLBYTES;
2147 freelist =
2148 m_getpackets_internal(
2149 (unsigned int *)&num_needed,
2150 hdrs_needed, M_WAIT, 0,
2151 MCLBYTES);
2152 /*
2153 * Fall back to a single mbuf
2154 * if allocation failed
2155 */
2156 } else if (freelist == NULL &&
2157 bytes_to_alloc > MINCLSIZE) {
2d21ac55 2158 num_needed =
3e170ce0 2159 bytes_to_alloc / MCLBYTES;
2d21ac55 2160
3e170ce0 2161 if ((bytes_to_alloc -
2d21ac55
A
2162 (num_needed * MCLBYTES)) >=
2163 MINCLSIZE)
91447636 2164 num_needed++;
2d21ac55
A
2165
2166 freelist =
2167 m_getpackets_internal(
2168 (unsigned int *)&num_needed,
2169 hdrs_needed, M_WAIT, 0,
2170 MCLBYTES);
2171 /*
2172 * Fall back to a single mbuf
2173 * if allocation failed
2174 */
91447636 2175 }
3e170ce0
A
2176 /*
2177 * For datagram protocols, leave
2178 * headroom for protocol headers
2179 * in the first cluster of the chain
2180 */
2181 if (freelist != NULL && atomic &&
2182 top == NULL && headroom > 0) {
2183 freelist->m_data += headroom;
2184 }
39037602 2185
3e170ce0
A
2186 /*
2187 * Fall back to regular mbufs without
2188 * reserving the socket headroom
2189 */
91447636 2190 if (freelist == NULL) {
39236c6e 2191 if (top == NULL)
2d21ac55
A
2192 MGETHDR(freelist,
2193 M_WAIT, MT_DATA);
91447636 2194 else
2d21ac55
A
2195 MGET(freelist,
2196 M_WAIT, MT_DATA);
91447636
A
2197
2198 if (freelist == NULL) {
2199 error = ENOBUFS;
2200 socket_lock(so, 0);
3a60a9f5 2201 goto release;
91447636
A
2202 }
2203 /*
2d21ac55
A
2204 * For datagram protocols,
2205 * leave room for protocol
2206 * headers in first mbuf.
91447636 2207 */
39236c6e 2208 if (atomic && top == NULL &&
2d21ac55
A
2209 bytes_to_copy < MHLEN) {
2210 MH_ALIGN(freelist,
2211 bytes_to_copy);
2212 }
91447636
A
2213 }
2214 m = freelist;
2215 freelist = m->m_next;
2216 m->m_next = NULL;
2d21ac55 2217
91447636 2218 if ((m->m_flags & M_EXT))
3e170ce0
A
2219 mlen = m->m_ext.ext_size -
2220 m_leadingspace(m);
91447636 2221 else if ((m->m_flags & M_PKTHDR))
2d21ac55
A
2222 mlen =
2223 MHLEN - m_leadingspace(m);
91447636 2224 else
3e170ce0 2225 mlen = MLEN - m_leadingspace(m);
b0d623f7 2226 len = imin(mlen, bytes_to_copy);
91447636
A
2227
2228 chainlength += len;
2d21ac55 2229
91447636 2230 space -= len;
fa4905b1 2231
2d21ac55 2232 error = uiomove(mtod(m, caddr_t),
b0d623f7 2233 len, uio);
2d21ac55 2234
91447636 2235 resid = uio_resid(uio);
2d21ac55 2236
91447636
A
2237 m->m_len = len;
2238 *mp = m;
2239 top->m_pkthdr.len += len;
2d21ac55 2240 if (error)
91447636
A
2241 break;
2242 mp = &m->m_next;
2243 if (resid <= 0) {
2244 if (flags & MSG_EOR)
2245 top->m_flags |= M_EOR;
2246 break;
2247 }
2248 bytes_to_copy = min(resid, space);
2d21ac55
A
2249
2250 } while (space > 0 &&
2251 (chainlength < sosendmaxchain || atomic ||
2252 resid < MINCLSIZE));
2253
91447636 2254 socket_lock(so, 0);
2d21ac55 2255
91447636
A
2256 if (error)
2257 goto release;
2258 }
2d21ac55
A
2259
2260 if (flags & (MSG_HOLD|MSG_SEND)) {
3a60a9f5 2261 /* Enqueue for later, go away if HOLD */
39236c6e 2262 struct mbuf *mb1;
2d21ac55 2263 if (so->so_temp && (flags & MSG_FLUSH)) {
3a60a9f5
A
2264 m_freem(so->so_temp);
2265 so->so_temp = NULL;
2266 }
2267 if (so->so_temp)
2268 so->so_tail->m_next = top;
2269 else
2270 so->so_temp = top;
2271 mb1 = top;
2272 while (mb1->m_next)
2d21ac55 2273 mb1 = mb1->m_next;
3a60a9f5 2274 so->so_tail = mb1;
2d21ac55 2275 if (flags & MSG_HOLD) {
3a60a9f5
A
2276 top = NULL;
2277 goto release;
2278 }
2279 top = so->so_temp;
2d21ac55
A
2280 }
2281 if (dontroute)
2282 so->so_options |= SO_DONTROUTE;
2283
3e170ce0
A
2284 /*
2285 * Compute flags here, for pru_send and NKEs
2286 *
2287 * If the user set MSG_EOF, the protocol
2288 * understands this flag and nothing left to
2289 * send then use PRU_SEND_EOF instead of PRU_SEND.
2290 */
2d21ac55 2291 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2d21ac55 2292 ((flags & MSG_EOF) &&
3e170ce0
A
2293 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2294 (resid <= 0)) ? PRUS_EOF :
2295 /* If there is more to send set PRUS_MORETOCOME */
2296 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2297
fe8ab488
A
2298 if ((flags & MSG_SKIPCFIL) == 0) {
2299 /*
2300 * Socket filter processing
2301 */
2302 error = sflt_data_out(so, addr, &top,
2303 &control, (sendflags & MSG_OOB) ?
2304 sock_data_filt_flag_oob : 0);
2305 if (error) {
2306 if (error == EJUSTRETURN) {
2307 error = 0;
2308 clen = 0;
2309 control = NULL;
2310 top = NULL;
2311 }
2312 goto release;
91447636 2313 }
fe8ab488
A
2314#if CONTENT_FILTER
2315 /*
2316 * Content filter processing
2317 */
2318 error = cfil_sock_data_out(so, addr, top,
3e170ce0 2319 control, (sendflags & MSG_OOB) ?
fe8ab488
A
2320 sock_data_filt_flag_oob : 0);
2321 if (error) {
2322 if (error == EJUSTRETURN) {
2323 error = 0;
2324 clen = 0;
2325 control = NULL;
2326 top = NULL;
2327 }
2328 goto release;
2329 }
2330#endif /* CONTENT_FILTER */
1c79356b 2331 }
39236c6e
A
2332 if (so->so_flags & SOF_ENABLE_MSGS) {
2333 /*
2334 * Make a copy of control mbuf,
2335 * so that msg priority can be
2336 * passed to subsequent mbufs.
2337 */
2338 control_copy = m_dup(control, M_NOWAIT);
2339 }
6d2010ae 2340 error = (*so->so_proto->pr_usrreqs->pru_send)
39236c6e
A
2341 (so, sendflags, top, addr, control, p);
2342
2d21ac55
A
2343 if (flags & MSG_SEND)
2344 so->so_temp = NULL;
39236c6e 2345
2d21ac55
A
2346 if (dontroute)
2347 so->so_options &= ~SO_DONTROUTE;
2348
2349 clen = 0;
39236c6e
A
2350 control = control_copy;
2351 control_copy = NULL;
2352 top = NULL;
2d21ac55
A
2353 mp = &top;
2354 if (error)
2355 goto release;
1c79356b
A
2356 } while (resid && space > 0);
2357 } while (resid);
2358
2359release:
3a60a9f5 2360 if (sblocked)
39236c6e 2361 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
3a60a9f5
A
2362 else
2363 socket_unlock(so, 1);
1c79356b 2364out:
39236c6e 2365 if (top != NULL)
1c79356b 2366 m_freem(top);
39236c6e 2367 if (control != NULL)
1c79356b 2368 m_freem(control);
39236c6e 2369 if (freelist != NULL)
2d21ac55 2370 m_freem_list(freelist);
39236c6e
A
2371 if (control_copy != NULL)
2372 m_freem(control_copy);
1c79356b 2373
3e170ce0
A
2374 /*
2375 * One write has been done. This was enough. Get back to "normal"
2376 * behavior.
2377 */
2378 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2379 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2380
2381 if (en_tracing) {
2382 /* resid passed here is the bytes left in uio */
2383 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2384 VM_KERNEL_ADDRPERM(so),
2385 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2386 (int64_t)(orig_resid - resid));
2387 }
2388 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2389 so->so_snd.sb_cc, space, error);
1c79356b
A
2390
2391 return (error);
2392}
2393
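/*
 * Editor's illustrative sketch (hypothetical): in-kernel callers
 * normally reach sosend() through the sock_send() KPI rather than
 * calling it directly.  Assumes <sys/kpi_socket.h>; the socket is
 * presumed connected.
 */
#if 0
#include <sys/kpi_socket.h>

static errno_t
example_kpi_send(socket_t so, void *buf, size_t len, size_t *sentp)
{
	struct iovec iov;
	struct msghdr msg;

	iov.iov_base = buf;
	iov.iov_len = len;
	bzero(&msg, sizeof (msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;

	/* Builds a uio and funnels into sosend() above. */
	return (sock_send(so, &msg, 0, sentp));
}
#endif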
3e170ce0
A
2394/*
 2395 * Supported only for connected sockets (no address), without
 2396 * ancillary data (control mbuf), and only for atomic protocols
2397 */
fe8ab488 2398int
3e170ce0 2399sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
fe8ab488
A
2400{
2401 struct mbuf *m, *freelist = NULL;
2402 user_ssize_t len, resid;
3e170ce0
A
2403 int error, dontroute, mlen;
2404 int atomic = sosendallatonce(so);
fe8ab488
A
2405 int sblocked = 0;
2406 struct proc *p = current_proc();
2407 u_int uiofirst = 0;
2408 u_int uiolast = 0;
3e170ce0
A
2409 struct mbuf *top = NULL;
2410 uint16_t headroom = 0;
2411 boolean_t bigcl;
fe8ab488
A
2412
2413 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2414 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2415
2416 if (so->so_type != SOCK_DGRAM) {
2417 error = EINVAL;
2418 goto out;
2419 }
2420 if (atomic == 0) {
2421 error = EINVAL;
2422 goto out;
2423 }
2424 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2425 error = EPROTONOSUPPORT;
2426 goto out;
2427 }
2428 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2429 error = EINVAL;
2430 goto out;
2431 }
3e170ce0 2432 resid = uio_array_resid(uioarray, uiocnt);
fe8ab488
A
2433
2434 /*
2435 * In theory resid should be unsigned.
2436 * However, space must be signed, as it might be less than 0
2437 * if we over-committed, and we must use a signed comparison
2438 * of space and resid. On the other hand, a negative resid
2439 * causes us to loop sending 0-length segments to the protocol.
2440 *
2441 * Note: We limit resid to be a positive int value as we use
2442 * imin() to set bytes_to_copy -- radr://14558484
2443 */
2444 if (resid < 0 || resid > INT_MAX) {
2445 error = EINVAL;
2446 goto out;
2447 }
fe8ab488
A
2448
2449 socket_lock(so, 1);
2450 so_update_last_owner_locked(so, p);
2451 so_update_policy(so);
3e170ce0 2452
fe8ab488 2453#if NECP
3e170ce0 2454 so_update_necp_policy(so, NULL, NULL);
fe8ab488 2455#endif /* NECP */
3e170ce0 2456
fe8ab488
A
2457 dontroute = (flags & MSG_DONTROUTE) &&
2458 (so->so_options & SO_DONTROUTE) == 0 &&
2459 (so->so_proto->pr_flags & PR_ATOMIC);
2460 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2461
3e170ce0
A
2462 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2463 &sblocked, NULL);
fe8ab488
A
2464 if (error)
2465 goto release;
2466
3e170ce0
A
2467 /*
2468 * Use big 4 KB clusters when the outgoing interface does not prefer
2469 * 2 KB clusters
2470 */
2471 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2472
2473 if (soreserveheadroom != 0)
2474 headroom = so->so_pktheadroom;
2475
fe8ab488
A
2476 do {
2477 int i;
3e170ce0
A
2478 int num_needed = 0;
2479 int chainlength;
2480 size_t maxpktlen = 0;
2481 int bytes_to_alloc;
fe8ab488 2482
3e170ce0
A
2483 if (sosendminchain > 0)
2484 chainlength = 0;
2485 else
2486 chainlength = sosendmaxchain;
fe8ab488 2487
3e170ce0 2488 socket_unlock(so, 0);
fe8ab488 2489
3e170ce0
A
2490 /*
2491 * Find a set of uio that fit in a reasonable number
2492 * of mbuf packets
2493 */
2494 for (i = uiofirst; i < uiocnt; i++) {
2495 struct uio *auio = uioarray[i];
fe8ab488 2496
3e170ce0 2497 len = uio_resid(auio);
fe8ab488 2498
3e170ce0
A
2499 /* Do nothing for empty messages */
2500 if (len == 0)
2501 continue;
fe8ab488 2502
3e170ce0
A
2503 num_needed += 1;
2504 uiolast += 1;
fe8ab488 2505
3e170ce0
A
2506 if (len > maxpktlen)
2507 maxpktlen = len;
fe8ab488 2508
3e170ce0
A
2509 chainlength += len;
2510 if (chainlength > sosendmaxchain)
fe8ab488 2511 break;
3e170ce0
A
2512 }
2513 /*
2514 * Nothing left to send
2515 */
2516 if (num_needed == 0) {
2517 socket_lock(so, 0);
2518 break;
2519 }
2520 /*
2521 * Allocate buffer large enough to include headroom space for
2522 * network and link header
39037602 2523 *
3e170ce0
A
2524 */
2525 bytes_to_alloc = maxpktlen + headroom;
2526
2527 /*
2528 * Allocate a single contiguous buffer of the smallest available
2529 * size when possible
2530 */
2531 if (bytes_to_alloc > MCLBYTES &&
2532 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2533 freelist = m_getpackets_internal(
2534 (unsigned int *)&num_needed,
2535 num_needed, M_WAIT, 1,
2536 MBIGCLBYTES);
2537 } else if (bytes_to_alloc > _MHLEN &&
2538 bytes_to_alloc <= MCLBYTES) {
2539 freelist = m_getpackets_internal(
2540 (unsigned int *)&num_needed,
2541 num_needed, M_WAIT, 1,
2542 MCLBYTES);
2543 } else {
fe8ab488
A
2544 freelist = m_allocpacket_internal(
2545 (unsigned int *)&num_needed,
3e170ce0
A
2546 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2547 }
39037602 2548
3e170ce0
A
2549 if (freelist == NULL) {
2550 socket_lock(so, 0);
2551 error = ENOMEM;
2552 goto release;
2553 }
2554 /*
2555 * Copy each uio of the set into its own mbuf packet
2556 */
2557 for (i = uiofirst, m = freelist;
2558 i < uiolast && m != NULL;
2559 i++) {
2560 int bytes_to_copy;
2561 struct mbuf *n;
2562 struct uio *auio = uioarray[i];
fe8ab488 2563
3e170ce0
A
2564 bytes_to_copy = uio_resid(auio);
2565
2566 /* Do nothing for empty messages */
2567 if (bytes_to_copy == 0)
2568 continue;
fe8ab488 2569 /*
3e170ce0
A
2570 * Leave headroom for protocol headers
2571 * in the first mbuf of the chain
fe8ab488 2572 */
3e170ce0
A
2573 m->m_data += headroom;
2574
2575 for (n = m; n != NULL; n = n->m_next) {
2576 if ((m->m_flags & M_EXT))
2577 mlen = m->m_ext.ext_size -
2578 m_leadingspace(m);
2579 else if ((m->m_flags & M_PKTHDR))
2580 mlen =
2581 MHLEN - m_leadingspace(m);
2582 else
2583 mlen = MLEN - m_leadingspace(m);
2584 len = imin(mlen, bytes_to_copy);
fe8ab488 2585
3e170ce0
A
2586 /*
2587 * Note: uiomove() decrements the iovec
2588 * length
2589 */
2590 error = uiomove(mtod(n, caddr_t),
2591 len, auio);
fe8ab488
A
2592 if (error != 0)
2593 break;
3e170ce0
A
2594 n->m_len = len;
2595 m->m_pkthdr.len += len;
fe8ab488 2596
3e170ce0 2597 VERIFY(m->m_pkthdr.len <= maxpktlen);
fe8ab488 2598
3e170ce0
A
2599 bytes_to_copy -= len;
2600 resid -= len;
2601 }
2602 if (m->m_pkthdr.len == 0) {
2603 printf(
2604 "%s:%d so %llx pkt %llx type %u len null\n",
2605 __func__, __LINE__,
2606 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2607 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2608 m->m_type);
2609 }
2610 if (error != 0)
2611 break;
2612 m = m->m_nextpkt;
fe8ab488
A
2613 }
2614
3e170ce0
A
2615 socket_lock(so, 0);
2616
2617 if (error)
2618 goto release;
2619 top = freelist;
2620 freelist = NULL;
2621
fe8ab488
A
2622 if (dontroute)
2623 so->so_options |= SO_DONTROUTE;
2624
2625 if ((flags & MSG_SKIPCFIL) == 0) {
2626 struct mbuf **prevnextp = NULL;
3e170ce0 2627
fe8ab488
A
2628 for (i = uiofirst, m = top;
2629 i < uiolast && m != NULL;
2630 i++) {
2631 struct mbuf *nextpkt = m->m_nextpkt;
2632
2633 /*
2634 * Socket filter processing
2635 */
3e170ce0
A
2636 error = sflt_data_out(so, NULL, &m,
2637 NULL, 0);
fe8ab488
A
2638 if (error != 0 && error != EJUSTRETURN)
2639 goto release;
3e170ce0 2640
fe8ab488
A
2641#if CONTENT_FILTER
2642 if (error == 0) {
2643 /*
2644 * Content filter processing
2645 */
3e170ce0
A
2646 error = cfil_sock_data_out(so, NULL, m,
2647 NULL, 0);
fe8ab488
A
2648 if (error != 0 && error != EJUSTRETURN)
2649 goto release;
2650 }
2651#endif /* CONTENT_FILTER */
2652 /*
2653 * Remove packet from the list when
2654 * swallowed by a filter
2655 */
2656 if (error == EJUSTRETURN) {
2657 error = 0;
2658 if (prevnextp != NULL)
2659 *prevnextp = nextpkt;
2660 else
2661 top = nextpkt;
3e170ce0
A
2662 }
2663
fe8ab488
A
2664 m = nextpkt;
2665 if (m != NULL)
2666 prevnextp = &m->m_nextpkt;
2667 }
2668 }
2669 if (top != NULL)
2670 error = (*so->so_proto->pr_usrreqs->pru_send_list)
3e170ce0 2671 (so, 0, top, NULL, NULL, p);
fe8ab488
A
2672
2673 if (dontroute)
2674 so->so_options &= ~SO_DONTROUTE;
2675
fe8ab488
A
2676 top = NULL;
2677 uiofirst = uiolast;
2678 } while (resid > 0 && error == 0);
2679release:
2680 if (sblocked)
2681 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2682 else
2683 socket_unlock(so, 1);
2684out:
2685 if (top != NULL)
2686 m_freem(top);
fe8ab488
A
2687 if (freelist != NULL)
2688 m_freem_list(freelist);
2689
2690 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2691 so->so_snd.sb_cc, 0, error);
2692
2693 return (error);
2694}
2695
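/*
 * Editor's note (illustrative): sosend_list() serves batched
 * datagram writes (the private sendmsg_x() syscall path).  Lacking
 * that SPI, an in-kernel client gets equivalent behavior, minus the
 * batching, by sending one datagram per KPI call -- a hypothetical
 * sketch:
 */
#if 0
#include <sys/kpi_socket.h>

static errno_t
example_send_datagrams(socket_t so, struct iovec *iovs, u_int cnt)
{
	struct msghdr msg;
	size_t sent;
	errno_t err = 0;
	u_int i;

	for (i = 0; i < cnt && err == 0; i++) {
		bzero(&msg, sizeof (msg));
		msg.msg_iov = &iovs[i];
		msg.msg_iovlen = 1;
		/* each call is one packet on an atomic protocol */
		err = sock_send(so, &msg, 0, &sent);
	}
	return (err);
}
#endif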
3e170ce0
A
2696/*
2697 * May return ERESTART when packet is dropped by MAC policy check
2698 */
2699static int
2700soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2701 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2702{
2703 int error = 0;
2704 struct mbuf *m = *mp;
2705 struct mbuf *nextrecord = *nextrecordp;
2706
2707 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2708#if CONFIG_MACF_SOCKET_SUBSET
2709 /*
2710 * Call the MAC framework for policy checking if we're in
2711 * the user process context and the socket isn't connected.
2712 */
2713 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2714 struct mbuf *m0 = m;
2715 /*
2716 * Dequeue this record (temporarily) from the receive
2717 * list since we're about to drop the socket's lock
2718 * where a new record may arrive and be appended to
2719 * the list. Upon MAC policy failure, the record
2720 * will be freed. Otherwise, we'll add it back to
2721 * the head of the list. We cannot rely on SB_LOCK
2722 * because append operation uses the socket's lock.
2723 */
2724 do {
2725 m->m_nextpkt = NULL;
2726 sbfree(&so->so_rcv, m);
2727 m = m->m_next;
2728 } while (m != NULL);
2729 m = m0;
2730 so->so_rcv.sb_mb = nextrecord;
2731 SB_EMPTY_FIXUP(&so->so_rcv);
2732 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2733 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2734 socket_unlock(so, 0);
2735
2736 if (mac_socket_check_received(proc_ucred(p), so,
2737 mtod(m, struct sockaddr *)) != 0) {
2738 /*
2739 * MAC policy failure; free this record and
2740 * process the next record (or block until
2741 * one is available). We have adjusted sb_cc
2742 * and sb_mbcnt above so there is no need to
2743 * call sbfree() again.
2744 */
2745 m_freem(m);
2746 /*
2747 * Clear SB_LOCK but don't unlock the socket.
2748 * Process the next record or wait for one.
2749 */
2750 socket_lock(so, 0);
2751 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2752 error = ERESTART;
2753 goto done;
2754 }
2755 socket_lock(so, 0);
2756 /*
2757 * If the socket has been defunct'd, drop it.
2758 */
2759 if (so->so_flags & SOF_DEFUNCT) {
2760 m_freem(m);
2761 error = ENOTCONN;
2762 goto done;
2763 }
2764 /*
2765 * Re-adjust the socket receive list and re-enqueue
2766 * the record in front of any packets which may have
2767 * been appended while we dropped the lock.
2768 */
2769 for (m = m0; m->m_next != NULL; m = m->m_next)
2770 sballoc(&so->so_rcv, m);
2771 sballoc(&so->so_rcv, m);
2772 if (so->so_rcv.sb_mb == NULL) {
2773 so->so_rcv.sb_lastrecord = m0;
2774 so->so_rcv.sb_mbtail = m;
2775 }
2776 m = m0;
2777 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2778 so->so_rcv.sb_mb = m;
2779 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2780 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2781 }
2782#endif /* CONFIG_MACF_SOCKET_SUBSET */
2783 if (psa != NULL) {
2784 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
2785 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2786 error = EWOULDBLOCK;
2787 goto done;
2788 }
2789 }
2790 if (flags & MSG_PEEK) {
2791 m = m->m_next;
2792 } else {
2793 sbfree(&so->so_rcv, m);
2794 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2795 panic("%s: about to create invalid socketbuf",
2796 __func__);
2797 /* NOTREACHED */
2798 }
2799 MFREE(m, so->so_rcv.sb_mb);
2800 m = so->so_rcv.sb_mb;
2801 if (m != NULL) {
2802 m->m_nextpkt = nextrecord;
2803 } else {
2804 so->so_rcv.sb_mb = nextrecord;
2805 SB_EMPTY_FIXUP(&so->so_rcv);
2806 }
2807 }
2808done:
2809 *mp = m;
2810 *nextrecordp = nextrecord;
2811
2812 return (error);
2813}
2814
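/*
 * Editor's note (illustrative): the sockaddr duplicated into *psa
 * above is what recvfrom(2) ultimately hands back to userspace.  A
 * minimal userspace view, assuming a bound AF_INET UDP socket:
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>

static ssize_t
example_recvfrom(int s, void *buf, size_t len)
{
	struct sockaddr_in from;
	socklen_t fromlen = sizeof (from);

	/* 'from' is filled from the MT_SONAME mbuf processed above */
	return (recvfrom(s, buf, len, 0,
	    (struct sockaddr *)&from, &fromlen));
}
#endif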
2815/*
2816 * Process one or more MT_CONTROL mbufs present before any data mbufs
2817 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2818 * just copy the data; if !MSG_PEEK, we call into the protocol to
2819 * perform externalization.
2820 */
2821static int
2822soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2823 struct mbuf **mp, struct mbuf **nextrecordp)
2824{
2825 int error = 0;
2826 struct mbuf *cm = NULL, *cmn;
2827 struct mbuf **cme = &cm;
2828 struct sockbuf *sb_rcv = &so->so_rcv;
2829 struct mbuf **msgpcm = NULL;
2830 struct mbuf *m = *mp;
2831 struct mbuf *nextrecord = *nextrecordp;
2832 struct protosw *pr = so->so_proto;
2833
2834 /*
2835 * Externalizing the control messages would require us to
2836 * drop the socket's lock below. Once we re-acquire the
2837 * lock, the mbuf chain might change. In order to preserve
2838 * consistency, we unlink all control messages from the
2839 * first mbuf chain in one shot and link them separately
2840 * onto a different chain.
2841 */
2842 do {
2843 if (flags & MSG_PEEK) {
2844 if (controlp != NULL) {
2845 if (*controlp == NULL) {
2846 msgpcm = controlp;
2847 }
2848 *controlp = m_copy(m, 0, m->m_len);
2849
2850 /*
2851 * If we failed to allocate an mbuf,
2852 * release any previously allocated
2853 * mbufs for control data. Return
2854 * an error. Keep the mbufs in the
2855 * socket as this is using
2856 * MSG_PEEK flag.
2857 */
2858 if (*controlp == NULL) {
2859 m_freem(*msgpcm);
2860 error = ENOBUFS;
2861 goto done;
2862 }
2863 controlp = &(*controlp)->m_next;
2864 }
2865 m = m->m_next;
2866 } else {
2867 m->m_nextpkt = NULL;
2868 sbfree(sb_rcv, m);
2869 sb_rcv->sb_mb = m->m_next;
2870 m->m_next = NULL;
2871 *cme = m;
2872 cme = &(*cme)->m_next;
2873 m = sb_rcv->sb_mb;
2874 }
2875 } while (m != NULL && m->m_type == MT_CONTROL);
2876
2877 if (!(flags & MSG_PEEK)) {
2878 if (sb_rcv->sb_mb != NULL) {
2879 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2880 } else {
2881 sb_rcv->sb_mb = nextrecord;
2882 SB_EMPTY_FIXUP(sb_rcv);
2883 }
2884 if (nextrecord == NULL)
2885 sb_rcv->sb_lastrecord = m;
2886 }
2887
2888 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2889 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2890
2891 while (cm != NULL) {
2892 int cmsg_type;
2893
2894 cmn = cm->m_next;
2895 cm->m_next = NULL;
2896 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2897
2898 /*
2899 * Call the protocol to externalize SCM_RIGHTS message
2900 * and return the modified message to the caller upon
2901 * success. Otherwise, all other control messages are
2902 * returned unmodified to the caller. Note that we
2903 * only get into this loop if MSG_PEEK is not set.
2904 */
2905 if (pr->pr_domain->dom_externalize != NULL &&
2906 cmsg_type == SCM_RIGHTS) {
2907 /*
2908 * Release socket lock: see 3903171. This
2909 * would also allow more records to be appended
2910 * to the socket buffer. We still have SB_LOCK
2911 * set on it, so we can be sure that the head
2912 * of the mbuf chain won't change.
2913 */
2914 socket_unlock(so, 0);
2915 error = (*pr->pr_domain->dom_externalize)(cm);
2916 socket_lock(so, 0);
2917 } else {
2918 error = 0;
2919 }
2920
2921 if (controlp != NULL && error == 0) {
2922 *controlp = cm;
2923 controlp = &(*controlp)->m_next;
2924 } else {
2925 (void) m_free(cm);
2926 }
2927 cm = cmn;
2928 }
2929 /*
2930 * Update the value of nextrecord in case we received new
2931 * records when the socket was unlocked above for
2932 * externalizing SCM_RIGHTS.
2933 */
2934 if (m != NULL)
2935 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2936 else
2937 nextrecord = sb_rcv->sb_mb;
2938
2939done:
2940 *mp = m;
2941 *nextrecordp = nextrecord;
2942
2943 return (error);
2944}
2945
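/*
 * Editor's note (illustrative): the dom_externalize call above is
 * what turns an in-flight SCM_RIGHTS message into live descriptors
 * in the receiver.  A minimal userspace counterpart, assuming a
 * connected AF_UNIX socket:
 */
#if 0
#include <sys/socket.h>
#include <string.h>

static int
example_recv_fd(int sock)
{
	struct msghdr msg;
	struct iovec iov;
	char data;
	union {
		struct cmsghdr hdr;
		char buf[CMSG_SPACE(sizeof (int))];
	} cmsgbuf;
	struct cmsghdr *cmsg;
	int fd = -1;

	iov.iov_base = &data;
	iov.iov_len = sizeof (data);
	memset(&msg, 0, sizeof (msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf.buf;
	msg.msg_controllen = sizeof (cmsgbuf.buf);

	if (recvmsg(sock, &msg, 0) < 0)
		return (-1);

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS)
			memcpy(&fd, CMSG_DATA(cmsg), sizeof (fd));
	}
	return (fd);	/* externalized descriptor, or -1 */
}
#endif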
1c79356b
A
2946/*
2947 * Implement receive operations on a socket.
2948 * We depend on the way that records are added to the sockbuf
2949 * by sbappend*. In particular, each record (mbufs linked through m_next)
2950 * must begin with an address if the protocol so specifies,
2951 * followed by an optional mbuf or mbufs containing ancillary data,
2952 * and then zero or more mbufs of data.
2953 * In order to avoid blocking network interrupts for the entire time here,
2954 * we splx() while doing the actual copy to user space.
2955 * Although the sockbuf is locked, new data may still be appended,
2956 * and thus we must maintain consistency of the sockbuf during that time.
2957 *
2958 * The caller may receive the data as a single mbuf chain by supplying
2959 * an mbuf **mp0 for use in returning the chain. The uio is then used
2960 * only for the count in uio_resid.
2d21ac55
A
2961 *
2962 * Returns: 0 Success
2963 * ENOBUFS
2964 * ENOTCONN
2965 * EWOULDBLOCK
2966 * uiomove:EFAULT
2967 * sblock:EWOULDBLOCK
2968 * sblock:EINTR
2969 * sbwait:EBADF
2970 * sbwait:EINTR
2971 * sodelayed_copy:EFAULT
2972 * <pru_rcvoob>:EINVAL[TCP]
2973 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2974 * <pru_rcvoob>:???
2975 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2976 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2977 * <pr_domain->dom_externalize>:???
2978 *
2979 * Notes: Additional return values from calls through <pru_rcvoob> and
2980 * <pr_domain->dom_externalize> depend on protocols other than
2981 * TCP or AF_UNIX, which are documented above.
1c79356b
A
2982 */
2983int
2d21ac55
A
2984soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2985 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1c79356b 2986{
39236c6e
A
2987 struct mbuf *m, **mp, *ml = NULL;
2988 struct mbuf *nextrecord, *free_list;
2989 int flags, error, offset;
2990 user_ssize_t len;
1c79356b 2991 struct protosw *pr = so->so_proto;
3e170ce0 2992 int moff, type = 0;
39236c6e
A
2993 user_ssize_t orig_resid = uio_resid(uio);
2994 user_ssize_t delayed_copy_len;
55e303ae
A
2995 int can_delay;
2996 int need_event;
2997 struct proc *p = current_proc();
3e170ce0 2998 boolean_t en_tracing = FALSE;
1c79356b 2999
fe8ab488
A
3000 /*
3001 * Sanity check on the length passed by caller as we are making 'int'
3002 * comparisons
3003 */
3004 if (orig_resid < 0 || orig_resid > INT_MAX)
3005 return (EINVAL);
3006
3e170ce0
A
3007 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3008 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3009 so->so_rcv.sb_hiwat);
3010
91447636 3011 socket_lock(so, 1);
6d2010ae 3012 so_update_last_owner_locked(so, p);
39236c6e 3013 so_update_policy(so);
1c79356b 3014
91447636 3015#ifdef MORE_LOCKING_DEBUG
39236c6e
A
3016 if (so->so_usecount == 1) {
3017 panic("%s: so=%x no other reference on socket\n", __func__, so);
3018 /* NOTREACHED */
3019 }
91447636 3020#endif
1c79356b 3021 mp = mp0;
39236c6e
A
3022 if (psa != NULL)
3023 *psa = NULL;
3024 if (controlp != NULL)
3025 *controlp = NULL;
3026 if (flagsp != NULL)
1c79356b
A
3027 flags = *flagsp &~ MSG_EOR;
3028 else
3029 flags = 0;
2d21ac55
A
3030
3031 /*
3032 * If a recv attempt is made on a previously-accepted socket
3033 * that has been marked as inactive (disconnected), reject
3034 * the request.
3035 */
3036 if (so->so_flags & SOF_DEFUNCT) {
3037 struct sockbuf *sb = &so->so_rcv;
3038
6d2010ae 3039 error = ENOTCONN;
39037602
A
3040 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3041 __func__, proc_pid(p), proc_best_name(p),
3042 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3043 SOCK_DOM(so), SOCK_TYPE(so), error);
2d21ac55
A
3044 /*
3045 * This socket should have been disconnected and flushed
6d2010ae
A
3046 * prior to being returned from sodefunct(); there should
3047 * be no data on its receive list, so panic otherwise.
2d21ac55 3048 */
6d2010ae
A
3049 if (so->so_state & SS_DEFUNCT)
3050 sb_empty_assert(sb, __func__);
2d21ac55 3051 socket_unlock(so, 1);
6d2010ae 3052 return (error);
2d21ac55
A
3053 }
3054
3e170ce0
A
3055 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3056 pr->pr_usrreqs->pru_preconnect) {
3057 /*
 3058 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
 3059 * call write() right after this. *If* the app then calls read(),
 3060 * we do not want to block this read indefinitely. Thus,
 3061 * we trigger a connect so that the session gets initiated.
3062 */
3063 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3064
3065 if (error) {
3066 socket_unlock(so, 1);
3067 return (error);
3068 }
3069 }
3070
3071 if (ENTR_SHOULDTRACE &&
3072 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3073 /*
3074 * enable energy tracing for inet sockets that go over
3075 * non-loopback interfaces only.
3076 */
3077 struct inpcb *inp = sotoinpcb(so);
3078 if (inp->inp_last_outifp != NULL &&
3079 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3080 en_tracing = TRUE;
3081 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3082 VM_KERNEL_ADDRPERM(so),
3083 ((so->so_state & SS_NBIO) ?
3084 kEnTrFlagNonBlocking : 0),
3085 (int64_t)orig_resid);
3086 }
3087 }
3088
2d21ac55
A
3089 /*
3090 * When SO_WANTOOBFLAG is set we try to get out-of-band data
 3091 * regardless of the flags argument. Here is the case where
3092 * out-of-band data is not inline.
3093 */
3094 if ((flags & MSG_OOB) ||
3095 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3096 (so->so_options & SO_OOBINLINE) == 0 &&
3097 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1c79356b 3098 m = m_get(M_WAIT, MT_DATA);
55e303ae 3099 if (m == NULL) {
91447636 3100 socket_unlock(so, 1);
2d21ac55
A
3101 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3102 ENOBUFS, 0, 0, 0, 0);
9bccf70c 3103 return (ENOBUFS);
55e303ae 3104 }
1c79356b
A
3105 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3106 if (error)
3107 goto bad;
91447636 3108 socket_unlock(so, 0);
1c79356b
A
3109 do {
3110 error = uiomove(mtod(m, caddr_t),
b0d623f7 3111 imin(uio_resid(uio), m->m_len), uio);
1c79356b 3112 m = m_free(m);
39236c6e 3113 } while (uio_resid(uio) && error == 0 && m != NULL);
91447636 3114 socket_lock(so, 0);
1c79356b 3115bad:
39236c6e 3116 if (m != NULL)
1c79356b 3117 m_freem(m);
39236c6e 3118
9bccf70c
A
3119 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3120 if (error == EWOULDBLOCK || error == EINVAL) {
2d21ac55 3121 /*
9bccf70c 3122 * Let's try to get normal data:
2d21ac55
A
 3123 * EWOULDBLOCK: out-of-band data not
 3124 * received yet. EINVAL: out-of-band data
3125 * already read.
9bccf70c
A
3126 */
3127 error = 0;
3128 goto nooob;
39236c6e 3129 } else if (error == 0 && flagsp != NULL) {
9bccf70c 3130 *flagsp |= MSG_OOB;
2d21ac55
A
3131 }
3132 }
91447636 3133 socket_unlock(so, 1);
3e170ce0
A
3134 if (en_tracing) {
3135 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3136 VM_KERNEL_ADDRPERM(so), 0,
3137 (int64_t)(orig_resid - uio_resid(uio)));
3138 }
2d21ac55
A
3139 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3140 0, 0, 0, 0);
39236c6e 3141
1c79356b
A
3142 return (error);
3143 }
3144nooob:
39236c6e
A
3145 if (mp != NULL)
3146 *mp = NULL;
fe8ab488
A
3147
3148 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
1c79356b 3149 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
fe8ab488 3150 }
1c79356b 3151
39236c6e 3152 free_list = NULL;
55e303ae 3153 delayed_copy_len = 0;
1c79356b 3154restart:
91447636
A
3155#ifdef MORE_LOCKING_DEBUG
3156 if (so->so_usecount <= 1)
fe8ab488 3157 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3e170ce0 3158 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
91447636 3159#endif
6601e61a
A
3160 /*
3161 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3162 * and if so just return to the caller. This could happen when
3163 * soreceive() is called by a socket upcall function during the
3164 * time the socket is freed. The socket buffer would have been
3165 * locked across the upcall, therefore we cannot put this thread
3166 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3167 * we may livelock), because the lock on the socket buffer will
3168 * only be released when the upcall routine returns to its caller.
3169 * Because the socket has been officially closed, there can be
3170 * no further read on it.
39236c6e
A
3171 *
3172 * A multipath subflow socket would have its SS_NOFDREF set by
3173 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3174 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
6601e61a
A
3175 */
3176 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
39236c6e 3177 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
6601e61a
A
3178 socket_unlock(so, 1);
3179 return (0);
3180 }
3181
9bccf70c
A
3182 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3183 if (error) {
91447636 3184 socket_unlock(so, 1);
2d21ac55
A
3185 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3186 0, 0, 0, 0);
3e170ce0
A
3187 if (en_tracing) {
3188 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3189 VM_KERNEL_ADDRPERM(so), 0,
3190 (int64_t)(orig_resid - uio_resid(uio)));
3191 }
1c79356b
A
3192 return (error);
3193 }
1c79356b
A
3194
3195 m = so->so_rcv.sb_mb;
3196 /*
3197 * If we have less data than requested, block awaiting more
3198 * (subject to any timeout) if:
3199 * 1. the current count is less than the low water mark, or
3200 * 2. MSG_WAITALL is set, and it is possible to do the entire
3201 * receive operation at once if we block (resid <= hiwat).
3202 * 3. MSG_DONTWAIT is not set
3203 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3204 * we have to do the receive in sections, and thus risk returning
3205 * a short count if a timeout or signal occurs after we start.
3206 */
39236c6e 3207 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
91447636 3208 so->so_rcv.sb_cc < uio_resid(uio)) &&
2d21ac55 3209 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
91447636 3210 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
39236c6e 3211 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2d21ac55
A
3212 /*
3213 * Panic if we notice inconsistencies in the socket's
3214 * receive list; both sb_mb and sb_cc should correctly
3215 * reflect the contents of the list, otherwise we may
3216 * end up with false positives during select() or poll()
3217 * which could put the application in a bad state.
3218 */
316670eb 3219 SB_MB_CHECK(&so->so_rcv);
55e303ae 3220
1c79356b 3221 if (so->so_error) {
39236c6e 3222 if (m != NULL)
1c79356b
A
3223 goto dontblock;
3224 error = so->so_error;
3225 if ((flags & MSG_PEEK) == 0)
3226 so->so_error = 0;
3227 goto release;
3228 }
3229 if (so->so_state & SS_CANTRCVMORE) {
fe8ab488
A
3230#if CONTENT_FILTER
3231 /*
 3232 * Deal with half-closed connections
3233 */
3234 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3235 cfil_sock_data_pending(&so->so_rcv) != 0)
3236 CFIL_LOG(LOG_INFO,
3237 "so %llx ignore SS_CANTRCVMORE",
3e170ce0
A
3238 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3239 else
fe8ab488 3240#endif /* CONTENT_FILTER */
39236c6e 3241 if (m != NULL)
1c79356b
A
3242 goto dontblock;
3243 else
3244 goto release;
3245 }
39236c6e 3246 for (; m != NULL; m = m->m_next)
2d21ac55 3247 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1c79356b
A
3248 m = so->so_rcv.sb_mb;
3249 goto dontblock;
3250 }
3251 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3252 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3253 error = ENOTCONN;
3254 goto release;
3255 }
91447636 3256 if (uio_resid(uio) == 0)
1c79356b 3257 goto release;
3e170ce0 3258
2d21ac55
A
3259 if ((so->so_state & SS_NBIO) ||
3260 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1c79356b
A
3261 error = EWOULDBLOCK;
3262 goto release;
3263 }
2d21ac55
A
3264 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3265 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
39236c6e 3266 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
2d21ac55 3267#if EVEN_MORE_LOCKING_DEBUG
1c79356b 3268 if (socket_debug)
2d21ac55 3269 printf("Waiting for socket data\n");
91447636 3270#endif
55e303ae 3271
1c79356b 3272 error = sbwait(&so->so_rcv);
2d21ac55 3273#if EVEN_MORE_LOCKING_DEBUG
1c79356b 3274 if (socket_debug)
2d21ac55 3275 printf("SORECEIVE - sbwait returned %d\n", error);
91447636 3276#endif
39236c6e
A
3277 if (so->so_usecount < 1) {
3278 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3279 __func__, so, so->so_usecount);
3280 /* NOTREACHED */
3281 }
9bccf70c 3282 if (error) {
91447636 3283 socket_unlock(so, 1);
2d21ac55
A
3284 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3285 0, 0, 0, 0);
3e170ce0
A
3286 if (en_tracing) {
3287 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3288 VM_KERNEL_ADDRPERM(so), 0,
3289 (int64_t)(orig_resid - uio_resid(uio)));
3290 }
2d21ac55 3291 return (error);
1c79356b
A
3292 }
3293 goto restart;
3294 }
3295dontblock:
b0d623f7 3296 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2d21ac55
A
3297 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3298 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
1c79356b 3299 nextrecord = m->m_nextpkt;
fe8ab488 3300
3e170ce0
A
3301 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3302 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3303 mp0 == NULL);
3304 if (error == ERESTART)
3305 goto restart;
3306 else if (error != 0)
3307 goto release;
1c79356b 3308 orig_resid = 0;
1c79356b 3309 }
2d21ac55
A
3310
3311 /*
3312 * Process one or more MT_CONTROL mbufs present before any data mbufs
3313 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3314 * just copy the data; if !MSG_PEEK, we call into the protocol to
3315 * perform externalization.
3316 */
3317 if (m != NULL && m->m_type == MT_CONTROL) {
3e170ce0
A
3318 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3319 if (error != 0)
3320 goto release;
316670eb 3321 orig_resid = 0;
1c79356b 3322 }
2d21ac55 3323
39236c6e
A
3324 /*
3325 * If the socket is a TCP socket with message delivery
3326 * enabled, then create a control msg to deliver the
3327 * relative TCP sequence number for this data. Waiting
3328 * until this point will protect against failures to
3329 * allocate an mbuf for control msgs.
3330 */
3331 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3332 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3333 struct mbuf *seq_cm;
3334
3335 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3336 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3337 if (seq_cm == NULL) {
3338 /* unable to allocate a control mbuf */
3339 error = ENOBUFS;
3340 goto release;
3341 }
3342 *controlp = seq_cm;
3343 controlp = &seq_cm->m_next;
3344 }
3345
2d21ac55
A
3346 if (m != NULL) {
3347 if (!(flags & MSG_PEEK)) {
3348 /*
3349 * We get here because m points to an mbuf following
3350 * any MT_SONAME or MT_CONTROL mbufs which have been
3351 * processed above. In any case, m should be pointing
3352 * to the head of the mbuf chain, and the nextrecord
3353 * should be either NULL or equal to m->m_nextpkt.
3354 * See comments above about SB_LOCK.
3355 */
39236c6e
A
3356 if (m != so->so_rcv.sb_mb ||
3357 m->m_nextpkt != nextrecord) {
3358 panic("%s: post-control !sync so=%p m=%p "
3359 "nextrecord=%p\n", __func__, so, m,
3360 nextrecord);
3361 /* NOTREACHED */
3362 }
2d21ac55
A
3363 if (nextrecord == NULL)
3364 so->so_rcv.sb_lastrecord = m;
3365 }
1c79356b
A
3366 type = m->m_type;
3367 if (type == MT_OOBDATA)
3368 flags |= MSG_OOB;
2d21ac55
A
3369 } else {
3370 if (!(flags & MSG_PEEK)) {
2d21ac55
A
3371 SB_EMPTY_FIXUP(&so->so_rcv);
3372 }
1c79356b 3373 }
2d21ac55
A
3374 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3375 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3376
1c79356b
A
3377 moff = 0;
3378 offset = 0;
fa4905b1 3379
91447636 3380 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2d21ac55 3381 can_delay = 1;
55e303ae 3382 else
2d21ac55 3383 can_delay = 0;
55e303ae
A
3384
3385 need_event = 0;
fa4905b1 3386
39236c6e
A
3387 while (m != NULL &&
3388 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
1c79356b
A
3389 if (m->m_type == MT_OOBDATA) {
3390 if (type != MT_OOBDATA)
3391 break;
2d21ac55 3392 } else if (type == MT_OOBDATA) {
1c79356b 3393 break;
2d21ac55 3394 }
9bccf70c 3395 /*
2d21ac55 3396 * Make sure to always set MSG_OOB event when getting
9bccf70c
A
3397 * out of band data inline.
3398 */
1c79356b 3399 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2d21ac55
A
3400 (so->so_options & SO_OOBINLINE) != 0 &&
3401 (so->so_state & SS_RCVATMARK) != 0) {
9bccf70c
A
3402 flags |= MSG_OOB;
3403 }
1c79356b 3404 so->so_state &= ~SS_RCVATMARK;
91447636 3405 len = uio_resid(uio) - delayed_copy_len;
1c79356b
A
3406 if (so->so_oobmark && len > so->so_oobmark - offset)
3407 len = so->so_oobmark - offset;
3408 if (len > m->m_len - moff)
3409 len = m->m_len - moff;
3410 /*
3411 * If mp is set, just pass back the mbufs.
3412 * Otherwise copy them out via the uio, then free.
3413 * Sockbuf must be consistent here (points to current mbuf,
3414 * it points to next record) when we drop priority;
3415 * we must note any additions to the sockbuf when we
3416 * block interrupts again.
3417 */
39236c6e 3418 if (mp == NULL) {
2d21ac55
A
3419 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3420 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
55e303ae 3421 if (can_delay && len == m->m_len) {
2d21ac55 3422 /*
55e303ae
A
 3423	 * Only delay the copy if we're consuming the
 3424	 * mbuf and we're NOT in MSG_PEEK mode
 3425	 * and we have enough data to make it worthwhile
2d21ac55
A
 3426	 * to drop and retake the lock. can_delay
 3427	 * reflects the state of the two latter
 3428	 * constraints; moff should always be zero
 3429	 * in these cases.
55e303ae 3430 */
2d21ac55 3431 delayed_copy_len += len;
55e303ae 3432 } else {
2d21ac55
A
3433 if (delayed_copy_len) {
3434 error = sodelayed_copy(so, uio,
3435 &free_list, &delayed_copy_len);
55e303ae
A
3436
3437 if (error) {
55e303ae
A
3438 goto release;
3439 }
2d21ac55
A
3440 /*
3441 * can only get here if MSG_PEEK is not
3442 * set therefore, m should point at the
3443 * head of the rcv queue; if it doesn't,
3444 * it means something drastically
3445 * changed while we were out from behind
3446 * the lock in sodelayed_copy. perhaps
3447 * a RST on the stream. in any event,
3448 * the stream has been interrupted. it's
3449 * probably best just to return whatever
3450 * data we've moved and let the caller
3451 * sort it out...
3452 */
55e303ae 3453 if (m != so->so_rcv.sb_mb) {
2d21ac55 3454 break;
55e303ae
A
3455 }
3456 }
91447636 3457 socket_unlock(so, 0);
2d21ac55
A
3458 error = uiomove(mtod(m, caddr_t) + moff,
3459 (int)len, uio);
91447636 3460 socket_lock(so, 0);
55e303ae 3461
55e303ae 3462 if (error)
2d21ac55 3463 goto release;
55e303ae 3464 }
2d21ac55 3465 } else {
91447636 3466 uio_setresid(uio, (uio_resid(uio) - len));
2d21ac55 3467 }
1c79356b
A
3468 if (len == m->m_len - moff) {
3469 if (m->m_flags & M_EOR)
3470 flags |= MSG_EOR;
3471 if (flags & MSG_PEEK) {
3472 m = m->m_next;
3473 moff = 0;
3474 } else {
3475 nextrecord = m->m_nextpkt;
3476 sbfree(&so->so_rcv, m);
91447636 3477 m->m_nextpkt = NULL;
55e303ae 3478
39236c6e
A
3479 /*
3480 * If this packet is an unordered packet
3481 * (indicated by M_UNORDERED_DATA flag), remove
3482 * the additional bytes added to the
3483 * receive socket buffer size.
3484 */
3485 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3486 m->m_len &&
3487 (m->m_flags & M_UNORDERED_DATA) &&
3488 sbreserve(&so->so_rcv,
3489 so->so_rcv.sb_hiwat - m->m_len)) {
3490 if (so->so_msg_state->msg_uno_bytes >
3491 m->m_len) {
3492 so->so_msg_state->
3493 msg_uno_bytes -= m->m_len;
3494 } else {
3495 so->so_msg_state->
3496 msg_uno_bytes = 0;
3497 }
3498 m->m_flags &= ~M_UNORDERED_DATA;
3499 }
3500
3501 if (mp != NULL) {
1c79356b
A
3502 *mp = m;
3503 mp = &m->m_next;
3504 so->so_rcv.sb_mb = m = m->m_next;
39236c6e 3505 *mp = NULL;
1c79356b 3506 } else {
55e303ae 3507 if (free_list == NULL)
2d21ac55
A
3508 free_list = m;
3509 else
3510 ml->m_next = m;
3511 ml = m;
14353aa8 3512 so->so_rcv.sb_mb = m = m->m_next;
39236c6e 3513 ml->m_next = NULL;
1c79356b 3514 }
2d21ac55 3515 if (m != NULL) {
1c79356b 3516 m->m_nextpkt = nextrecord;
2d21ac55
A
3517 if (nextrecord == NULL)
3518 so->so_rcv.sb_lastrecord = m;
3519 } else {
3520 so->so_rcv.sb_mb = nextrecord;
3521 SB_EMPTY_FIXUP(&so->so_rcv);
3522 }
3523 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3524 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
1c79356b
A
3525 }
3526 } else {
2d21ac55 3527 if (flags & MSG_PEEK) {
1c79356b 3528 moff += len;
2d21ac55 3529 } else {
6d2010ae
A
3530 if (mp != NULL) {
3531 int copy_flag;
3532
3533 if (flags & MSG_DONTWAIT)
3534 copy_flag = M_DONTWAIT;
3535 else
3536 copy_flag = M_WAIT;
3537 *mp = m_copym(m, 0, len, copy_flag);
39236c6e
A
3538 /*
3539 * Failed to allocate an mbuf?
3540 * Adjust uio_resid back, it was
3541 * adjusted down by len bytes which
3542 * we didn't copy over.
3543 */
6d2010ae 3544 if (*mp == NULL) {
39236c6e
A
3545 uio_setresid(uio,
3546 (uio_resid(uio) + len));
6d2010ae
A
3547 break;
3548 }
3549 }
1c79356b
A
3550 m->m_data += len;
3551 m->m_len -= len;
3552 so->so_rcv.sb_cc -= len;
3553 }
3554 }
3555 if (so->so_oobmark) {
3556 if ((flags & MSG_PEEK) == 0) {
3557 so->so_oobmark -= len;
3558 if (so->so_oobmark == 0) {
2d21ac55
A
3559 so->so_state |= SS_RCVATMARK;
3560 /*
3561 * delay posting the actual event until
3562 * after any delayed copy processing
3563 * has finished
3564 */
3565 need_event = 1;
3566 break;
1c79356b
A
3567 }
3568 } else {
3569 offset += len;
3570 if (offset == so->so_oobmark)
3571 break;
3572 }
3573 }
2d21ac55 3574 if (flags & MSG_EOR)
1c79356b
A
3575 break;
3576 /*
2d21ac55
A
3577 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3578 * (for non-atomic socket), we must not quit until
3579 * "uio->uio_resid == 0" or an error termination.
3580 * If a signal/timeout occurs, return with a short
3581 * count but without error. Keep sockbuf locked
3582 * against other readers.
1c79356b 3583 */
39236c6e 3584 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
2d21ac55 3585 (uio_resid(uio) - delayed_copy_len) > 0 &&
1c79356b 3586 !sosendallatonce(so) && !nextrecord) {
fe8ab488
A
3587 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3588#if CONTENT_FILTER
3589 && cfil_sock_data_pending(&so->so_rcv) == 0
3590#endif /* CONTENT_FILTER */
3591 ))
2d21ac55 3592 goto release;
fa4905b1 3593
2d21ac55
A
3594 /*
3595 * Depending on the protocol (e.g. TCP), the following
3596 * might cause the socket lock to be dropped and later
3597 * be reacquired, and more data could have arrived and
3598 * have been appended to the receive socket buffer by
3599 * the time it returns. Therefore, we only sleep in
3600 * sbwait() below if and only if the socket buffer is
3601 * empty, in order to avoid a false sleep.
3602 */
3603 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3604 (((struct inpcb *)so->so_pcb)->inp_state !=
3605 INPCB_STATE_DEAD))
3606 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3607
3608 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3609 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3610
3611 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3612 error = 0;
55e303ae 3613 goto release;
fa4905b1 3614 }
55e303ae 3615 /*
2d21ac55
A
3616 * have to wait until after we get back from the sbwait
3617 * to do the copy because we will drop the lock if we
3618 * have enough data that has been delayed... by dropping
3619 * the lock we open up a window allowing the netisr
3620 * thread to process the incoming packets and to change
3621 * the state of this socket... we're issuing the sbwait
3622 * because the socket is empty and we're expecting the
3623 * netisr thread to wake us up when more packets arrive;
3624 * if we allow that processing to happen and then sbwait
3625 * we could stall forever with packets sitting in the
3626 * socket if no further packets arrive from the remote
3627 * side.
55e303ae 3628 *
2d21ac55
A
3629 * we want to copy before we've collected all the data
3630 * to satisfy this request to allow the copy to overlap
3631 * the incoming packet processing on an MP system
55e303ae 3632 */
2d21ac55
A
3633 if (delayed_copy_len > sorecvmincopy &&
3634 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3635 error = sodelayed_copy(so, uio,
3636 &free_list, &delayed_copy_len);
55e303ae
A
3637
3638 if (error)
2d21ac55 3639 goto release;
1c79356b
A
3640 }
3641 m = so->so_rcv.sb_mb;
39236c6e 3642 if (m != NULL) {
1c79356b 3643 nextrecord = m->m_nextpkt;
fa4905b1 3644 }
316670eb 3645 SB_MB_CHECK(&so->so_rcv);
1c79356b
A
3646 }
3647 }
91447636 3648#ifdef MORE_LOCKING_DEBUG
39236c6e
A
3649 if (so->so_usecount <= 1) {
3650 panic("%s: after big while so=%p ref=%d on socket\n",
3651 __func__, so, so->so_usecount);
3652 /* NOTREACHED */
3653 }
91447636 3654#endif
1c79356b 3655
39236c6e 3656 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2d21ac55 3657 if (so->so_options & SO_DONTTRUNC) {
1c79356b 3658 flags |= MSG_RCVMORE;
2d21ac55 3659 } else {
9bccf70c 3660 flags |= MSG_TRUNC;
1c79356b
A
3661 if ((flags & MSG_PEEK) == 0)
3662 (void) sbdroprecord(&so->so_rcv);
3663 }
3664 }
2d21ac55
A
3665
3666 /*
3667 * pru_rcvd below (for TCP) may cause more data to be received
3668 * if the socket lock is dropped prior to sending the ACK; some
3669 * legacy OpenTransport applications don't handle this well
3670 * (if it receives less data than requested while MSG_HAVEMORE
3671 * is set), and so we set the flag now based on what we know
3672 * prior to calling pru_rcvd.
3673 */
3674 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3675 flags |= MSG_HAVEMORE;
3676
1c79356b 3677 if ((flags & MSG_PEEK) == 0) {
39236c6e 3678 if (m == NULL) {
1c79356b 3679 so->so_rcv.sb_mb = nextrecord;
2d21ac55
A
3680 /*
3681 * First part is an inline SB_EMPTY_FIXUP(). Second
3682 * part makes sure sb_lastrecord is up-to-date if
3683 * there is still data in the socket buffer.
3684 */
3685 if (so->so_rcv.sb_mb == NULL) {
3686 so->so_rcv.sb_mbtail = NULL;
3687 so->so_rcv.sb_lastrecord = NULL;
3688 } else if (nextrecord->m_nextpkt == NULL) {
3689 so->so_rcv.sb_lastrecord = nextrecord;
3690 }
316670eb 3691 SB_MB_CHECK(&so->so_rcv);
2d21ac55
A
3692 }
3693 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3694 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
1c79356b
A
3695 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3696 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3697 }
39236c6e 3698
55e303ae 3699 if (delayed_copy_len) {
91447636 3700 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
55e303ae 3701 if (error)
2d21ac55 3702 goto release;
55e303ae 3703 }
39236c6e
A
3704 if (free_list != NULL) {
3705 m_freem_list(free_list);
3706 free_list = NULL;
55e303ae
A
3707 }
3708 if (need_event)
2d21ac55 3709 postevent(so, 0, EV_OOB);
39236c6e 3710
91447636 3711 if (orig_resid == uio_resid(uio) && orig_resid &&
1c79356b 3712 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
39236c6e 3713 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
1c79356b
A
3714 goto restart;
3715 }
3716
39236c6e 3717 if (flagsp != NULL)
1c79356b
A
3718 *flagsp |= flags;
3719release:
91447636 3720#ifdef MORE_LOCKING_DEBUG
39236c6e
A
3721 if (so->so_usecount <= 1) {
3722 panic("%s: release so=%p ref=%d on socket\n", __func__,
2d21ac55 3723 so, so->so_usecount);
39236c6e
A
3724 /* NOTREACHED */
3725 }
91447636 3726#endif
39236c6e 3727 if (delayed_copy_len)
2d21ac55 3728 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
1c79356b 3729
39236c6e
A
3730 if (free_list != NULL)
3731 m_freem_list(free_list);
3732
3733 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3734
3e170ce0
A
3735 if (en_tracing) {
3736 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3737 VM_KERNEL_ADDRPERM(so),
3738 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3739 (int64_t)(orig_resid - uio_resid(uio)));
3740 }
2d21ac55
A
3741 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3742 so->so_rcv.sb_cc, 0, error);
1c79356b
A
3743
3744 return (error);
3745}
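
/*
 * Illustrative sketch, not part of the original source: the
 * MSG_WAITALL/MSG_WAITSTREAM handling above means a caller can get a
 * short count with no error when a signal or timeout interrupts the
 * wait.  From user space, assuming "fd" is a connected TCP socket:
 *
 *	char buf[4096];
 *	ssize_t n = recv(fd, buf, sizeof (buf), MSG_WAITALL);
 *	if (n == -1)
 *		err(1, "recv");			// hard error from soreceive()
 *	else if (n < (ssize_t)sizeof (buf))
 *		handle_partial(buf, (size_t)n);	// hypothetical helper:
 *						// signal/timeout or EOF
 */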

/*
 * Returns:	0			Success
 *		uiomove:EFAULT
 */
static int
sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
    user_ssize_t *resid)
{
	int error = 0;
	struct mbuf *m;

	m = *free_list;

	socket_unlock(so, 0);

	while (m != NULL && error == 0) {
		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
		m = m->m_next;
	}
	m_freem_list(*free_list);

	*free_list = NULL;
	*resid = 0;

	socket_lock(so, 0);

	return (error);
}
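
/*
 * A sketch, not original code: sodelayed_copy() is the
 * unlock-copy-relock pattern used throughout this file.  uiomove() may
 * fault and sleep, so the socket lock cannot be held across it; any
 * caller that keeps pointers into the receive queue must revalidate
 * them once the lock is retaken, as soreceive() does:
 *
 *	error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
 *	if (error)
 *		goto release;
 *	if (m != so->so_rcv.sb_mb)
 *		break;	// queue changed while unlocked; stop and resync
 */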

static int
sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
    u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
{
#pragma unused(so)
	int error = 0;
	struct mbuf *ml, *m;
	int i = 0;
	struct uio *auio;

	for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
	    ml = ml->m_nextpkt, i++) {
		auio = msgarray[i].uio;
		for (m = ml; m != NULL; m = m->m_next) {
			error = uiomove(mtod(m, caddr_t), m->m_len, auio);
			if (error != 0)
				goto out;
		}
	}
out:
	m_freem_list(*free_list);

	*free_list = NULL;
	*resid = 0;

	return (error);
}
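
/*
 * A sketch, not original code: both delayed-copy helpers hand the
 * consumed packets back with a single m_freem_list() call, the batched
 * equivalent of freeing one record at a time:
 *
 *	struct mbuf *pkt, *next;
 *	for (pkt = free_list; pkt != NULL; pkt = next) {
 *		next = pkt->m_nextpkt;
 *		m_freem(pkt);		// one allocator round-trip per packet
 *	}
 *
 * m_freem_list() amortizes that cost over the whole list.
 */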

int
soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
    int *flagsp)
{
	struct mbuf *m;
	struct mbuf *nextrecord;
	struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
	int error;
	user_ssize_t len, pktlen, delayed_copy_len = 0;
	struct protosw *pr = so->so_proto;
	user_ssize_t resid;
	struct proc *p = current_proc();
	struct uio *auio = NULL;
	int npkts = 0;
	int sblocked = 0;
	struct sockaddr **psa = NULL;
	struct mbuf **controlp = NULL;
	int can_delay;
	int flags;
	struct mbuf *free_others = NULL;

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
	    so, uiocnt,
	    so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);

	/*
	 * Sanity checks:
	 * - Only the non-waiting subset of flags is supported
	 * - Only supports datagram sockets (could be extended to raw)
	 * - Must be atomic
	 * - Protocol must support packet chains
	 * - The uio array must not be NULL (should we panic?)
	 */
	if (flagsp != NULL)
		flags = *flagsp;
	else
		flags = 0;
	if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
	    MSG_NBIO)) {
		printf("%s invalid flags 0x%x\n", __func__, flags);
		error = EINVAL;
		goto out;
	}
	if (so->so_type != SOCK_DGRAM) {
		error = EINVAL;
		goto out;
	}
	if (sosendallatonce(so) == 0) {
		error = EINVAL;
		goto out;
	}
	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
		error = EPROTONOSUPPORT;
		goto out;
	}
	if (msgarray == NULL) {
		printf("%s msgarray is NULL\n", __func__);
		error = EINVAL;
		goto out;
	}
	if (uiocnt == 0) {
		printf("%s uiocnt is 0\n", __func__);
		error = EINVAL;
		goto out;
	}
	/*
	 * Sanity check on the length passed by caller as we are making 'int'
	 * comparisons
	 */
	resid = recv_msg_array_resid(msgarray, uiocnt);
	if (resid < 0 || resid > INT_MAX) {
		error = EINVAL;
		goto out;
	}

	if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
		can_delay = 1;
	else
		can_delay = 0;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
	so_update_policy(so);

#if NECP
	so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */

	/*
	 * If a recv attempt is made on a previously-accepted socket
	 * that has been marked as inactive (disconnected), reject
	 * the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		struct sockbuf *sb = &so->so_rcv;

		error = ENOTCONN;
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error);
		/*
		 * This socket should have been disconnected and flushed
		 * prior to being returned from sodefunct(); there should
		 * be no data on its receive list, so panic otherwise.
		 */
		if (so->so_state & SS_DEFUNCT)
			sb_empty_assert(sb, __func__);
		goto release;
	}

next:
	/*
	 * The uio may be empty
	 */
	if (npkts >= uiocnt) {
		error = 0;
		goto release;
	}
restart:
	/*
	 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
	 * and if so just return to the caller.  This could happen when
	 * soreceive() is called by a socket upcall function during the
	 * time the socket is freed.  The socket buffer would have been
	 * locked across the upcall, therefore we cannot put this thread
	 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
	 * we may livelock), because the lock on the socket buffer will
	 * only be released when the upcall routine returns to its caller.
	 * Because the socket has been officially closed, there can be
	 * no further read on it.
	 */
	if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
	    (SS_NOFDREF | SS_CANTRCVMORE)) {
		error = 0;
		goto release;
	}

	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		goto release;
	}
	sblocked = 1;

	m = so->so_rcv.sb_mb;
	/*
	 * Block awaiting more datagram if needed
	 */
	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
	    (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
	    ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
		/*
		 * Panic if we notice inconsistencies in the socket's
		 * receive list; both sb_mb and sb_cc should correctly
		 * reflect the contents of the list, otherwise we may
		 * end up with false positives during select() or poll()
		 * which could put the application in a bad state.
		 */
		SB_MB_CHECK(&so->so_rcv);

		if (so->so_error) {
			error = so->so_error;
			if ((flags & MSG_PEEK) == 0)
				so->so_error = 0;
			goto release;
		}
		if (so->so_state & SS_CANTRCVMORE) {
			goto release;
		}
		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
			error = ENOTCONN;
			goto release;
		}
		if ((so->so_state & SS_NBIO) ||
		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
			error = EWOULDBLOCK;
			goto release;
		}
		/*
		 * Do not block if we got some data
		 */
		if (free_list != NULL) {
			error = 0;
			goto release;
		}

		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
		sblocked = 0;

		error = sbwait(&so->so_rcv);
		if (error) {
			goto release;
		}
		goto restart;
	}

	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");

	/*
	 * Consume the current uio index as we have a datagram
	 */
	auio = msgarray[npkts].uio;
	resid = uio_resid(auio);
	msgarray[npkts].which |= SOCK_MSG_DATA;
	psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
	    &msgarray[npkts].psa : NULL;
	controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
	    &msgarray[npkts].controlp : NULL;
	npkts += 1;
	nextrecord = m->m_nextpkt;

	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
		error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
		if (error == ERESTART)
			goto restart;
		else if (error != 0)
			goto release;
	}

	if (m != NULL && m->m_type == MT_CONTROL) {
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
		if (error != 0)
			goto release;
	}

	if (m->m_pkthdr.len == 0) {
		printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
		    __func__, __LINE__,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
		    m->m_type);
	}

	/*
	 * Loop to copy the mbufs of the current record
	 * Support zero length packets
	 */
	ml = NULL;
	pktlen = 0;
	while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
		if (m->m_len == 0)
			panic("%p m_len zero", m);
		if (m->m_type == 0)
			panic("%p m_type zero", m);
		/*
		 * Clip to the residual length
		 */
		if (len > m->m_len)
			len = m->m_len;
		pktlen += len;
		/*
		 * Copy the mbufs via the uio or delay the copy
		 * Sockbuf must be consistent here (points to current mbuf,
		 * it points to next record) when we drop priority;
		 * we must note any additions to the sockbuf when we
		 * block interrupts again.
		 */
		if (len > 0 && can_delay == 0) {
			socket_unlock(so, 0);
			error = uiomove(mtod(m, caddr_t), (int)len, auio);
			socket_lock(so, 0);
			if (error)
				goto release;
		} else {
			delayed_copy_len += len;
		}

		if (len == m->m_len) {
			/*
			 * m was entirely copied
			 */
			sbfree(&so->so_rcv, m);
			nextrecord = m->m_nextpkt;
			m->m_nextpkt = NULL;

			/*
			 * Set the first packet to the head of the free list
			 */
			if (free_list == NULL)
				free_list = m;
			/*
			 * Link current packet to tail of free list
			 */
			if (ml == NULL) {
				if (free_tail != NULL)
					free_tail->m_nextpkt = m;
				free_tail = m;
			}
			/*
			 * Link current mbuf to last mbuf of current packet
			 */
			if (ml != NULL)
				ml->m_next = m;
			ml = m;

			/*
			 * Move next buf to head of socket buffer
			 */
			so->so_rcv.sb_mb = m = ml->m_next;
			ml->m_next = NULL;

			if (m != NULL) {
				m->m_nextpkt = nextrecord;
				if (nextrecord == NULL)
					so->so_rcv.sb_lastrecord = m;
			} else {
				so->so_rcv.sb_mb = nextrecord;
				SB_EMPTY_FIXUP(&so->so_rcv);
			}
			SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
		} else {
			/*
			 * Stop the loop on partial copy
			 */
			break;
		}
	}
#ifdef MORE_LOCKING_DEBUG
	if (so->so_usecount <= 1) {
		panic("%s: after big while so=%llx ref=%d on socket\n",
		    __func__,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
		/* NOTREACHED */
	}
#endif
	/*
	 * Tell the caller we made a partial copy
	 */
	if (m != NULL) {
		if (so->so_options & SO_DONTTRUNC) {
			/*
			 * Copyout first the freelist then the partial mbuf
			 */
			socket_unlock(so, 0);
			if (delayed_copy_len)
				error = sodelayed_copy_list(so, msgarray,
				    uiocnt, &free_list, &delayed_copy_len);

			if (error == 0) {
				error = uiomove(mtod(m, caddr_t), (int)len,
				    auio);
			}
			socket_lock(so, 0);
			if (error)
				goto release;

			m->m_data += len;
			m->m_len -= len;
			so->so_rcv.sb_cc -= len;
			flags |= MSG_RCVMORE;
		} else {
			(void) sbdroprecord(&so->so_rcv);
			nextrecord = so->so_rcv.sb_mb;
			m = NULL;
			flags |= MSG_TRUNC;
		}
	}

	if (m == NULL) {
		so->so_rcv.sb_mb = nextrecord;
		/*
		 * First part is an inline SB_EMPTY_FIXUP().  Second
		 * part makes sure sb_lastrecord is up-to-date if
		 * there is still data in the socket buffer.
		 */
		if (so->so_rcv.sb_mb == NULL) {
			so->so_rcv.sb_mbtail = NULL;
			so->so_rcv.sb_lastrecord = NULL;
		} else if (nextrecord->m_nextpkt == NULL) {
			so->so_rcv.sb_lastrecord = nextrecord;
		}
		SB_MB_CHECK(&so->so_rcv);
	}
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");

	/*
	 * We can continue to the next packet as long as:
	 * - We haven't exhausted the uio array
	 * - There was no error
	 * - A packet was not truncated
	 * - We can still receive more data
	 */
	if (npkts < uiocnt && error == 0 &&
	    (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
	    (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
		sblocked = 0;

		goto next;
	}
	if (flagsp != NULL)
		*flagsp |= flags;

release:
	/*
	 * pru_rcvd may cause more data to be received if the socket lock
	 * is dropped, so we set MSG_HAVEMORE now based on what we know.
	 * That way the caller won't be surprised if it receives less data
	 * than requested.
	 */
	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
		flags |= MSG_HAVEMORE;

	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
		(*pr->pr_usrreqs->pru_rcvd)(so, flags);

	if (sblocked)
		sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
	else
		socket_unlock(so, 1);

	if (delayed_copy_len)
		error = sodelayed_copy_list(so, msgarray, uiocnt,
		    &free_list, &delayed_copy_len);
out:
	/*
	 * Amortize the cost of freeing the mbufs
	 */
	if (free_list != NULL)
		m_freem_list(free_list);
	if (free_others != NULL)
		m_freem_list(free_others);

	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
	    0, 0, 0, 0);
	return (error);
}
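
/*
 * Illustrative sketch, not part of the original source: each
 * recv_msg_elem consumed by soreceive_list() supplies one uio plus
 * optional address/control slots.  A kernel-side caller might fill the
 * array like this (the buffer sizes and "bufs" array are hypothetical):
 *
 *	struct recv_msg_elem msgs[8];
 *	int i, flags = MSG_DONTWAIT;
 *
 *	bzero(msgs, sizeof (msgs));
 *	for (i = 0; i < 8; i++) {
 *		msgs[i].uio = uio_create(1, 0, UIO_SYSSPACE, UIO_READ);
 *		uio_addiov(msgs[i].uio, (user_addr_t)bufs[i], 2048);
 *		msgs[i].which = SOCK_MSG_SA;	// also capture source address
 *	}
 *	error = soreceive_list(so, msgs, 8, &flags);
 */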

/*
 * Returns:	0			Success
 *		EINVAL
 *		ENOTCONN
 *	<pru_shutdown>:EINVAL
 *	<pru_shutdown>:EADDRNOTAVAIL[TCP]
 *	<pru_shutdown>:ENOBUFS[TCP]
 *	<pru_shutdown>:EMSGSIZE[TCP]
 *	<pru_shutdown>:EHOSTUNREACH[TCP]
 *	<pru_shutdown>:ENETUNREACH[TCP]
 *	<pru_shutdown>:ENETDOWN[TCP]
 *	<pru_shutdown>:ENOMEM[TCP]
 *	<pru_shutdown>:EACCES[TCP]
 *	<pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
 *	<pru_shutdown>:??? [other protocol families]
 */
int
soshutdown(struct socket *so, int how)
{
	int error;

	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);

	switch (how) {
	case SHUT_RD:
	case SHUT_WR:
	case SHUT_RDWR:
		socket_lock(so, 1);
		if ((so->so_state &
		    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
			error = ENOTCONN;
		} else {
			error = soshutdownlock(so, how);
		}
		socket_unlock(so, 1);
		break;
	default:
		error = EINVAL;
		break;
	}

	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);

	return (error);
}
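
/*
 * Illustrative sketch, not part of the original source: soshutdown()
 * backs shutdown(2), and ENOTCONN is returned both for a socket that
 * was never connected (checked above) and for a direction already shut
 * down (checked in soshutdownlock_final() below), so user code usually
 * treats it as benign:
 *
 *	if (shutdown(fd, SHUT_WR) == -1 && errno != ENOTCONN)
 *		err(1, "shutdown");	// a real failure, not a benign race
 */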

int
soshutdownlock_final(struct socket *so, int how)
{
	struct protosw *pr = so->so_proto;
	int error = 0;

	sflt_notify(so, sock_evt_shutdown, &how);

	if (how != SHUT_WR) {
		if ((so->so_state & SS_CANTRCVMORE) != 0) {
			/* read already shut down */
			error = ENOTCONN;
			goto done;
		}
		sorflush(so);
		postevent(so, 0, EV_RCLOSED);
	}
	if (how != SHUT_RD) {
		if ((so->so_state & SS_CANTSENDMORE) != 0) {
			/* write already shut down */
			error = ENOTCONN;
			goto done;
		}
		error = (*pr->pr_usrreqs->pru_shutdown)(so);
		postevent(so, 0, EV_WCLOSED);
	}
done:
	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
	return (error);
}

int
soshutdownlock(struct socket *so, int how)
{
	int error = 0;

#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		error = cfil_sock_shutdown(so, &how);
		if (error == EJUSTRETURN) {
			error = 0;
			goto done;
		} else if (error != 0) {
			goto done;
		}
	}
#endif /* CONTENT_FILTER */

	error = soshutdownlock_final(so, how);

done:
	return (error);
}
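
/*
 * A sketch, not original code: cfil_sock_shutdown() may rewrite "how"
 * or return EJUSTRETURN to absorb the shutdown while a content filter
 * drains pending data; the wrapper above maps EJUSTRETURN to success.
 * The socket-filter entry points in the sockopt paths below use the
 * same convention:
 *
 *	error = sflt_setsockopt(so, sopt);
 *	if (error == EJUSTRETURN)
 *		error = 0;	// a filter fully handled the request
 */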

void
sowflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_snd;

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
	sb->sb_flags |= SB_DROP;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;

	sbunlock(sb, TRUE);	/* keep socket locked */

	selthreadclear(&sb->sb_sel);
	sbrelease(sb);
}

void
sorflush(struct socket *so)
{
	struct sockbuf *sb = &so->so_rcv;
	struct protosw *pr = so->so_proto;
	struct sockbuf asb;
#ifdef notyet
	lck_mtx_t *mutex_held;
	/*
	 * XXX: This code is currently commented out, because we may get here
	 * as part of sofreelastref(), and at that time, pr_getlock() may no
	 * longer be able to return us the lock; this will be fixed in future.
	 */
	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;

	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif /* notyet */

	sflt_notify(so, sock_evt_flush_read, NULL);

	socantrcvmore(so);

	/*
	 * Obtain lock on the socket buffer (SB_LOCK).  This is required
	 * to prevent the socket buffer from being unexpectedly altered
	 * while it is used by another thread in socket send/receive.
	 *
	 * sblock() must not fail here, hence the assertion.
	 */
	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
	VERIFY(sb->sb_flags & SB_LOCK);

	/*
	 * Copy only the relevant fields from "sb" to "asb" which we
	 * need for sbrelease() to function.  In particular, skip
	 * sb_sel as it contains the wait queue linkage, which would
	 * wreak havoc if we were to issue selthreadclear() on "asb".
	 * Make sure to not carry over SB_LOCK in "asb", as we need
	 * to acquire it later as part of sbrelease().
	 */
	bzero(&asb, sizeof (asb));
	asb.sb_cc = sb->sb_cc;
	asb.sb_hiwat = sb->sb_hiwat;
	asb.sb_mbcnt = sb->sb_mbcnt;
	asb.sb_mbmax = sb->sb_mbmax;
	asb.sb_ctl = sb->sb_ctl;
	asb.sb_lowat = sb->sb_lowat;
	asb.sb_mb = sb->sb_mb;
	asb.sb_mbtail = sb->sb_mbtail;
	asb.sb_lastrecord = sb->sb_lastrecord;
	asb.sb_so = sb->sb_so;
	asb.sb_flags = sb->sb_flags;
	asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
	asb.sb_flags |= SB_DROP;

	/*
	 * Ideally we'd bzero() these and preserve the ones we need;
	 * but to do that we'd need to shuffle things around in the
	 * sockbuf, and we can't do it now because there are KEXTS
	 * that are directly referring to the socket structure.
	 *
	 * Setting SB_DROP acts as a barrier to prevent further appends.
	 * Clearing SB_SEL is done for selthreadclear() below.
	 */
	sb->sb_cc = 0;
	sb->sb_hiwat = 0;
	sb->sb_mbcnt = 0;
	sb->sb_mbmax = 0;
	sb->sb_ctl = 0;
	sb->sb_lowat = 0;
	sb->sb_mb = NULL;
	sb->sb_mbtail = NULL;
	sb->sb_lastrecord = NULL;
	sb->sb_timeo.tv_sec = 0;
	sb->sb_timeo.tv_usec = 0;
	sb->sb_upcall = NULL;
	sb->sb_upcallarg = NULL;
	sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
	sb->sb_flags |= SB_DROP;

	sbunlock(sb, TRUE);	/* keep socket locked */

	/*
	 * Note that selthreadclear() is called on the original "sb" and
	 * not the local "asb" because of the way wait queue linkage is
	 * implemented.  Given that selwakeup() may be triggered, SB_SEL
	 * should no longer be set (cleared above.)
	 */
	selthreadclear(&sb->sb_sel);

	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
		(*pr->pr_domain->dom_dispose)(asb.sb_mb);

	sbrelease(&asb);
}
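
/*
 * A sketch, not original code: sorflush() snapshots the receive queue
 * into the local "asb" so the protocol's dispose hook and sbrelease()
 * operate on a private copy after the live sockbuf has been zeroed and
 * marked SB_DROP.  For PR_RIGHTS protocols (e.g. UNIX domain sockets)
 * the dispose call is what reclaims in-flight file descriptors; in the
 * AF_UNIX case dom_dispose would resolve to unp_dispose():
 *
 *	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
 *		(*pr->pr_domain->dom_dispose)(asb.sb_mb);
 */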

/*
 * Perhaps this routine, and sooptcopyout(), below, ought to come in
 * an additional variant to handle the case where the option value needs
 * to be some kind of integer, but not a specific size.
 * In addition to their use here, these functions are also called by the
 * protocol-level pr_ctloutput() routines.
 *
 * Returns:	0			Success
 *		EINVAL
 *	copyin:EFAULT
 */
int
sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
{
	size_t valsize;

	/*
	 * If the user gives us more than we wanted, we ignore it,
	 * but if we don't get the minimum length the caller
	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
	 * is set to however much we actually retrieved.
	 */
	if ((valsize = sopt->sopt_valsize) < minlen)
		return (EINVAL);
	if (valsize > len)
		sopt->sopt_valsize = valsize = len;

	if (sopt->sopt_p != kernproc)
		return (copyin(sopt->sopt_val, buf, valsize));

	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
	return (0);
}
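
/*
 * Illustrative sketch, not part of the original source: a typical
 * consumer of sooptcopyin() is a protocol pr_ctloutput() handler
 * validating a fixed-size integer option.  The option name and pcb
 * field below are hypothetical:
 *
 *	case MYPROTO_FOO:
 *		error = sooptcopyin(sopt, &optval, sizeof (optval),
 *		    sizeof (optval));
 *		if (error != 0)
 *			break;
 *		if (optval < 0) {
 *			error = EINVAL;
 *			break;
 *		}
 *		inp->inp_foo = optval;	// hypothetical field
 *		break;
 */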

/*
 * sooptcopyin_timeval
 *	Copy in a timeval value into tv_p, and take into account whether
 *	the calling process is 64-bit or 32-bit.  Moved the sanity checking
 *	code here so that we can verify the 64-bit tv_sec value before we
 *	lose the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
 */
static int
sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
{
	int error;

	if (proc_is64bit(sopt->sopt_p)) {
		struct user64_timeval tv64;

		if (sopt->sopt_valsize < sizeof (tv64))
			return (EINVAL);

		sopt->sopt_valsize = sizeof (tv64);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
			if (error != 0)
				return (error);
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
			    sizeof (tv64));
		}
		if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
		    tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
			return (EDOM);

		tv_p->tv_sec = tv64.tv_sec;
		tv_p->tv_usec = tv64.tv_usec;
	} else {
		struct user32_timeval tv32;

		if (sopt->sopt_valsize < sizeof (tv32))
			return (EINVAL);

		sopt->sopt_valsize = sizeof (tv32);
		if (sopt->sopt_p != kernproc) {
			error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
			if (error != 0) {
				return (error);
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
			    sizeof (tv32));
		}
#ifndef __LP64__
		/*
		 * K64todo "comparison is always false due to
		 * limited range of data type"
		 */
		if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
		    tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
			return (EDOM);
#endif
		tv_p->tv_sec = tv32.tv_sec;
		tv_p->tv_usec = tv32.tv_usec;
	}
	return (0);
}
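
/*
 * Illustrative sketch, not part of the original source: the EDOM range
 * check above is what a user-space caller trips when passing a
 * malformed timeout:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 1000000 };
 *	// tv_usec outside [0, 999999]: fails with errno == EDOM
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)) == -1)
 *		warn("setsockopt(SO_RCVTIMEO)");
 */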

static int
soopt_cred_check(struct socket *so, int priv)
{
	kauth_cred_t cred = NULL;
	proc_t ep = PROC_NULL;
	int error;

	if (so->so_flags & SOF_DELEGATED) {
		ep = proc_find(so->e_pid);
		if (ep)
			cred = kauth_cred_proc_ref(ep);
	}
	error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
	if (cred)
		kauth_cred_unref(&cred);
	if (ep != PROC_NULL)
		proc_rele(ep);

	return (error);
}
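
/*
 * A sketch, not original code: for delegated sockets the privilege
 * check above runs against the effective process (so->e_pid), so a
 * process acting on another's behalf is judged by the delegate's
 * credentials.  Callers use it as a simple gate, as in the
 * SO_AWDL_UNRESTRICTED case below:
 *
 *	error = soopt_cred_check(so, PRIV_NET_RESTRICTED_AWDL);
 *	if (error == 0)
 *		inp_set_awdl_unrestricted(sotoinpcb(so));
 */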

/*
 * Returns:	0			Success
 *		EINVAL
 *		ENOPROTOOPT
 *		ENOBUFS
 *		EDOM
 *	sooptcopyin:EINVAL
 *	sooptcopyin:EFAULT
 *	sooptcopyin_timeval:EINVAL
 *	sooptcopyin_timeval:EFAULT
 *	sooptcopyin_timeval:EDOM
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	sflt_attach_private:???	[whatever a filter author chooses]
 *	<sf_setoption>:???	[whatever a filter author chooses]
 *
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family;
 *		all <sf_setoption> returns depend on what the filter author
 *		causes their filter to return.
 */
int
sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#if CONFIG_MACF_SOCKET
	struct mac extmac;
#endif /* MAC_SOCKET */

	if (sopt->sopt_dir != SOPT_SET)
		sopt->sopt_dir = SOPT_SET;

	if (dolock)
		socket_lock(so, 1);

	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
		/* the socket has been shutdown, no more sockopt's */
		error = EINVAL;
		goto out;
	}

	error = sflt_setsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0)
			goto out;

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
			if (error != 0)
				goto out;

			so->so_linger = (sopt->sopt_name == SO_LINGER) ?
			    l.l_linger : l.l_linger * hz;
			if (l.l_onoff != 0)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto out;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF: {
				struct sockbuf *sb =
				    (sopt->sopt_name == SO_SNDBUF) ?
				    &so->so_snd : &so->so_rcv;
				if (sbreserve(sb, (u_int32_t)optval) == 0) {
					error = ENOBUFS;
					goto out;
				}
				sb->sb_flags |= SB_USRSIZE;
				sb->sb_flags &= ~SB_AUTOSIZE;
				sb->sb_idealsize = (u_int32_t)optval;
				break;
			}
			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT: {
				int space = sbspace(&so->so_snd);
				u_int32_t hiwat = so->so_snd.sb_hiwat;

				if (so->so_snd.sb_flags & SB_UNIX) {
					struct unpcb *unp =
					    (struct unpcb *)(so->so_pcb);
					if (unp != NULL &&
					    unp->unp_conn != NULL) {
						hiwat += unp->unp_conn->unp_cc;
					}
				}

				so->so_snd.sb_lowat =
				    (optval > hiwat) ?
				    hiwat : optval;

				if (space >= so->so_snd.sb_lowat) {
					sowwakeup(so);
				}
				break;
			}
			case SO_RCVLOWAT: {
				int64_t data_len;
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				data_len = so->so_rcv.sb_cc
				    - so->so_rcv.sb_ctl;
				if (data_len >= so->so_rcv.sb_lowat)
					sorwakeup(so);
				break;
			}
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin_timeval(sopt, &tv);
			if (error != 0)
				goto out;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = tv;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = tv;
				break;
			}
			break;

		case SO_NKE: {
			struct so_nke nke;

			error = sooptcopyin(sopt, &nke, sizeof (nke),
			    sizeof (nke));
			if (error != 0)
				goto out;

			error = sflt_attach_internal(so, nke.nke_handle);
			break;
		}

		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOSIGPIPE;
			else
				so->so_flags &= ~SOF_NOSIGPIPE;
			break;

		case SO_NOADDRERR:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOADDRAVAIL;
			else
				so->so_flags &= ~SOF_NOADDRAVAIL;
			break;

		case SO_REUSESHAREUID:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_REUSESHAREUID;
			else
				so->so_flags &= ~SOF_REUSESHAREUID;
			break;

		case SO_NOTIFYCONFLICT:
			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOTIFYCONFLICT;
			else
				so->so_flags &= ~SOF_NOTIFYCONFLICT;
			break;

		case SO_RESTRICTIONS:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;

			error = so_set_restrictions(so, optval);
			break;

		case SO_AWDL_UNRESTRICTED:
			if (SOCK_DOM(so) != PF_INET &&
			    SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0) {
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_AWDL);
				if (error == 0)
					inp_set_awdl_unrestricted(
					    sotoinpcb(so));
			} else
				inp_clear_awdl_unrestricted(sotoinpcb(so));
			break;
		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0) {
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_INTCOPROC);
				if (error == 0)
					inp_set_intcoproc_allowed(
					    sotoinpcb(so));
			} else
				inp_clear_intcoproc_allowed(sotoinpcb(so));
			break;

		case SO_LABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0)
				goto out;

			error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif /* MAC_SOCKET */
			break;

		case SO_UPCALLCLOSEWAIT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_UPCALLCLOSEWAIT;
			else
				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
			break;

		case SO_RANDOMPORT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_BINDRANDOMPORT;
			else
				so->so_flags &= ~SOF_BINDRANDOMPORT;
			break;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
			    sizeof (sonpx));
			if (error != 0)
				goto out;
			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
				error = EINVAL;
				goto out;
			}
			/*
			 * Only one bit defined for now
			 */
			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
				if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
					so->so_flags |= SOF_NPX_SETOPTSHUT;
				else
					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
			}
			break;
		}

		case SO_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
				error = so_set_net_service_type(so, netsvc);
				goto out;
			}
			error = so_set_traffic_class(so, optval);
			if (error != 0)
				goto out;
			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
			break;
		}

		case SO_RECV_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
			else
				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
			break;
		}

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG: {
			struct so_tcdbg so_tcdbg;

			error = sooptcopyin(sopt, &so_tcdbg,
			    sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
			if (error != 0)
				goto out;
			error = so_set_tcdbg(so, &so_tcdbg);
			if (error != 0)
				goto out;
			break;
		}
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
			if (error != 0)
				goto out;
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
			else
				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
			break;

		case SO_DEFUNCTOK:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
				if (error == 0)
					error = EBADF;
				goto out;
			}
			/*
			 * Any process can set SO_DEFUNCTOK (clear
			 * SOF_NODEFUNCT), but only root can clear
			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
			 */
			if (optval == 0 &&
			    kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			if (optval)
				so->so_flags &= ~SOF_NODEFUNCT;
			else
				so->so_flags |= SOF_NODEFUNCT;

			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				char s[MAX_IPv6_STR_LEN];
				char d[MAX_IPv6_STR_LEN];
				struct inpcb *inp = sotoinpcb(so);

				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
				    "[%s %s:%d -> %s:%d] is now marked "
				    "as %seligible for "
				    "defunct\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    (SOCK_TYPE(so) == SOCK_STREAM) ?
				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
				    ((SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_laddr.s_addr :
				    (void *)&inp->in6p_laddr), s, sizeof (s)),
				    ntohs(inp->in6p_lport),
				    inet_ntop(SOCK_DOM(so),
				    (SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_faddr.s_addr :
				    (void *)&inp->in6p_faddr, d, sizeof (d)),
				    ntohs(inp->in6p_fport),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			} else {
				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
				    "is now marked as %seligible for "
				    "defunct\n",
				    __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    SOCK_DOM(so), SOCK_TYPE(so),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			}
			break;

		case SO_ISDEFUNCT:
			/* This option is not settable */
			error = EINVAL;
			break;

		case SO_OPPORTUNISTIC:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error == 0)
				error = so_set_opportunistic(so, optval);
			break;

		case SO_FLUSH:
			/* This option is handled by lower layer(s) */
			error = 0;
			break;

		case SO_RECV_ANYIF:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error == 0)
				error = so_set_recv_anyif(so, optval);
			break;

		case SO_TRAFFIC_MGT_BACKGROUND: {
			/* This option is handled by lower layer(s) */
			error = 0;
			break;
		}

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_set(so, sopt);
			break;
#endif /* FLOW_DIVERT */

		case SO_DELEGATED:
			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval))) != 0)
				break;

			error = so_set_effective_pid(so, optval, sopt->sopt_p);
			break;

		case SO_DELEGATED_UUID: {
			uuid_t euuid;

			if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
			    sizeof (euuid))) != 0)
				break;

			error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
			break;
		}

#if NECP
		case SO_NECP_ATTRIBUTES:
			error = necp_set_socket_attributes(so, sopt);
			break;
#endif /* NECP */

#if MPTCP
		case SO_MPTCP_FASTJOIN:
			if (!((so->so_flags & SOF_MP_SUBFLOW) ||
			    ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
			    (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
				error = ENOPROTOOPT;
				break;
			}

			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags &= ~SOF_MPTCP_FASTJOIN;
			else
				so->so_flags |= SOF_MPTCP_FASTJOIN;
			break;
#endif /* MPTCP */

		case SO_EXTENDED_BK_IDLE:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error == 0)
				error = so_set_extended_bk_idle(so, optval);
			break;

		case SO_MARK_CELLFALLBACK:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval < 0) {
				error = EINVAL;
				goto out;
			}
			if (optval == 0)
				so->so_flags1 &= ~SOF1_CELLFALLBACK;
			else
				so->so_flags1 |= SOF1_CELLFALLBACK;
			break;

		case SO_NET_SERVICE_TYPE: {
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			error = so_set_net_service_type(so, optval);
			break;
		}

		case SO_QOSMARKING_POLICY_OVERRIDE:
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
			if (error != 0)
				goto out;
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
			else
				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) so->so_proto->pr_ctloutput(so, sopt);
		}
	}
out:
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}
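
/*
 * Illustrative sketch, not part of the original source: sosetoptlock()
 * is reachable with the socket lock already held (dolock == 0) from
 * in-kernel callers; the sock_setsockopt() KPI is the usual wrapper:
 *
 *	int on = 1;
 *	error = sock_setsockopt(so, SOL_SOCKET, SO_NOSIGPIPE,
 *	    &on, sizeof (on));
 */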

/* Helper routines for getsockopt */
int
sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
{
	int error;
	size_t valsize;

	error = 0;

	/*
	 * Documented get behavior is that we always return a value,
	 * possibly truncated to fit in the user's buffer.
	 * Traditional behavior is that we always tell the user
	 * precisely how much we copied, rather than something useful
	 * like the total amount we had available for her.
	 * Note that this interface is not idempotent; the entire answer must
	 * be generated ahead of time.
	 */
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(buf, sopt->sopt_val, valsize);
		else
			bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
	}
	return (error);
}
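
/*
 * A sketch, not original code: because sooptcopyout() truncates to the
 * caller's buffer and records the copied size in sopt->sopt_valsize, a
 * getsockopt handler simply hands it the full value and never fails on
 * a short buffer:
 *
 *	optval = so->so_snd.sb_lowat;	// any integer-valued option
 *	error = sooptcopyout(sopt, &optval, sizeof (optval));
 */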

static int
sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
{
	int error;
	size_t len;
	struct user64_timeval tv64;
	struct user32_timeval tv32;
	const void *val;
	size_t valsize;

	error = 0;
	if (proc_is64bit(sopt->sopt_p)) {
		len = sizeof (tv64);
		tv64.tv_sec = tv_p->tv_sec;
		tv64.tv_usec = tv_p->tv_usec;
		val = &tv64;
	} else {
		len = sizeof (tv32);
		tv32.tv_sec = tv_p->tv_sec;
		tv32.tv_usec = tv_p->tv_usec;
		val = &tv32;
	}
	valsize = min(len, sopt->sopt_valsize);
	sopt->sopt_valsize = valsize;
	if (sopt->sopt_val != USER_ADDR_NULL) {
		if (sopt->sopt_p != kernproc)
			error = copyout(val, sopt->sopt_val, valsize);
		else
			bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
	}
	return (error);
}
5279}

/*
 * Return:	0			Success
 *		ENOPROTOOPT
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	<sf_getoption>:???
 */
int
sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int	error, optval;
	struct	linger l;
	struct	timeval tv;
#if CONFIG_MACF_SOCKET
	struct mac extmac;
#endif /* MAC_SOCKET */

	if (sopt->sopt_dir != SOPT_GET)
		sopt->sopt_dir = SOPT_GET;

	if (dolock)
		socket_lock(so, 1);

	error = sflt_getsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0)
			goto out;

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
			l.l_linger = (sopt->sopt_name == SO_LINGER) ?
			    so->so_linger : so->so_linger / hz;
			error = sooptcopyout(sopt, &l, sizeof (l));
			break;

		case SO_USELOOPBACK:
		case SO_DONTROUTE:
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_BROADCAST:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			optval = so->so_options & sopt->sopt_name;
integer:
			error = sooptcopyout(sopt, &optval, sizeof (optval));
			break;

		case SO_TYPE:
			optval = so->so_type;
			goto integer;

		case SO_NREAD:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int pkt_total;
				struct mbuf *m1;

				pkt_total = 0;
				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m1->m_type == MT_DATA ||
					    m1->m_type == MT_HEADER ||
					    m1->m_type == MT_OOBDATA)
						pkt_total += m1->m_len;
					m1 = m1->m_next;
				}
				optval = pkt_total;
			} else {
				optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
			}
			goto integer;

		case SO_NUMRCVPKT:
			if (so->so_proto->pr_flags & PR_ATOMIC) {
				int cnt = 0;
				struct mbuf *m1;

				m1 = so->so_rcv.sb_mb;
				while (m1 != NULL) {
					if (m1->m_type == MT_DATA ||
					    m1->m_type == MT_HEADER ||
					    m1->m_type == MT_OOBDATA)
						cnt += 1;
					m1 = m1->m_nextpkt;
				}
				optval = cnt;
				goto integer;
			} else {
				error = EINVAL;
				break;
			}

		case SO_NWRITE:
			optval = so->so_snd.sb_cc;
			goto integer;

		case SO_ERROR:
			optval = so->so_error;
			so->so_error = 0;
			goto integer;

		case SO_SNDBUF: {
			u_int32_t hiwat = so->so_snd.sb_hiwat;

			if (so->so_snd.sb_flags & SB_UNIX) {
				struct unpcb *unp =
				    (struct unpcb *)(so->so_pcb);
				if (unp != NULL && unp->unp_conn != NULL) {
					hiwat += unp->unp_conn->unp_cc;
				}
			}

			optval = hiwat;
			goto integer;
		}
		case SO_RCVBUF:
			optval = so->so_rcv.sb_hiwat;
			goto integer;

		case SO_SNDLOWAT:
			optval = so->so_snd.sb_lowat;
			goto integer;

		case SO_RCVLOWAT:
			optval = so->so_rcv.sb_lowat;
			goto integer;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			tv = (sopt->sopt_name == SO_SNDTIMEO ?
			    so->so_snd.sb_timeo : so->so_rcv.sb_timeo);

			error = sooptcopyout_timeval(sopt, &tv);
			break;

		case SO_NOSIGPIPE:
			optval = (so->so_flags & SOF_NOSIGPIPE);
			goto integer;

		case SO_NOADDRERR:
			optval = (so->so_flags & SOF_NOADDRAVAIL);
			goto integer;

		case SO_REUSESHAREUID:
			optval = (so->so_flags & SOF_REUSESHAREUID);
			goto integer;

		case SO_NOTIFYCONFLICT:
			optval = (so->so_flags & SOF_NOTIFYCONFLICT);
			goto integer;

		case SO_RESTRICTIONS:
			optval = so_get_restrictions(so);
			goto integer;

		case SO_AWDL_UNRESTRICTED:
			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_awdl_unrestricted(
				    sotoinpcb(so));
				goto integer;
			} else
				error = EOPNOTSUPP;
			break;

		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) == PF_INET6) {
				optval = inp_get_intcoproc_allowed(
				    sotoinpcb(so));
				goto integer;
			} else
				error = EOPNOTSUPP;
			break;

		case SO_LABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0 ||
			    (error = mac_socket_label_get(proc_ucred(
			    sopt->sopt_p), so, &extmac)) != 0)
				break;

			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#else
			error = EOPNOTSUPP;
#endif /* MAC_SOCKET */
			break;

		case SO_PEERLABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0 ||
			    (error = mac_socketpeer_label_get(proc_ucred(
			    sopt->sopt_p), so, &extmac)) != 0)
				break;

			error = sooptcopyout(sopt, &extmac, sizeof (extmac));
#else
			error = EOPNOTSUPP;
#endif /* MAC_SOCKET */
			break;

#ifdef __APPLE_API_PRIVATE
		case SO_UPCALLCLOSEWAIT:
			optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
			goto integer;
#endif
		case SO_RANDOMPORT:
			optval = (so->so_flags & SOF_BINDRANDOMPORT);
			goto integer;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
			    SONPX_SETOPTSHUT : 0;
			sonpx.npx_mask = SONPX_MASK_VALID;

			error = sooptcopyout(sopt, &sonpx,
			    sizeof (struct so_np_extensions));
			break;
		}

		case SO_TRAFFIC_CLASS:
			optval = so->so_traffic_class;
			goto integer;

		case SO_RECV_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
			goto integer;

		case SO_TRAFFIC_CLASS_STATS:
			error = sooptcopyout(sopt, &so->so_tc_stats,
			    sizeof (so->so_tc_stats));
			break;

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG:
			error = sogetopt_tcdbg(so, sopt);
			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
			goto integer;

		case SO_DEFUNCTOK:
			optval = !(so->so_flags & SOF_NODEFUNCT);
			goto integer;

		case SO_ISDEFUNCT:
			optval = (so->so_flags & SOF_DEFUNCT);
			goto integer;

		case SO_OPPORTUNISTIC:
			optval = so_get_opportunistic(so);
			goto integer;

		case SO_FLUSH:
			/* This option is not gettable */
			error = EINVAL;
			break;

		case SO_RECV_ANYIF:
			optval = so_get_recv_anyif(so);
			goto integer;

		case SO_TRAFFIC_MGT_BACKGROUND:
			/* This option is handled by lower layer(s) */
			if (so->so_proto != NULL &&
			    so->so_proto->pr_ctloutput != NULL) {
				(void) so->so_proto->pr_ctloutput(so, sopt);
			}
			break;

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_get(so, sopt);
			break;
#endif /* FLOW_DIVERT */

#if NECP
		case SO_NECP_ATTRIBUTES:
			error = necp_get_socket_attributes(so, sopt);
			break;
#endif /* NECP */

#if CONTENT_FILTER
		case SO_CFIL_SOCK_ID: {
			cfil_sock_id_t sock_id;

			sock_id = cfil_sock_id_from_socket(so);

			error = sooptcopyout(sopt, &sock_id,
			    sizeof (cfil_sock_id_t));
			break;
		}
#endif /* CONTENT_FILTER */

#if MPTCP
		case SO_MPTCP_FASTJOIN:
			if (!((so->so_flags & SOF_MP_SUBFLOW) ||
			    ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
			    (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
				error = ENOPROTOOPT;
				break;
			}
			optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
			/* Fixed along with rdar://19391339 */
			goto integer;
#endif /* MPTCP */

		case SO_EXTENDED_BK_IDLE:
			optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
			goto integer;

		case SO_MARK_CELLFALLBACK:
			optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
			    ? 1 : 0;
			goto integer;

		case SO_NET_SERVICE_TYPE: {
			if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
				optval = so->so_netsvctype;
			else
				optval = NET_SERVICE_TYPE_BE;
			goto integer;
		}

		case SO_NETSVC_MARKING_LEVEL:
			optval = so_get_netsvc_marking_level(so);
			goto integer;

		default:
			error = ENOPROTOOPT;
			break;
		}
	}
out:
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}
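
/*
 * Editorial illustration, not part of the original source: exercising
 * the Darwin-specific SO_NREAD case above from user space.  For a
 * PR_ATOMIC (e.g. UDP) socket it returns the data bytes of the first
 * pending datagram; otherwise the receive-buffer byte count minus
 * control data.  Inert sketch.
 */
#if 0
#include <sys/socket.h>

static int
bytes_readable(int s)
{
	int nread = 0;
	socklen_t len = sizeof (nread);

	if (getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len) == -1)
		return (-1);
	return (nread);
}
#endif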

/*
 * The size limits on our soopt_getm are different from those on FreeBSD.
 * We limit the size of options to MCLBYTES.  This will have to change
 * if we need to define options that need more space than MCLBYTES.
 */
int
soopt_getm(struct sockopt *sopt, struct mbuf **mp)
{
	struct mbuf *m, *m_prev;
	int sopt_size = sopt->sopt_valsize;
	int how;

	if (sopt_size <= 0 || sopt_size > MCLBYTES)
		return (EMSGSIZE);

	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
	MGET(m, how, MT_DATA);
	if (m == NULL)
		return (ENOBUFS);
	if (sopt_size > MLEN) {
		MCLGET(m, how);
		if ((m->m_flags & M_EXT) == 0) {
			m_free(m);
			return (ENOBUFS);
		}
		m->m_len = min(MCLBYTES, sopt_size);
	} else {
		m->m_len = min(MLEN, sopt_size);
	}
	sopt_size -= m->m_len;
	*mp = m;
	m_prev = m;

	while (sopt_size > 0) {
		MGET(m, how, MT_DATA);
		if (m == NULL) {
			m_freem(*mp);
			return (ENOBUFS);
		}
		if (sopt_size > MLEN) {
			MCLGET(m, how);
			if ((m->m_flags & M_EXT) == 0) {
				m_freem(*mp);
				m_freem(m);
				return (ENOBUFS);
			}
			m->m_len = min(MCLBYTES, sopt_size);
		} else {
			m->m_len = min(MLEN, sopt_size);
		}
		sopt_size -= m->m_len;
		m_prev->m_next = m;
		m_prev = m;
	}
	return (0);
}
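
/*
 * Editorial illustration, not part of the original source: the usual
 * pairing of soopt_getm() and soopt_mcopyin(), as done by the IPv6
 * option code.  soopt_getm() sizes an mbuf chain to sopt_valsize, and
 * soopt_mcopyin() then fills it from the user (or kernel) buffer,
 * freeing the chain itself on a copy error.  The wrapper name is
 * hypothetical.  Inert sketch.
 */
#if 0
static int
sopt_to_mbuf_chain(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	error = soopt_getm(sopt, mp);		/* allocate the chain */
	if (error != 0)
		return (error);
	error = soopt_mcopyin(sopt, *mp);	/* frees chain on error */
	return (error);
}
#endif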

/* copyin sopt data into mbuf chain */
int
soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;

	if (sopt->sopt_val == USER_ADDR_NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyin(sopt->sopt_val, mtod(m, char *),
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else {
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
			    mtod(m, char *), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		m = m->m_next;
	}
	/* enough should have been allocated at ip6_sooptmcopyin() */
	if (m != NULL) {
		panic("soopt_mcopyin");
		/* NOTREACHED */
	}
	return (0);
}

/* copyout mbuf chain data into soopt */
int
soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
{
	struct mbuf *m0 = m;
	size_t valsize = 0;

	if (sopt->sopt_val == USER_ADDR_NULL)
		return (0);
	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
		if (sopt->sopt_p != kernproc) {
			int error;

			error = copyout(mtod(m, char *), sopt->sopt_val,
			    m->m_len);
			if (error != 0) {
				m_freem(m0);
				return (error);
			}
		} else {
			bcopy(mtod(m, char *),
			    CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
		}
		sopt->sopt_valsize -= m->m_len;
		sopt->sopt_val += m->m_len;
		valsize += m->m_len;
		m = m->m_next;
	}
	if (m != NULL) {
		/* enough soopt buffer should be given from user-land */
		m_freem(m0);
		return (EINVAL);
	}
	sopt->sopt_valsize = valsize;
	return (0);
}

void
sohasoutofband(struct socket *so)
{
	if (so->so_pgid < 0)
		gsignal(-so->so_pgid, SIGURG);
	else if (so->so_pgid > 0)
		proc_signal(so->so_pgid, SIGURG);
	selwakeup(&so->so_rcv.sb_sel);
	if (so->so_rcv.sb_flags & SB_KNOTE) {
		KNOTE(&so->so_rcv.sb_sel.si_note,
		    (NOTE_OOB | SO_FILT_HINT_LOCKED));
	}
}
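
/*
 * Editorial illustration, not part of the original source: the
 * user-space arrangement sohasoutofband() targets.  A process that
 * wants SIGURG on out-of-band data registers itself as the socket
 * owner with fcntl(F_SETOWN); so_pgid above holds that pid (or a
 * negated process-group id).  Inert sketch.
 */
#if 0
#include <fcntl.h>
#include <signal.h>
#include <unistd.h>

static void
on_urg(int sig)
{
	(void)sig;	/* OOB byte can now be read via recv(..., MSG_OOB) */
}

static int
watch_oob(int s)
{
	signal(SIGURG, on_urg);
	return (fcntl(s, F_SETOWN, getpid()));
}
#endif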

int
sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
{
#pragma unused(cred)
	struct proc *p = current_proc();
	int revents = 0;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

	if (events & (POLLIN | POLLRDNORM))
		if (soreadable(so))
			revents |= events & (POLLIN | POLLRDNORM);

	if (events & (POLLOUT | POLLWRNORM))
		if (sowriteable(so))
			revents |= events & (POLLOUT | POLLWRNORM);

	if (events & (POLLPRI | POLLRDBAND))
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
			revents |= events & (POLLPRI | POLLRDBAND);

	if (revents == 0) {
		if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_rcv.sb_flags |= SB_SEL;
			selrecord(p, &so->so_rcv.sb_sel, wql);
		}

		if (events & (POLLOUT | POLLWRNORM)) {
			/*
			 * Darwin sets the flag first,
			 * BSD calls selrecord first
			 */
			so->so_snd.sb_flags |= SB_SEL;
			selrecord(p, &so->so_snd.sb_sel, wql);
		}
	}

	socket_unlock(so, 1);
	return (revents);
}
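
/*
 * Editorial illustration, not part of the original source: a poll()
 * call that exercises all three event groups tested by sopoll() above,
 * including POLLPRI for the out-of-band mark.  Inert sketch.
 */
#if 0
#include <poll.h>

static int
wait_for_socket(int s, int timeout_ms)
{
	struct pollfd pfd = {
		.fd = s,
		.events = POLLIN | POLLOUT | POLLPRI,
	};

	/* pfd.revents mirrors the revents computed by sopoll() */
	return (poll(&pfd, 1, timeout_ms));
}
#endif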

int
soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
{
#pragma unused(fp)
#if !CONFIG_MACF_SOCKET
#pragma unused(ctx)
#endif /* MAC_SOCKET */
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int result;

	socket_lock(so, 1);
	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

#if CONFIG_MACF_SOCKET
	if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
	    kn, so) != 0) {
		socket_unlock(so, 1);
		kn->kn_flags = EV_ERROR;
		kn->kn_data = EPERM;
		return 0;
	}
#endif /* MAC_SOCKET */

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_filtid = EVFILTID_SOREAD;
		break;
	case EVFILT_WRITE:
		kn->kn_filtid = EVFILTID_SOWRITE;
		break;
	case EVFILT_SOCK:
		kn->kn_filtid = EVFILTID_SCK;
		break;
	case EVFILT_EXCEPT:
		kn->kn_filtid = EVFILTID_SOEXCEPT;
		break;
	default:
		socket_unlock(so, 1);
		kn->kn_flags = EV_ERROR;
		kn->kn_data = EINVAL;
		return 0;
	}

	/*
	 * call the appropriate sub-filter attach
	 * with the socket still locked
	 */
	result = knote_fops(kn)->f_attach(kn);

	socket_unlock(so, 1);

	return result;
}

static int
filt_soread_common(struct knote *kn, struct socket *so)
{
	if (so->so_options & SO_ACCEPTCONN) {
		int is_not_empty;

		/*
		 * Radar 6615193: handle the listen case dynamically for
		 * the kqueue read filter.  This allows listen() to be
		 * called after registering the kqueue EVFILT_READ.
		 */

		kn->kn_data = so->so_qlen;
		is_not_empty = ! TAILQ_EMPTY(&so->so_comp);

		return (is_not_empty);
	}

	/* socket isn't a listener */
	/*
	 * NOTE_LOWAT specifies new low water mark in data, i.e.
	 * the bytes of protocol data.  We therefore exclude any
	 * control bytes.
	 */
	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;

	if (kn->kn_sfflags & NOTE_OOB) {
		if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
			kn->kn_fflags |= NOTE_OOB;
			kn->kn_data -= so->so_oobmark;
			return (1);
		}
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return (1);
	}

	if (so->so_error) {	/* temporary udp error */
		return (1);
	}

	int64_t lowwat = so->so_rcv.sb_lowat;
	/*
	 * Ensure that when NOTE_LOWAT is used, the derived
	 * low water mark is bounded by socket's rcv buf's
	 * high and low water mark values.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_rcv.sb_hiwat)
			lowwat = so->so_rcv.sb_hiwat;
		else if (kn->kn_sdata > lowwat)
			lowwat = kn->kn_sdata;
	}

	/*
	 * The order below is important.  Since NOTE_LOWAT
	 * overrides sb_lowat, check for NOTE_LOWAT case
	 * first.
	 */
	if (kn->kn_sfflags & NOTE_LOWAT)
		return (kn->kn_data >= lowwat);

	return (so->so_rcv.sb_cc >= lowwat);
}
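
/*
 * Editorial illustration, not part of the original source: registering
 * an EVFILT_READ knote with NOTE_LOWAT from user space.  Per the
 * clamping logic above, the requested low-water mark (passed in the
 * kevent data field) is bounded by the receive buffer's high-water
 * mark.  Inert sketch.
 */
#if 0
#include <sys/event.h>

static int
watch_readable(int kq, int s, int64_t lowat)
{
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}
#endif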

static int
filt_sorattach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	/* socket locked */

	/*
	 * If the caller explicitly asked for OOB results (e.g. poll())
	 * from EVFILT_READ, then save that off in the hookid field
	 * and reserve the kn_flags EV_OOBAND bit for output only.
	 */
	if (kn->kn_filter == EVFILT_READ &&
	    kn->kn_flags & EV_OOBAND) {
		kn->kn_flags &= ~EV_OOBAND;
		kn->kn_hookid = EV_OOBAND;
	} else {
		kn->kn_hookid = 0;
	}
	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
		so->so_rcv.sb_flags |= SB_KNOTE;

	/* indicate if event is already fired */
	return filt_soread_common(kn, so);
}

static void
filt_sordetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	socket_lock(so, 1);
	if (so->so_rcv.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
			so->so_rcv.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);
}

/*ARGSUSED*/
static int
filt_soread(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int retval;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_lock(so, 1);

	retval = filt_soread_common(kn, so);

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);

	return retval;
}

static int
filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int retval;

	socket_lock(so, 1);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;
	if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
		kn->kn_udata = kev->udata;

	/* determine if changes result in fired events */
	retval = filt_soread_common(kn, so);

	socket_unlock(so, 1);

	return retval;
}

static int
filt_sorprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev)
{
#pragma unused(data)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int retval;

	socket_lock(so, 1);
	retval = filt_soread_common(kn, so);
	if (retval) {
		*kev = kn->kn_kevent;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_fflags = 0;
			kn->kn_data = 0;
		}
	}
	socket_unlock(so, 1);

	return retval;
}

int
so_wait_for_if_feedback(struct socket *so)
{
	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
	    (so->so_state & SS_ISCONNECTED)) {
		struct inpcb *inp = sotoinpcb(so);
		if (INP_WAIT_FOR_IF_FEEDBACK(inp))
			return (1);
	}
	return (0);
}

static int
filt_sowrite_common(struct knote *kn, struct socket *so)
{
	int ret = 0;

	kn->kn_data = sbspace(&so->so_snd);
	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_flags |= EV_EOF;
		kn->kn_fflags = so->so_error;
		return 1;
	}
	if (so->so_error) {	/* temporary udp error */
		return 1;
	}
	if (!socanwrite(so)) {
		return 0;
	}
	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
		return 1;
	}
	int64_t lowwat = so->so_snd.sb_lowat;
	if (kn->kn_sfflags & NOTE_LOWAT) {
		if (kn->kn_sdata > so->so_snd.sb_hiwat)
			lowwat = so->so_snd.sb_hiwat;
		else if (kn->kn_sdata > lowwat)
			lowwat = kn->kn_sdata;
	}
	if (kn->kn_data >= lowwat) {
		if ((so->so_flags & SOF_NOTSENT_LOWAT)
#if (DEBUG || DEVELOPMENT)
		    && so_notsent_lowat_check == 1
#endif /* DEBUG || DEVELOPMENT */
		    ) {
			if ((SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) &&
			    so->so_type == SOCK_STREAM) {
				ret = tcp_notsent_lowat_check(so);
			}
#if MPTCP
			else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
			    (SOCK_PROTO(so) == IPPROTO_TCP)) {
				ret = mptcp_notsent_lowat_check(so);
			}
#endif
			else {
				return 1;
			}
		} else {
			ret = 1;
		}
	}
	if (so_wait_for_if_feedback(so))
		ret = 0;
	return (ret);
}
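
/*
 * Editorial illustration, not part of the original source: the matching
 * EVFILT_WRITE registration.  The filter fires once sbspace() meets the
 * (possibly NOTE_LOWAT-adjusted) low-water mark, subject to the
 * not-sent-low-water check above for TCP and MPTCP sockets.  Inert
 * sketch.
 */
#if 0
#include <sys/event.h>

static int
watch_writable(int kq, int s)
{
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}
#endif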

static int
filt_sowattach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	/* socket locked */
	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
		so->so_snd.sb_flags |= SB_KNOTE;

	/* determine if it's already fired */
	return filt_sowrite_common(kn, so);
}

static void
filt_sowdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	socket_lock(so, 1);

	if (so->so_snd.sb_flags & SB_KNOTE)
		if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
			so->so_snd.sb_flags &= ~SB_KNOTE;
	socket_unlock(so, 1);
}

/*ARGSUSED*/
static int
filt_sowrite(struct knote *kn, long hint)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret;

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_lock(so, 1);

	ret = filt_sowrite_common(kn, so);

	if ((hint & SO_FILT_HINT_LOCKED) == 0)
		socket_unlock(so, 1);

	return ret;
}

static int
filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret;

	socket_lock(so, 1);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;
	if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
		kn->kn_udata = kev->udata;

	/* determine if these changes result in a triggered event */
	ret = filt_sowrite_common(kn, so);

	socket_unlock(so, 1);

	return ret;
}

static int
filt_sowprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev)
{
#pragma unused(data)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret;

	socket_lock(so, 1);
	ret = filt_sowrite_common(kn, so);
	if (ret) {
		*kev = kn->kn_kevent;
		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_fflags = 0;
			kn->kn_data = 0;
		}
	}
	socket_unlock(so, 1);
	return ret;
}

static int
filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
{
	int ret = 0;
	uint32_t level_trigger = 0;

	if (ev_hint & SO_FILT_HINT_CONNRESET) {
		kn->kn_fflags |= NOTE_CONNRESET;
	}
	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
		kn->kn_fflags |= NOTE_TIMEOUT;
	}
	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
		kn->kn_fflags |= NOTE_NOSRCADDR;
	}
	if (ev_hint & SO_FILT_HINT_IFDENIED) {
		kn->kn_fflags |= NOTE_IFDENIED;
	}
	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
		kn->kn_fflags |= NOTE_KEEPALIVE;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
	}
	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
		kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
	}
	if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
	    (so->so_state & SS_ISCONNECTED)) {
		kn->kn_fflags |= NOTE_CONNECTED;
		level_trigger |= NOTE_CONNECTED;
	}
	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
	    (so->so_state & SS_ISDISCONNECTED)) {
		kn->kn_fflags |= NOTE_DISCONNECTED;
		level_trigger |= NOTE_DISCONNECTED;
	}
	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
		if (so->so_proto != NULL &&
		    (so->so_proto->pr_flags & PR_EVCONNINFO))
			kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
	}

	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
	    tcp_notify_ack_active(so)) {
		kn->kn_fflags |= NOTE_NOTIFY_ACK;
	}

	if ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
	    && cfil_sock_data_pending(&so->so_rcv) == 0
#endif /* CONTENT_FILTER */
	    ) {
		kn->kn_fflags |= NOTE_READCLOSED;
		level_trigger |= NOTE_READCLOSED;
	}

	if (so->so_state & SS_CANTSENDMORE) {
		kn->kn_fflags |= NOTE_WRITECLOSED;
		level_trigger |= NOTE_WRITECLOSED;
	}

	if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
	    (so->so_flags & SOF_SUSPENDED)) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If resume event was delivered before, reset it */
		kn->kn_hookid &= ~NOTE_RESUME;

		kn->kn_fflags |= NOTE_SUSPEND;
		level_trigger |= NOTE_SUSPEND;
	}

	if ((ev_hint & SO_FILT_HINT_RESUME) ||
	    (so->so_flags & SOF_SUSPENDED) == 0) {
		kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);

		/* If suspend event was delivered before, reset it */
		kn->kn_hookid &= ~NOTE_SUSPEND;

		kn->kn_fflags |= NOTE_RESUME;
		level_trigger |= NOTE_RESUME;
	}

	if (so->so_error != 0) {
		ret = 1;
		kn->kn_data = so->so_error;
		kn->kn_flags |= EV_EOF;
	} else {
		get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
	}

	/* Reset any events that are not requested on this knote */
	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);

	/* Find the level-triggered events that are already delivered */
	level_trigger &= kn->kn_hookid;
	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;

	/* Do not deliver level-triggered events more than once */
	if ((kn->kn_fflags & ~level_trigger) != 0)
		ret = 1;

	return (ret);
}
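
/*
 * Editorial illustration, not part of the original source: subscribing
 * to the EVFILT_SOCK filter handled above.  Note that EVFILT_SOCK and
 * these NOTE_* socket events are private SPI on Darwin, declared only
 * in the kernel-private sys/event.h.  The fflags mask selects which
 * events are reported; level-triggered ones such as NOTE_CONNECTED are
 * delivered once per edge, as tracked in kn_hookid.  Inert sketch.
 */
#if 0
#include <sys/event.h>

static int
watch_sock_events(int kq, int s)
{
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_SOCK, EV_ADD | EV_CLEAR,
	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_READCLOSED |
	    NOTE_WRITECLOSED, 0, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}
#endif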

static int
filt_sockattach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	/* socket locked */
	kn->kn_hookid = 0;
	if (KNOTE_ATTACH(&so->so_klist, kn))
		so->so_flags |= SOF_KNOTE;

	/* determine if event already fired */
	return filt_sockev_common(kn, so, 0);
}

static void
filt_sockdetach(struct knote *kn)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;

	socket_lock(so, 1);

	if ((so->so_flags & SOF_KNOTE) != 0)
		if (KNOTE_DETACH(&so->so_klist, kn))
			so->so_flags &= ~SOF_KNOTE;
	socket_unlock(so, 1);
}

static int
filt_sockev(struct knote *kn, long hint)
{
	int ret = 0, locked = 0;
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	long ev_hint = (hint & SO_FILT_HINT_EV);

	if ((hint & SO_FILT_HINT_LOCKED) == 0) {
		socket_lock(so, 1);
		locked = 1;
	}

	ret = filt_sockev_common(kn, so, ev_hint);

	if (locked)
		socket_unlock(so, 1);

	return ret;
}

/*
 * filt_socktouch - update event state
 */
static int
filt_socktouch(
	struct knote *kn,
	struct kevent_internal_s *kev)
{
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	uint32_t changed_flags;
	int ret;

	socket_lock(so, 1);

	/* save off the [result] data and fflags */
	changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);

	/* save off the new input fflags and data */
	kn->kn_sfflags = kev->fflags;
	kn->kn_sdata = kev->data;
	if ((kn->kn_status & KN_UDATA_SPECIFIC) == 0)
		kn->kn_udata = kev->udata;

	/* restrict the current results to the (smaller?) set of new interest */
	/*
	 * For compatibility with previous implementations, we leave kn_fflags
	 * as they were before.
	 */
	//kn->kn_fflags &= kev->fflags;

	/*
	 * Since we keep track of events that are already
	 * delivered, if any of those events are not requested
	 * anymore the state related to them can be reset
	 */
	kn->kn_hookid &=
	    ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);

	/* determine if we have events to deliver */
	ret = filt_sockev_common(kn, so, 0);

	socket_unlock(so, 1);

	return ret;
}

/*
 * filt_sockprocess - query event fired state and return data
 */
static int
filt_sockprocess(
	struct knote *kn,
	struct filt_process_s *data,
	struct kevent_internal_s *kev)
{
#pragma unused(data)
	struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
	int ret = 0;

	socket_lock(so, 1);

	ret = filt_sockev_common(kn, so, 0);
	if (ret) {
		*kev = kn->kn_kevent;

		/*
		 * Store the state of the events being delivered.  This
		 * state can be used to deliver level-triggered events
		 * at least once and still avoid waking up the application
		 * multiple times as long as the event is active.
		 */
		if (kn->kn_fflags != 0)
			kn->kn_hookid |= (kn->kn_fflags &
			    EVFILT_SOCK_LEVEL_TRIGGER_MASK);

		/*
		 * NOTE_RESUME and NOTE_SUSPEND are an exception; deliver
		 * only one of them and remember the one that was
		 * delivered last.
		 */
		if (kn->kn_fflags & NOTE_SUSPEND)
			kn->kn_hookid &= ~NOTE_RESUME;
		if (kn->kn_fflags & NOTE_RESUME)
			kn->kn_hookid &= ~NOTE_SUSPEND;

		if (kn->kn_flags & EV_CLEAR) {
			kn->kn_data = 0;
			kn->kn_fflags = 0;
		}
	}

	socket_unlock(so, 1);

	return ret;
}

void
get_sockev_state(struct socket *so, u_int32_t *statep)
{
	u_int32_t state = *(statep);

	/*
	 * If the state variable is already used by a previous event,
	 * reset it.
	 */
	if (state != 0)
		return;

	if (so->so_state & SS_ISCONNECTED)
		state |= SOCKEV_CONNECTED;
	else
		state &= ~(SOCKEV_CONNECTED);
	state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
	*(statep) = state;
}

#define	SO_LOCK_HISTORY_STR_LEN \
	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)

__private_extern__ const char *
solockhistory_nr(struct socket *so)
{
	size_t n = 0;
	int i;
	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];

	bzero(lock_history_str, sizeof (lock_history_str));
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += snprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
		    so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
		    so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
	}
	return (lock_history_str);
}

int
socket_lock(struct socket *so, int refcount)
{
	int error = 0;
	void *lr_saved;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto->pr_lock) {
		error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
	} else {
#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
		    LCK_MTX_ASSERT_NOTOWNED);
#endif
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
		if (refcount)
			so->so_usecount++;
		so->lock_lr[so->next_lock_lr] = lr_saved;
		so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
	}

	return (error);
}

int
socket_unlock(struct socket *so, int refcount)
{
	int error = 0;
	void *lr_saved;
	lck_mtx_t *mutex_held;

	lr_saved = __builtin_return_address(0);

	if (so->so_proto == NULL) {
		panic("%s: null so_proto so=%p\n", __func__, so);
		/* NOTREACHED */
	}

	if (so && so->so_proto->pr_unlock) {
		error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
	} else {
		mutex_held = so->so_proto->pr_domain->dom_mtx;
#ifdef MORE_LOCKING_DEBUG
		lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif
		so->unlock_lr[so->next_unlock_lr] = lr_saved;
		so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;

		if (refcount) {
			if (so->so_usecount <= 0) {
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
				    SOCK_DOM(so), so->so_type,
				    SOCK_PROTO(so), solockhistory_nr(so));
				/* NOTREACHED */
			}

			so->so_usecount--;
			if (so->so_usecount == 0)
				sofreelastref(so, 1);
		}
		lck_mtx_unlock(mutex_held);
	}

	return (error);
}
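
/*
 * Editorial illustration, not part of the original source: the
 * canonical pattern for the refcount parameter of socket_lock() and
 * socket_unlock().  Passing 1 on lock takes a use-count reference;
 * passing 1 on unlock drops it, and the last drop frees the socket
 * via sofreelastref().  The wrapper name is hypothetical.  Inert
 * sketch.
 */
#if 0
static void
with_socket_locked(struct socket *so)
{
	socket_lock(so, 1);	/* lock and hold a use-count reference */
	/* ... inspect or modify the socket ... */
	socket_unlock(so, 1);	/* drop the reference and unlock */
}
#endif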

/* Called with socket locked, will unlock socket */
void
sofree(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	sofreelastref(so, 0);
}

void
soreference(struct socket *so)
{
	socket_lock(so, 1);	/* locks & take one reference on socket */
	socket_unlock(so, 0);	/* unlock only */
}

void
sodereference(struct socket *so)
{
	socket_lock(so, 0);
	socket_unlock(so, 1);
}

/*
 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
 * possibility of using jumbo clusters.  Caller must ensure to hold
 * the socket lock.
 */
void
somultipages(struct socket *so, boolean_t set)
{
	if (set)
		so->so_flags |= SOF_MULTIPAGES;
	else
		so->so_flags &= ~SOF_MULTIPAGES;
}

void
soif2kcl(struct socket *so, boolean_t set)
{
	if (set)
		so->so_flags1 |= SOF1_IF_2KCL;
	else
		so->so_flags1 &= ~SOF1_IF_2KCL;
}

int
so_isdstlocal(struct socket *so)
{
	struct inpcb *inp = (struct inpcb *)so->so_pcb;

	if (SOCK_DOM(so) == PF_INET)
		return (inaddr_local(inp->inp_faddr));
	else if (SOCK_DOM(so) == PF_INET6)
		return (in6addr_local(&inp->in6p_faddr));

	return (0);
}

int
sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
{
	struct sockbuf *rcv, *snd;
	int err = 0, defunct;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	defunct = (so->so_flags & SOF_DEFUNCT);
	if (defunct) {
		if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
			panic("%s: SB_DROP not set", __func__);
			/* NOTREACHED */
		}
		goto done;
	}

	if (so->so_flags & SOF_NODEFUNCT) {
		if (noforce) {
			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llx [%d,%d] "
			    "is not eligible for defunct "
			    "(%d)\n", __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), err);
			return (err);
		}
		so->so_flags &= ~SOF_NODEFUNCT;
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] defunct by force\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));
	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
		struct inpcb *inp = (struct inpcb *)so->so_pcb;
		struct ifnet *ifp = inp->inp_last_outifp;

		if (ifp && IFNET_IS_CELLULAR(ifp)) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
		} else if (soextbkidlestat.so_xbkidle_time == 0) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
		} else if (noforce) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);

			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
			so->so_extended_bk_start = net_uptime();
			OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);

			err = EOPNOTSUPP;
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s "
			    "level %d) extend bk idle so 0x%llx rcv hw %d "
			    "cc %d\n",
			    __func__, proc_selfpid(),
			    proc_best_name(current_proc()), proc_pid(p),
			    proc_best_name(p), level,
			    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
			    so->so_rcv.sb_hiwat, so->so_rcv.sb_cc);
			return (err);
		} else {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
		}
	}

	so->so_flags |= SOF_DEFUNCT;

	/* Prevent further data from being appended to the socket buffers */
	snd->sb_flags |= SB_DROP;
	rcv->sb_flags |= SB_DROP;

	/* Flush any existing data in the socket buffers */
	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}

done:
	SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
	    "so 0x%llx [%d,%d] %s defunct%s\n", __func__, proc_selfpid(),
	    proc_best_name(current_proc()), proc_pid(p), proc_best_name(p),
	    level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
	    SOCK_TYPE(so), defunct ? "is already" : "marked as",
	    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : "");

	return (err);
}

int
sodefunct(struct proc *p, struct socket *so, int level)
{
	struct sockbuf *rcv, *snd;

	if (!(so->so_flags & SOF_DEFUNCT)) {
		panic("%s improperly called", __func__);
		/* NOTREACHED */
	}
	if (so->so_state & SS_DEFUNCT)
		goto done;

	rcv = &so->so_rcv;
	snd = &so->so_snd;

	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
		char s[MAX_IPv6_STR_LEN];
		char d[MAX_IPv6_STR_LEN];
		struct inpcb *inp = sotoinpcb(so);

		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
		    "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
		    inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
		    (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
		    s, sizeof (s)), ntohs(inp->in6p_lport),
		    inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
		    (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
		    d, sizeof (d)), ntohs(inp->in6p_fport),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags,
		    rcv->sb_flags, snd->sb_flags);
	} else {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
		    proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p), level,
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (uint32_t)rcv->sb_sel.si_flags,
		    (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
		    snd->sb_flags);
	}

	/*
	 * Unwedge threads blocked on sbwait() and sb_lock().
	 */
	sbwakeup(rcv);
	sbwakeup(snd);

	so->so_flags1 |= SOF1_DEFUNCTINPROG;
	if (rcv->sb_flags & SB_LOCK)
		sbunlock(rcv, TRUE);	/* keep socket locked */
	if (snd->sb_flags & SB_LOCK)
		sbunlock(snd, TRUE);	/* keep socket locked */

	/*
	 * Flush the buffers and disconnect.  We explicitly call shutdown
	 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
	 * states are set for the socket.  This would also flush out data
	 * hanging off the receive list of this socket.
	 */
	(void) soshutdownlock_final(so, SHUT_RD);
	(void) soshutdownlock_final(so, SHUT_WR);
	(void) sodisconnectlocked(so);

	/*
	 * Explicitly handle connectionless-protocol disconnection
	 * and release any remaining data in the socket buffers.
	 */
	if (!(so->so_flags & SS_ISDISCONNECTED))
		(void) soisdisconnected(so);

	if (so->so_error == 0)
		so->so_error = EBADF;

	if (rcv->sb_cc != 0) {
		rcv->sb_flags &= ~SB_SEL;
		selthreadclear(&rcv->sb_sel);
		sbrelease(rcv);
	}
	if (snd->sb_cc != 0) {
		snd->sb_flags &= ~SB_SEL;
		selthreadclear(&snd->sb_sel);
		sbrelease(snd);
	}
	so->so_state |= SS_DEFUNCT;
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);

done:
	return (0);
}

int
soresume(struct proc *p, struct socket *so, int locked)
{
	if (locked == 0)
		socket_lock(so, 1);

	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
		    "[%d,%d] resumed from bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    proc_pid(p), proc_best_name(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));

		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
		so->so_extended_bk_start = 0;
		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);

		OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
		VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	}
	if (locked == 0)
		socket_unlock(so, 1);

	return (0);
}

/*
 * Does not attempt to account for sockets that are delegated from
 * the current process.
 */
int
so_set_extended_bk_idle(struct socket *so, int optval)
{
	int error = 0;

	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
	    SOCK_PROTO(so) != IPPROTO_TCP) {
		OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
		error = EOPNOTSUPP;
	} else if (optval == 0) {
		so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;

		soresume(current_proc(), so, 1);
	} else {
		struct proc *p = current_proc();
		int i;
		struct filedesc *fdp;
		int count = 0;

		proc_fdlock(p);

		fdp = p->p_fd;
		for (i = 0; i < fdp->fd_nfiles; i++) {
			struct fileproc *fp = fdp->fd_ofiles[i];
			struct socket *so2;

			if (fp == NULL ||
			    (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
			    FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
				continue;

			so2 = (struct socket *)fp->f_fglob->fg_data;
			if (so != so2 &&
			    so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
				count++;
			if (count >= soextbkidlestat.so_xbkidle_maxperproc)
				break;
		}
		if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
			error = EBUSY;
		} else if (so->so_flags & SOF_DELEGATED) {
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
			error = EBUSY;
		} else {
			so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
		}
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
		    "%s marked for extended bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    "is" : "not");

		proc_fdunlock(p);
	}

	return (error);
}

static void
so_stop_extended_bk_idle(struct socket *so)
{
	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
	so->so_extended_bk_start = 0;

	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
	VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
	/*
	 * Force defunct
	 */
	sosetdefunct(current_proc(), so,
	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	if (so->so_flags & SOF_DEFUNCT) {
		sodefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
	}
}

void
so_drain_extended_bk_idle(struct socket *so)
{
	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		/*
		 * Only penalize sockets that have outstanding data
		 */
		if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
		}
	}
}

/*
 * Return value tells whether the socket is still in extended
 * background idle.
 */
int
so_check_extended_bk_idle_time(struct socket *so)
{
	int ret = 1;

	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so));
		if (net_uptime() - so->so_extended_bk_start >
		    soextbkidlestat.so_xbkidle_time) {
			so_stop_extended_bk_idle(so);

			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);

			ret = 0;
		} else {
			struct inpcb *inp = (struct inpcb *)so->so_pcb;

			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
			OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
		}
	}

	return (ret);
}

void
resume_proc_sockets(proc_t p)
{
	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
		struct filedesc *fdp;
		int i;

		proc_fdlock(p);
		fdp = p->p_fd;
		for (i = 0; i < fdp->fd_nfiles; i++) {
			struct fileproc *fp;
			struct socket *so;

			fp = fdp->fd_ofiles[i];
			if (fp == NULL ||
			    (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
			    FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
				continue;

			so = (struct socket *)fp->f_fglob->fg_data;
			(void) soresume(p, so, 0);
		}
		proc_fdunlock(p);

		OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
	}
}

__private_extern__ int
so_set_recv_anyif(struct socket *so, int optval)
{
	int ret = 0;

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		if (optval)
			sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
		else
			sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
	}

	return (ret);
}

__private_extern__ int
so_get_recv_anyif(struct socket *so)
{
	int ret = 0;

#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
	}

	return (ret);
}

int
so_set_restrictions(struct socket *so, uint32_t vals)
{
	int nocell_old, nocell_new;
	int noexpensive_old, noexpensive_new;

	/*
	 * Deny-type restrictions are trapdoors; once set they cannot be
	 * unset for the lifetime of the socket.  This allows them to be
	 * issued by a framework on behalf of the application without
	 * having to worry that they can be undone.
	 *
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions.  For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
	 * than INP_NO_IFT_CELLULAR.  The latter is affected by the UUID
	 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
	 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
	 */
	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
	so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
	    SO_RESTRICT_DENY_EXPENSIVE));
	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);

	/* we can only set, not clear restrictions */
	if ((nocell_new - nocell_old) == 0 &&
	    (noexpensive_new - noexpensive_old) == 0)
		return (0);
#if INET6
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
#else
	if (SOCK_DOM(so) == PF_INET) {
#endif /* !INET6 */
		if (nocell_new - nocell_old != 0) {
			/*
			 * if deny cellular is now set, do what's needed
			 * for INPCB
			 */
			inp_set_nocellular(sotoinpcb(so));
		}
		if (noexpensive_new - noexpensive_old != 0) {
			inp_set_noexpensive(sotoinpcb(so));
		}
	}

	return (0);
}

uint32_t
so_get_restrictions(struct socket *so)
{
	return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
	    SO_RESTRICT_DENY_OUT |
	    SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
}

7127struct sockaddr_entry *
7128sockaddrentry_alloc(int how)
7129{
7130 struct sockaddr_entry *se;
7131
7132 se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
7133 if (se != NULL)
7134 bzero(se, se_zone_size);
7135
7136 return (se);
7137}
7138
7139void
7140sockaddrentry_free(struct sockaddr_entry *se)
7141{
7142 if (se->se_addr != NULL) {
7143 FREE(se->se_addr, M_SONAME);
7144 se->se_addr = NULL;
7145 }
7146 zfree(se_zone, se);
7147}
7148
7149struct sockaddr_entry *
7150sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
7151{
7152 struct sockaddr_entry *dst_se;
7153
7154 dst_se = sockaddrentry_alloc(how);
7155 if (dst_se != NULL) {
7156 int len = src_se->se_addr->sa_len;
7157
7158 MALLOC(dst_se->se_addr, struct sockaddr *,
490019cf 7159 len, M_SONAME, how | M_ZERO);
39236c6e
A
7160 if (dst_se->se_addr != NULL) {
7161 bcopy(src_se->se_addr, dst_se->se_addr, len);
7162 } else {
7163 sockaddrentry_free(dst_se);
7164 dst_se = NULL;
7165 }
7166 }
7167
7168 return (dst_se);
7169}

struct sockaddr_list *
sockaddrlist_alloc(int how)
{
	struct sockaddr_list *sl;

	sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
	if (sl != NULL) {
		bzero(sl, sl_zone_size);
		TAILQ_INIT(&sl->sl_head);
	}
	return (sl);
}

void
sockaddrlist_free(struct sockaddr_list *sl)
{
	struct sockaddr_entry *se, *tse;

	TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
		sockaddrlist_remove(sl, se);
		sockaddrentry_free(se);
	}
	VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
	zfree(sl_zone, sl);
}

void
sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
{
	VERIFY(!(se->se_flags & SEF_ATTACHED));
	se->se_flags |= SEF_ATTACHED;
	TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
	sl->sl_cnt++;
	VERIFY(sl->sl_cnt != 0);
}

void
sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
{
	VERIFY(se->se_flags & SEF_ATTACHED);
	se->se_flags &= ~SEF_ATTACHED;
	VERIFY(sl->sl_cnt != 0);
	sl->sl_cnt--;
	TAILQ_REMOVE(&sl->sl_head, se, se_link);
}

struct sockaddr_list *
sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
{
	struct sockaddr_entry *src_se, *tse;
	struct sockaddr_list *dst_sl;

	dst_sl = sockaddrlist_alloc(how);
	if (dst_sl == NULL)
		return (NULL);

	TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
		struct sockaddr_entry *dst_se;

		if (src_se->se_addr == NULL)
			continue;

		dst_se = sockaddrentry_dup(src_se, how);
		if (dst_se == NULL) {
			sockaddrlist_free(dst_sl);
			return (NULL);
		}

		sockaddrlist_insert(dst_sl, dst_se);
	}
	VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);

	return (dst_sl);
}
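
/*
 * Illustrative sketch (not built): round-tripping the helpers above --
 * wrap a sockaddr in an entry, insert it into a list, deep-copy the
 * list, then free both.  The function name and the origin of "sa" are
 * hypothetical.
 */
#if 0
static void
example_sockaddrlist_roundtrip(struct sockaddr *sa)
{
	struct sockaddr_list *sl, *copy;
	struct sockaddr_entry *se;

	if ((sl = sockaddrlist_alloc(M_WAITOK)) == NULL)
		return;
	if ((se = sockaddrentry_alloc(M_WAITOK)) == NULL) {
		sockaddrlist_free(sl);
		return;
	}
	MALLOC(se->se_addr, struct sockaddr *, sa->sa_len,
	    M_SONAME, M_WAITOK | M_ZERO);
	bcopy(sa, se->se_addr, sa->sa_len);
	sockaddrlist_insert(sl, se);

	if ((copy = sockaddrlist_dup(sl, M_WAITOK)) != NULL)
		sockaddrlist_free(copy);	/* frees entries and addrs */
	sockaddrlist_free(sl);
}
#endif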

int
so_set_effective_pid(struct socket *so, int epid, struct proc *p)
{
	struct proc *ep = PROC_NULL;
	int error = 0;

	/* pid 0 is reserved for kernel */
	if (epid == 0) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/*
	 * Unless the pid is both the one recorded as the real owner
	 * of the socket and the issuing process's own pid, ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (epid != so->last_pid || epid != proc_pid(p)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/* Find the process that corresponds to the effective pid */
	if ((ep = proc_find(epid)) == PROC_NULL) {
		error = ESRCH;
		goto done;
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the passed-in proc rather than calling proc_selfpid(),
	 * as we need to check the process issuing the socket option
	 * which could be kernproc.  Given that we don't allow 0 for
	 * effective pid, it means that a delegated in-kernel socket
	 * stays delegated during its lifetime (which is probably OK.)
	 */
	if (epid == proc_pid(p)) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
	}

done:
	if (error == 0 && net_io_policy_log) {
		uuid_string_t buf;

		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	if (ep != PROC_NULL)
		proc_rele(ep);

	return (error);
}
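
/*
 * Illustrative sketch (userspace, not part of this file): the usual way
 * into so_set_effective_pid() is the private SO_DELEGATED socket option,
 * which carries a pid_t.  An entitled process delegating a socket might
 * look like the following; the variable names are hypothetical, and
 * EACCES is returned without PRIV_NET_PRIVILEGED_SOCKET_DELEGATE:
 *
 *	pid_t epid = target_pid;
 *	int s = socket(AF_INET, SOCK_STREAM, 0);
 *
 *	if (setsockopt(s, SOL_SOCKET, SO_DELEGATED,
 *	    &epid, sizeof (epid)) == -1)
 *		perror("SO_DELEGATED");
 */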

int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof (uuid));

	/*
	 * Unless the UUID is both the one recorded as the real owner
	 * of the socket and the issuing process's own executable UUID,
	 * ensure that the issuing process has the necessary privileges.
	 */
	if (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}

done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return (error);
}
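
/*
 * Illustrative sketch (userspace, not part of this file): the UUID
 * variant is reached via the private SO_DELEGATED_UUID socket option,
 * for callers that know only the delegate's executable UUID:
 *
 *	uuid_t euuid;	(executable UUID of the delegate, obtained elsewhere)
 *
 *	if (setsockopt(s, SOL_SOCKET, SO_DELEGATED_UUID,
 *	    euuid, sizeof (euuid)) == -1)
 *		perror("SO_DELEGATED_UUID");
 */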

void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));

	bzero(&ev_msg, sizeof (ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}
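
/*
 * Illustrative sketch (not built): posting a netpolicy event whose
 * payload extends the base netpolicy_event_data, as the VERIFY() above
 * permits.  This assumes the KEV_NETPOLICY_IFDENIED event code and its
 * struct kev_netpolicy_ifdenied payload, which embeds the base
 * structure as ev_data.
 */
#if 0
static void
example_post_netpolicy_event(void)
{
	struct kev_netpolicy_ifdenied ev_ifdenied;

	bzero(&ev_ifdenied, sizeof (ev_ifdenied));
	/* fill in ev_ifdenied.ev_data before posting */
	netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
	    sizeof (ev_ifdenied));
}
#endif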

void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof (ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev;
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	bzero(&ev, sizeof (ev));
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof (ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof (ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof (ev));
		}
	}
	if (socksa != NULL)
		FREE(socksa, M_SONAME);
	if (peersa != NULL)
		FREE(peersa, M_SONAME);
}
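
/*
 * Illustrative sketch (userspace, not part of this file): a client can
 * observe the KEV_SOCKET_CLOSED events posted above by opening a
 * PF_SYSTEM kernel-event socket and filtering on the socket subclass:
 *
 *	int fd = socket(PF_SYSTEM, SOCK_RAW, SYSPROTO_EVENT);
 *	struct kev_request req = {
 *		.vendor_code = KEV_VENDOR_APPLE,
 *		.kev_class = KEV_NETWORK_CLASS,
 *		.kev_subclass = KEV_SOCKET_SUBCLASS,
 *	};
 *	char buf[1024];
 *
 *	if (ioctl(fd, SIOCSKEVFILT, &req) == 0 &&
 *	    recv(fd, buf, sizeof (buf), 0) > 0) {
 *		struct kern_event_msg *msg = (struct kern_event_msg *)buf;
 *
 *		if (msg->event_code == KEV_SOCKET_CLOSED)
 *			handle_closed(msg);
 *	}
 *
 * handle_closed() is hypothetical; the event payload begins at
 * msg->event_data and matches struct kev_socket_event_data above.
 */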