]> git.saurik.com Git - apple/xnu.git/blame - bsd/kern/uipc_socket.c
xnu-2782.40.9.tar.gz
[apple/xnu.git] / bsd / kern / uipc_socket.c
CommitLineData
1c79356b 1/*
04b8595b 2 * Copyright (c) 1998-2015 Apple Inc. All rights reserved.
5d5c5d0d 3 *
2d21ac55 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
39236c6e 5 *
2d21ac55
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
39236c6e 14 *
2d21ac55
A
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
39236c6e 17 *
2d21ac55
A
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
8f6c56a5
A
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
2d21ac55
A
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
39236c6e 25 *
2d21ac55 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
1c79356b 27 */
1c79356b
A
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
9bccf70c 61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
1c79356b 62 */
2d21ac55
A
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
1c79356b
A
69
70#include <sys/param.h>
71#include <sys/systm.h>
55e303ae 72#include <sys/filedesc.h>
2d21ac55 73#include <sys/proc.h>
91447636
A
74#include <sys/proc_internal.h>
75#include <sys/kauth.h>
76#include <sys/file_internal.h>
1c79356b
A
77#include <sys/fcntl.h>
78#include <sys/malloc.h>
79#include <sys/mbuf.h>
80#include <sys/domain.h>
81#include <sys/kernel.h>
55e303ae 82#include <sys/event.h>
1c79356b
A
83#include <sys/poll.h>
84#include <sys/protosw.h>
85#include <sys/socket.h>
86#include <sys/socketvar.h>
87#include <sys/resourcevar.h>
88#include <sys/signalvar.h>
89#include <sys/sysctl.h>
39236c6e 90#include <sys/syslog.h>
1c79356b 91#include <sys/uio.h>
fe8ab488 92#include <sys/uio_internal.h>
1c79356b
A
93#include <sys/ev.h>
94#include <sys/kdebug.h>
2d21ac55 95#include <sys/un.h>
d1ecb069 96#include <sys/user.h>
316670eb 97#include <sys/priv.h>
39236c6e 98#include <sys/kern_event.h>
1c79356b 99#include <net/route.h>
39236c6e 100#include <net/init.h>
316670eb 101#include <net/ntstat.h>
fe8ab488 102#include <net/content_filter.h>
1c79356b
A
103#include <netinet/in.h>
104#include <netinet/in_pcb.h>
6d2010ae
A
105#include <netinet/ip6.h>
106#include <netinet6/ip6_var.h>
39236c6e 107#include <netinet/flow_divert.h>
1c79356b 108#include <kern/zalloc.h>
91447636 109#include <kern/locks.h>
1c79356b 110#include <machine/limits.h>
2d21ac55
A
111#include <libkern/OSAtomic.h>
112#include <pexpert/pexpert.h>
b0d623f7 113#include <kern/assert.h>
6d2010ae 114#include <kern/task.h>
316670eb 115#include <sys/kpi_mbuf.h>
6d2010ae 116#include <sys/mcache.h>
fe8ab488 117#include <sys/unpcb.h>
2d21ac55
A
118
119#if CONFIG_MACF
120#include <security/mac.h>
121#include <security/mac_framework.h>
122#endif /* MAC */
123
39236c6e
A
124#if MULTIPATH
125#include <netinet/mp_pcb.h>
fe8ab488 126#include <netinet/mptcp_var.h>
39236c6e
A
127#endif /* MULTIPATH */
128
129/* TODO: this should be in a header file somewhere */
130extern char *proc_name_address(void *p);
131
132static u_int32_t so_cache_hw; /* High water mark for socache */
133static u_int32_t so_cache_timeouts; /* number of timeouts */
134static u_int32_t so_cache_max_freed; /* max freed per timeout */
135static u_int32_t cached_sock_count = 0;
136STAILQ_HEAD(, socket) so_cache_head;
137int max_cached_sock_count = MAX_CACHED_SOCKETS;
138static u_int32_t so_cache_time;
139static int socketinit_done;
140static struct zone *so_cache_zone;
141
142static lck_grp_t *so_cache_mtx_grp;
143static lck_attr_t *so_cache_mtx_attr;
91447636 144static lck_grp_attr_t *so_cache_mtx_grp_attr;
39236c6e 145static lck_mtx_t *so_cache_mtx;
91447636 146
1c79356b
A
147#include <machine/limits.h>
148
2d21ac55
A
/* kqueue filter callbacks referenced by the filterops tables below. */
static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static void	filt_sockdetach(struct knote *kn);
static int	filt_sockev(struct knote *kn, long hint);

/* Helpers that move a struct timeval in and out of a socket option. */
static int	sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int	sooptcopyout_timeval(struct sockopt *, const struct timeval *);
55e303ae 158
b0d623f7 159static struct filterops soread_filtops = {
39236c6e
A
160 .f_isfd = 1,
161 .f_detach = filt_sordetach,
162 .f_event = filt_soread,
b0d623f7 163};
39236c6e 164
b0d623f7 165static struct filterops sowrite_filtops = {
39236c6e
A
166 .f_isfd = 1,
167 .f_detach = filt_sowdetach,
168 .f_event = filt_sowrite,
b0d623f7 169};
39236c6e 170
316670eb
A
171static struct filterops sock_filtops = {
172 .f_isfd = 1,
173 .f_detach = filt_sockdetach,
174 .f_event = filt_sockev,
175};
55e303ae 176
fe8ab488
A
177SYSCTL_DECL(_kern_ipc);
178
2d21ac55 179#define EVEN_MORE_LOCKING_DEBUG 0
fe8ab488 180
1c79356b 181int socket_debug = 0;
fe8ab488
A
182SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
183 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
184
39236c6e 185static int socket_zone = M_SOCKET;
1c79356b
A
186so_gen_t so_gencnt; /* generation count for sockets */
187
188MALLOC_DEFINE(M_SONAME, "soname", "socket name");
189MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
190
2d21ac55
A
191#define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
192#define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
193#define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
194#define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
195#define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
fe8ab488 196#define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
2d21ac55 197#define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
fe8ab488 198#define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
2d21ac55 199#define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
1c79356b 200
2d21ac55 201#define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
1c79356b 202
2d21ac55 203int somaxconn = SOMAXCONN;
39236c6e
A
204SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
205 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
1c79356b
A
206
207/* Should we get a maximum also ??? */
fa4905b1 208static int sosendmaxchain = 65536;
1c79356b 209static int sosendminchain = 16384;
55e303ae 210static int sorecvmincopy = 16384;
39236c6e
A
211SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
212 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
213SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
214 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
2d21ac55
A
215
216/*
217 * Set to enable jumbo clusters (if available) for large writes when
218 * the socket is marked with SOF_MULTIPAGES; see below.
219 */
220int sosendjcl = 1;
39236c6e
A
221SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
222 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
1c79356b 223
2d21ac55
A
224/*
225 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
226 * writes on the socket for all protocols on any network interfaces,
227 * depending upon sosendjcl above. Be extra careful when setting this
228 * to 1, because sending down packets that cross physical pages down to
229 * broken drivers (those that falsely assume that the physical pages
230 * are contiguous) might lead to system panics or silent data corruption.
231 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
232 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
233 * capable. Set this to 1 only for testing/debugging purposes.
234 */
235int sosendjcl_ignore_capab = 0;
39236c6e
A
236SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
237 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
1c79356b 238
fe8ab488
A
239int sosendbigcl_ignore_capab = 0;
240SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
241 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
242
6d2010ae
A
243int sodefunctlog = 0;
244SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
39236c6e 245 &sodefunctlog, 0, "");
6d2010ae 246
316670eb
A
247int sothrottlelog = 0;
248SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
39236c6e
A
249 &sothrottlelog, 0, "");
250
251int sorestrictrecv = 1;
252SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
253 &sorestrictrecv, 0, "Enable inbound interface restrictions");
316670eb 254
fe8ab488
A
255int sorestrictsend = 1;
256SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
257 &sorestrictsend, 0, "Enable outbound interface restrictions");
1c79356b 258
39236c6e 259extern struct inpcbinfo tcbinfo;
2d21ac55
A
260
261/* TODO: these should be in header file */
262extern int get_inpcb_str_size(void);
263extern int get_tcp_str_size(void);
2d21ac55 264
39236c6e
A
265static unsigned int sl_zone_size; /* size of sockaddr_list */
266static struct zone *sl_zone; /* zone for sockaddr_list */
267
268static unsigned int se_zone_size; /* size of sockaddr_entry */
269static struct zone *se_zone; /* zone for sockaddr_entry */
91447636
A
270
271vm_size_t so_cache_zone_element_size;
272
39236c6e 273static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, user_ssize_t *);
2d21ac55
A
274static void cached_sock_alloc(struct socket **, int);
275static void cached_sock_free(struct socket *);
91447636 276
316670eb
A
277/*
278 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
279 * setting the DSCP code on the packet based on the service class; see
280 * <rdar://problem/11277343> for details.
281 */
282__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
6d2010ae 283SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
39236c6e 284 &sotcdb, 0, "");
91447636 285
2d21ac55
A
286void
287socketinit(void)
1c79356b 288{
fe8ab488
A
289 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
290 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
291
39236c6e 292 if (socketinit_done) {
91447636
A
293 printf("socketinit: already called...\n");
294 return;
295 }
39236c6e 296 socketinit_done = 1;
91447636 297
39236c6e
A
298 PE_parse_boot_argn("socket_debug", &socket_debug,
299 sizeof (socket_debug));
2d21ac55 300
91447636
A
301 /*
302 * allocate lock group attribute and group for socket cache mutex
303 */
304 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
2d21ac55
A
305 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
306 so_cache_mtx_grp_attr);
307
91447636
A
308 /*
309 * allocate the lock attribute for socket cache mutex
310 */
311 so_cache_mtx_attr = lck_attr_alloc_init();
91447636 312
2d21ac55
A
313 /* cached sockets mutex */
314 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
39236c6e
A
315 if (so_cache_mtx == NULL) {
316 panic("%s: unable to allocate so_cache_mtx\n", __func__);
317 /* NOTREACHED */
318 }
319 STAILQ_INIT(&so_cache_head);
1c79356b 320
39236c6e
A
321 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
322 + get_inpcb_str_size() + 4 + get_tcp_str_size());
2d21ac55 323
39236c6e
A
324 so_cache_zone = zinit(so_cache_zone_element_size,
325 (120000 * so_cache_zone_element_size), 8192, "socache zone");
6d2010ae 326 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
0b4c1975 327 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
1c79356b 328
39236c6e
A
329 sl_zone_size = sizeof (struct sockaddr_list);
330 if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
331 "sockaddr_list")) == NULL) {
332 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
333 /* NOTREACHED */
334 }
335 zone_change(sl_zone, Z_CALLERACCT, FALSE);
336 zone_change(sl_zone, Z_EXPAND, TRUE);
337
338 se_zone_size = sizeof (struct sockaddr_entry);
339 if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
340 "sockaddr_entry")) == NULL) {
341 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
342 /* NOTREACHED */
343 }
344 zone_change(se_zone, Z_CALLERACCT, FALSE);
345 zone_change(se_zone, Z_EXPAND, TRUE);
6d2010ae 346
316670eb 347
39236c6e
A
348 in_pcbinit();
349 sflt_init();
6d2010ae 350 socket_tclass_init();
39236c6e
A
351#if MULTIPATH
352 mp_pcbinit();
353#endif /* MULTIPATH */
1c79356b
A
354}
355
2d21ac55
A
356static void
357cached_sock_alloc(struct socket **so, int waitok)
1c79356b 358{
2d21ac55 359 caddr_t temp;
39236c6e 360 uintptr_t offset;
1c79356b 361
91447636
A
362 lck_mtx_lock(so_cache_mtx);
363
39236c6e
A
364 if (!STAILQ_EMPTY(&so_cache_head)) {
365 VERIFY(cached_sock_count > 0);
1c79356b 366
39236c6e
A
367 *so = STAILQ_FIRST(&so_cache_head);
368 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
369 STAILQ_NEXT((*so), so_cache_ent) = NULL;
91447636 370
39236c6e 371 cached_sock_count--;
91447636 372 lck_mtx_unlock(so_cache_mtx);
1c79356b 373
2d21ac55
A
374 temp = (*so)->so_saved_pcb;
375 bzero((caddr_t)*so, sizeof (struct socket));
39236c6e 376
2d21ac55 377 (*so)->so_saved_pcb = temp;
2d21ac55 378 } else {
1c79356b 379
2d21ac55 380 lck_mtx_unlock(so_cache_mtx);
1c79356b 381
2d21ac55
A
382 if (waitok)
383 *so = (struct socket *)zalloc(so_cache_zone);
384 else
385 *so = (struct socket *)zalloc_noblock(so_cache_zone);
1c79356b 386
39236c6e 387 if (*so == NULL)
2d21ac55 388 return;
1c79356b 389
2d21ac55 390 bzero((caddr_t)*so, sizeof (struct socket));
1c79356b 391
2d21ac55 392 /*
39236c6e
A
393 * Define offsets for extra structures into our
394 * single block of memory. Align extra structures
395 * on longword boundaries.
2d21ac55 396 */
b0d623f7 397
39236c6e 398 offset = (uintptr_t)*so;
2d21ac55 399 offset += sizeof (struct socket);
b0d623f7
A
400
401 offset = ALIGN(offset);
402
2d21ac55
A
403 (*so)->so_saved_pcb = (caddr_t)offset;
404 offset += get_inpcb_str_size();
b0d623f7
A
405
406 offset = ALIGN(offset);
1c79356b 407
316670eb 408 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
2d21ac55 409 (caddr_t)offset;
2d21ac55 410 }
1c79356b 411
39236c6e 412 (*so)->cached_in_sock_layer = true;
1c79356b
A
413}
414
2d21ac55
A
415static void
416cached_sock_free(struct socket *so)
1c79356b 417{
1c79356b 418
91447636 419 lck_mtx_lock(so_cache_mtx);
1c79356b 420
39236c6e 421 so_cache_time = net_uptime();
b0d623f7 422 if (++cached_sock_count > max_cached_sock_count) {
1c79356b 423 --cached_sock_count;
91447636 424 lck_mtx_unlock(so_cache_mtx);
91447636 425 zfree(so_cache_zone, so);
2d21ac55 426 } else {
1c79356b
A
427 if (so_cache_hw < cached_sock_count)
428 so_cache_hw = cached_sock_count;
429
39236c6e 430 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
1c79356b
A
431
432 so->cache_timestamp = so_cache_time;
91447636 433 lck_mtx_unlock(so_cache_mtx);
1c79356b 434 }
1c79356b
A
435}
436
39236c6e
A
437void
438so_update_last_owner_locked(struct socket *so, proc_t self)
6d2010ae 439{
39236c6e
A
440 if (so->last_pid != 0) {
441 /*
442 * last_pid and last_upid should remain zero for sockets
443 * created using sock_socket. The check above achieves that
444 */
445 if (self == PROC_NULL)
316670eb 446 self = current_proc();
39236c6e
A
447
448 if (so->last_upid != proc_uniqueid(self) ||
449 so->last_pid != proc_pid(self)) {
316670eb
A
450 so->last_upid = proc_uniqueid(self);
451 so->last_pid = proc_pid(self);
39236c6e
A
452 proc_getexecutableuuid(self, so->last_uuid,
453 sizeof (so->last_uuid));
316670eb 454 }
fe8ab488 455 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
6d2010ae
A
456 }
457}
458
39236c6e
A
459void
460so_update_policy(struct socket *so)
1c79356b 461{
39236c6e
A
462 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
463 (void) inp_update_policy(sotoinpcb(so));
464}
1c79356b 465
fe8ab488
A
#if NECP
/*
 * For PF_INET and PF_INET6 sockets, call inp_update_necp_policy() on the
 * socket's inpcb with the supplied local/remote address overrides.
 * Other domains are left untouched.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	switch (SOCK_DOM(so)) {
	case PF_INET:
	case PF_INET6:
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
		break;
	default:
		break;
	}
}
#endif /* NECP */
474
39236c6e
A
475boolean_t
476so_cache_timer(void)
477{
478 struct socket *p;
479 int n_freed = 0;
480 boolean_t rc = FALSE;
1c79356b 481
39236c6e
A
482 lck_mtx_lock(so_cache_mtx);
483 so_cache_timeouts++;
484 so_cache_time = net_uptime();
485
486 while (!STAILQ_EMPTY(&so_cache_head)) {
487 VERIFY(cached_sock_count > 0);
488 p = STAILQ_FIRST(&so_cache_head);
489 if ((so_cache_time - p->cache_timestamp) <
490 SO_CACHE_TIME_LIMIT)
2d21ac55 491 break;
1c79356b 492
39236c6e
A
493 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
494 --cached_sock_count;
1c79356b 495
91447636 496 zfree(so_cache_zone, p);
2d21ac55
A
497
498 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
499 so_cache_max_freed++;
1c79356b
A
500 break;
501 }
502 }
1c79356b 503
39236c6e
A
504 /* Schedule again if there is more to cleanup */
505 if (!STAILQ_EMPTY(&so_cache_head))
506 rc = TRUE;
507
508 lck_mtx_unlock(so_cache_mtx);
509 return (rc);
1c79356b 510}
1c79356b
A
511
512/*
513 * Get a socket structure from our zone, and initialize it.
514 * We don't implement `waitok' yet (see comments in uipc_domain.c).
515 * Note that it would probably be better to allocate socket
516 * and PCB at the same time, but I'm not convinced that all
517 * the protocols can be easily modified to do this.
518 */
519struct socket *
2d21ac55 520soalloc(int waitok, int dom, int type)
1c79356b
A
521{
522 struct socket *so;
523
2d21ac55
A
524 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
525 cached_sock_alloc(&so, waitok);
526 } else {
527 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
528 M_WAITOK);
529 if (so != NULL)
530 bzero(so, sizeof (*so));
1c79356b 531 }
2d21ac55 532 if (so != NULL) {
fe8ab488 533 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
2d21ac55
A
534 so->so_zone = socket_zone;
535#if CONFIG_MACF_SOCKET
39236c6e
A
536 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
537 if (mac_socket_label_init(so, !waitok) != 0) {
2d21ac55
A
538 sodealloc(so);
539 return (NULL);
540 }
541#endif /* MAC_SOCKET */
1c79356b
A
542 }
543
2d21ac55 544 return (so);
1c79356b
A
545}
546
547int
39236c6e
A
548socreate_internal(int dom, struct socket **aso, int type, int proto,
549 struct proc *p, uint32_t flags, struct proc *ep)
1c79356b 550{
39236c6e
A
551 struct protosw *prp;
552 struct socket *so;
553 int error = 0;
d1ecb069 554
55e303ae
A
555#if TCPDEBUG
556 extern int tcpconsdebug;
557#endif
39236c6e
A
558
559 VERIFY(aso != NULL);
560 *aso = NULL;
561
562 if (proto != 0)
1c79356b
A
563 prp = pffindproto(dom, proto, type);
564 else
565 prp = pffindtype(dom, type);
9bccf70c 566
39236c6e
A
567 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
568 if (pffinddomain(dom) == NULL)
2d21ac55 569 return (EAFNOSUPPORT);
2d21ac55 570 if (proto != 0) {
39236c6e 571 if (pffindprotonotype(dom, proto) != NULL)
2d21ac55 572 return (EPROTOTYPE);
2d21ac55 573 }
9bccf70c
A
574 return (EPROTONOSUPPORT);
575 }
1c79356b
A
576 if (prp->pr_type != type)
577 return (EPROTOTYPE);
b0d623f7 578 so = soalloc(1, dom, type);
39236c6e 579 if (so == NULL)
1c79356b
A
580 return (ENOBUFS);
581
39236c6e
A
582 if (flags & SOCF_ASYNC)
583 so->so_state |= SS_NBIO;
584#if MULTIPATH
585 if (flags & SOCF_MP_SUBFLOW) {
586 /*
587 * A multipath subflow socket is used internally in the kernel,
588 * therefore it does not have a file desciptor associated by
589 * default.
590 */
591 so->so_state |= SS_NOFDREF;
592 so->so_flags |= SOF_MP_SUBFLOW;
593 }
594#endif /* MULTIPATH */
595
1c79356b
A
596 TAILQ_INIT(&so->so_incomp);
597 TAILQ_INIT(&so->so_comp);
598 so->so_type = type;
316670eb
A
599 so->last_upid = proc_uniqueid(p);
600 so->last_pid = proc_pid(p);
39236c6e 601 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
fe8ab488 602 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
39236c6e
A
603
604 if (ep != PROC_NULL && ep != p) {
605 so->e_upid = proc_uniqueid(ep);
606 so->e_pid = proc_pid(ep);
607 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
608 so->so_flags |= SOF_DELEGATED;
609 }
1c79356b 610
316670eb 611 so->so_cred = kauth_cred_proc_ref(p);
b0d623f7 612 if (!suser(kauth_cred_get(), NULL))
39236c6e 613 so->so_state |= SS_PRIV;
b0d623f7 614
1c79356b 615 so->so_proto = prp;
39236c6e 616 so->so_rcv.sb_flags |= SB_RECV;
91447636 617 so->so_rcv.sb_so = so->so_snd.sb_so = so;
0c530ab8
A
618 so->next_lock_lr = 0;
619 so->next_unlock_lr = 0;
2d21ac55
A
620
621#if CONFIG_MACF_SOCKET
622 mac_socket_label_associate(kauth_cred_get(), so);
623#endif /* MAC_SOCKET */
624
2d21ac55 625 /*
39236c6e
A
626 * Attachment will create the per pcb lock if necessary and
627 * increase refcount for creation, make sure it's done before
628 * socket is inserted in lists.
2d21ac55
A
629 */
630 so->so_usecount++;
91447636
A
631
632 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
39236c6e 633 if (error != 0) {
2d21ac55
A
634 /*
635 * Warning:
636 * If so_pcb is not zero, the socket will be leaked,
637 * so protocol attachment handler must be coded carefuly
55e303ae 638 */
1c79356b 639 so->so_state |= SS_NOFDREF;
37839358
A
640 so->so_usecount--;
641 sofreelastref(so, 1); /* will deallocate the socket */
1c79356b
A
642 return (error);
643 }
39236c6e
A
644
645 atomic_add_32(&prp->pr_domain->dom_refs, 1);
1c79356b 646 TAILQ_INIT(&so->so_evlist);
91447636
A
647
648 /* Attach socket filters for this protocol */
649 sflt_initsock(so);
55e303ae
A
650#if TCPDEBUG
651 if (tcpconsdebug == 2)
652 so->so_options |= SO_DEBUG;
9bccf70c 653#endif
6d2010ae 654 so_set_default_traffic_class(so);
39236c6e 655
d1ecb069 656 /*
39236c6e
A
657 * If this thread or task is marked to create backgrounded sockets,
658 * mark the socket as background.
d1ecb069 659 */
39236c6e 660 if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
d1ecb069 661 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
6d2010ae
A
662 so->so_background_thread = current_thread();
663 }
664
665 switch (dom) {
316670eb 666 /*
39236c6e
A
667 * Don't mark Unix domain, system or multipath sockets as
668 * eligible for defunct by default.
669 */
6d2010ae 670 case PF_LOCAL:
316670eb 671 case PF_SYSTEM:
39236c6e 672 case PF_MULTIPATH:
6d2010ae
A
673 so->so_flags |= SOF_NODEFUNCT;
674 break;
316670eb
A
675 default:
676 break;
d1ecb069
A
677 }
678
fe8ab488
A
679 /*
680 * Entitlements can't be checked at socket creation time except if the
681 * application requested a feature guarded by a privilege (c.f., socket
682 * delegation).
683 * The priv(9) and the Sandboxing APIs are designed with the idea that
684 * a privilege check should only be triggered by a userland request.
685 * A privilege check at socket creation time is time consuming and
686 * could trigger many authorisation error messages from the security
687 * APIs.
688 */
689
1c79356b 690 *aso = so;
39236c6e 691
1c79356b
A
692 return (0);
693}
694
39236c6e
A
695/*
696 * Returns: 0 Success
697 * EAFNOSUPPORT
698 * EPROTOTYPE
699 * EPROTONOSUPPORT
700 * ENOBUFS
701 * <pru_attach>:ENOBUFS[AF_UNIX]
702 * <pru_attach>:ENOBUFS[TCP]
703 * <pru_attach>:ENOMEM[TCP]
704 * <pru_attach>:??? [other protocol families, IPSEC]
705 */
706int
707socreate(int dom, struct socket **aso, int type, int proto)
708{
709 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
710 PROC_NULL));
711}
712
713int
714socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
715{
716 int error = 0;
717 struct proc *ep = PROC_NULL;
718
719 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
720 error = ESRCH;
721 goto done;
722 }
723
724 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
725
726 /*
727 * It might not be wise to hold the proc reference when calling
728 * socreate_internal since it calls soalloc with M_WAITOK
729 */
730done:
731 if (ep != PROC_NULL)
732 proc_rele(ep);
733
734 return (error);
735}
736
2d21ac55
A
737/*
738 * Returns: 0 Success
739 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
740 * <pru_bind>:EAFNOSUPPORT Address family not supported
741 * <pru_bind>:EADDRNOTAVAIL Address not available.
742 * <pru_bind>:EINVAL Invalid argument
743 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
744 * <pru_bind>:EACCES Permission denied
745 * <pru_bind>:EADDRINUSE Address in use
746 * <pru_bind>:EAGAIN Resource unavailable, try again
747 * <pru_bind>:EPERM Operation not permitted
748 * <pru_bind>:???
749 * <sf_bind>:???
750 *
751 * Notes: It's not possible to fully enumerate the return codes above,
752 * since socket filter authors and protocol family authors may
753 * not choose to limit their error returns to those listed, even
754 * though this may result in some software operating incorrectly.
755 *
756 * The error codes which are enumerated above are those known to
757 * be returned by the tcp_usr_bind function supplied.
758 */
1c79356b 759int
39236c6e 760sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
1c79356b
A
761{
762 struct proc *p = current_proc();
91447636 763 int error = 0;
1c79356b 764
39236c6e
A
765 if (dolock)
766 socket_lock(so, 1);
767 VERIFY(so->so_usecount > 1);
768
6d2010ae 769 so_update_last_owner_locked(so, p);
39236c6e 770 so_update_policy(so);
fe8ab488
A
771
772#if NECP
773 so_update_necp_policy(so, nam, NULL);
774#endif /* NECP */
775
2d21ac55 776 /*
6d2010ae
A
777 * If this is a bind request on a socket that has been marked
778 * as inactive, reject it now before we go any further.
2d21ac55
A
779 */
780 if (so->so_flags & SOF_DEFUNCT) {
781 error = EINVAL;
39236c6e
A
782 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
783 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
784 SOCK_DOM(so), SOCK_TYPE(so), error));
2d21ac55
A
785 goto out;
786 }
787
91447636 788 /* Socket filter */
6d2010ae 789 error = sflt_bind(so, nam);
2d21ac55 790
91447636
A
791 if (error == 0)
792 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
2d21ac55 793out:
39236c6e
A
794 if (dolock)
795 socket_unlock(so, 1);
2d21ac55 796
91447636
A
797 if (error == EJUSTRETURN)
798 error = 0;
2d21ac55 799
1c79356b
A
800 return (error);
801}
802
803void
2d21ac55 804sodealloc(struct socket *so)
1c79356b 805{
316670eb
A
806 kauth_cred_unref(&so->so_cred);
807
6d2010ae
A
808 /* Remove any filters */
809 sflt_termsock(so);
810
fe8ab488
A
811#if CONTENT_FILTER
812 cfil_sock_detach(so);
813#endif /* CONTENT_FILTER */
814
39236c6e
A
815 /* Delete the state allocated for msg queues on a socket */
816 if (so->so_flags & SOF_ENABLE_MSGS) {
817 FREE(so->so_msg_state, M_TEMP);
818 so->so_msg_state = NULL;
819 }
820 VERIFY(so->so_msg_state == NULL);
821
fe8ab488 822 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
1c79356b 823
2d21ac55
A
824#if CONFIG_MACF_SOCKET
825 mac_socket_label_destroy(so);
826#endif /* MAC_SOCKET */
39236c6e
A
827
828 if (so->cached_in_sock_layer) {
2d21ac55
A
829 cached_sock_free(so);
830 } else {
2d21ac55 831 FREE_ZONE(so, sizeof (*so), so->so_zone);
91447636 832 }
1c79356b
A
833}
834
2d21ac55
A
835/*
836 * Returns: 0 Success
837 * EINVAL
838 * EOPNOTSUPP
839 * <pru_listen>:EINVAL[AF_UNIX]
840 * <pru_listen>:EINVAL[TCP]
841 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
842 * <pru_listen>:EINVAL[TCP] Invalid argument
843 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
844 * <pru_listen>:EACCES[TCP] Permission denied
845 * <pru_listen>:EADDRINUSE[TCP] Address in use
846 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
847 * <pru_listen>:EPERM[TCP] Operation not permitted
848 * <sf_listen>:???
849 *
850 * Notes: Other <pru_listen> returns depend on the protocol family; all
851 * <sf_listen> returns depend on what the filter author causes
852 * their filter to return.
853 */
1c79356b 854int
2d21ac55 855solisten(struct socket *so, int backlog)
1c79356b 856{
1c79356b 857 struct proc *p = current_proc();
2d21ac55 858 int error = 0;
1c79356b 859
91447636 860 socket_lock(so, 1);
39236c6e
A
861
862 so_update_last_owner_locked(so, p);
863 so_update_policy(so);
fe8ab488
A
864
865#if NECP
866 so_update_necp_policy(so, NULL, NULL);
867#endif /* NECP */
868
2d21ac55
A
869 if (so->so_proto == NULL) {
870 error = EINVAL;
871 goto out;
872 }
873 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
874 error = EOPNOTSUPP;
875 goto out;
876 }
877
878 /*
879 * If the listen request is made on a socket that is not fully
6d2010ae
A
880 * disconnected, or on a socket that has been marked as inactive,
881 * reject the request now.
2d21ac55
A
882 */
883 if ((so->so_state &
884 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
885 (so->so_flags & SOF_DEFUNCT)) {
886 error = EINVAL;
6d2010ae 887 if (so->so_flags & SOF_DEFUNCT) {
39236c6e
A
888 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
889 "(%d)\n", __func__, proc_pid(p),
890 (uint64_t)VM_KERNEL_ADDRPERM(so),
891 SOCK_DOM(so), SOCK_TYPE(so), error));
6d2010ae 892 }
2d21ac55
A
893 goto out;
894 }
895
39236c6e 896 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
2d21ac55
A
897 error = EPERM;
898 goto out;
899 }
900
6d2010ae 901 error = sflt_listen(so);
39236c6e 902 if (error == 0)
91447636 903 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
2d21ac55 904
1c79356b 905 if (error) {
91447636
A
906 if (error == EJUSTRETURN)
907 error = 0;
2d21ac55 908 goto out;
1c79356b 909 }
2d21ac55 910
91447636 911 if (TAILQ_EMPTY(&so->so_comp))
1c79356b 912 so->so_options |= SO_ACCEPTCONN;
2d21ac55
A
913 /*
914 * POSIX: The implementation may have an upper limit on the length of
915 * the listen queue-either global or per accepting socket. If backlog
916 * exceeds this limit, the length of the listen queue is set to the
917 * limit.
918 *
919 * If listen() is called with a backlog argument value that is less
920 * than 0, the function behaves as if it had been called with a backlog
921 * argument value of 0.
922 *
923 * A backlog argument of 0 may allow the socket to accept connections,
924 * in which case the length of the listen queue may be set to an
925 * implementation-defined minimum value.
926 */
927 if (backlog <= 0 || backlog > somaxconn)
1c79356b 928 backlog = somaxconn;
1c79356b 929
2d21ac55
A
930 so->so_qlimit = backlog;
931out:
91447636 932 socket_unlock(so, 1);
2d21ac55 933 return (error);
1c79356b
A
934}
935
1c79356b 936void
2d21ac55 937sofreelastref(struct socket *so, int dealloc)
9bccf70c 938{
1c79356b
A
939 struct socket *head = so->so_head;
940
2d21ac55 941 /* Assume socket is locked */
1c79356b 942
39236c6e 943 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
0b4e3aa0
A
944 selthreadclear(&so->so_snd.sb_sel);
945 selthreadclear(&so->so_rcv.sb_sel);
39236c6e
A
946 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
947 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
fe8ab488 948 so->so_event = sonullevent;
1c79356b 949 return;
0b4e3aa0 950 }
9bccf70c 951 if (head != NULL) {
91447636 952 socket_lock(head, 1);
9bccf70c
A
953 if (so->so_state & SS_INCOMP) {
954 TAILQ_REMOVE(&head->so_incomp, so, so_list);
955 head->so_incqlen--;
956 } else if (so->so_state & SS_COMP) {
957 /*
958 * We must not decommission a socket that's
959 * on the accept(2) queue. If we do, then
960 * accept(2) may hang after select(2) indicated
961 * that the listening socket was ready.
962 */
9bccf70c
A
963 selthreadclear(&so->so_snd.sb_sel);
964 selthreadclear(&so->so_rcv.sb_sel);
39236c6e
A
965 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
966 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
fe8ab488 967 so->so_event = sonullevent;
91447636 968 socket_unlock(head, 1);
9bccf70c
A
969 return;
970 } else {
971 panic("sofree: not queued");
972 }
1c79356b 973 head->so_qlen--;
9bccf70c 974 so->so_state &= ~SS_INCOMP;
1c79356b 975 so->so_head = NULL;
91447636 976 socket_unlock(head, 1);
1c79356b 977 }
39236c6e 978 sowflush(so);
1c79356b 979 sorflush(so);
2d21ac55 980
39236c6e
A
981#if FLOW_DIVERT
982 if (so->so_flags & SOF_FLOW_DIVERT) {
983 flow_divert_detach(so);
984 }
985#endif /* FLOW_DIVERT */
986
91447636
A
987 /* 3932268: disable upcall */
988 so->so_rcv.sb_flags &= ~SB_UPCALL;
989 so->so_snd.sb_flags &= ~SB_UPCALL;
fe8ab488 990 so->so_event = sonullevent;
2d21ac55 991
91447636
A
992 if (dealloc)
993 sodealloc(so);
1c79356b
A
994}
995
2d21ac55
A
996void
997soclose_wait_locked(struct socket *so)
998{
999 lck_mtx_t *mutex_held;
1000
1001 if (so->so_proto->pr_getlock != NULL)
1002 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1003 else
1004 mutex_held = so->so_proto->pr_domain->dom_mtx;
1005 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1006
4a3eedf9
A
1007 /*
1008 * Double check here and return if there's no outstanding upcall;
1009 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1010 */
316670eb 1011 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
2d21ac55 1012 return;
316670eb
A
1013 so->so_rcv.sb_flags &= ~SB_UPCALL;
1014 so->so_snd.sb_flags &= ~SB_UPCALL;
2d21ac55 1015 so->so_flags |= SOF_CLOSEWAIT;
39236c6e 1016 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
2d21ac55
A
1017 "soclose_wait_locked", NULL);
1018 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1019 so->so_flags &= ~SOF_CLOSEWAIT;
1020}
1021
1c79356b
A
1022/*
1023 * Close a socket on last file table reference removal.
1024 * Initiate disconnect if connected.
1025 * Free socket when disconnect complete.
1026 */
1027int
2d21ac55 1028soclose_locked(struct socket *so)
1c79356b 1029{
1c79356b 1030 int error = 0;
2d21ac55 1031 lck_mtx_t *mutex_held;
91447636 1032 struct timespec ts;
1c79356b 1033
91447636 1034 if (so->so_usecount == 0) {
2d21ac55 1035 panic("soclose: so=%p refcount=0\n", so);
39236c6e 1036 /* NOTREACHED */
1c79356b
A
1037 }
1038
91447636 1039 sflt_notify(so, sock_evt_closing, NULL);
2d21ac55 1040
39236c6e
A
1041 if (so->so_upcallusecount)
1042 soclose_wait_locked(so);
1043
fe8ab488
A
1044#if CONTENT_FILTER
1045 /*
1046 * We have to wait until the content filters are done
1047 */
1048 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1049 cfil_sock_close_wait(so);
1050 cfil_sock_is_closed(so);
1051 cfil_sock_detach(so);
1052 }
1053#endif /* CONTENT_FILTER */
1054
91447636 1055 if ((so->so_options & SO_ACCEPTCONN)) {
2d21ac55
A
1056 struct socket *sp, *sonext;
1057 int socklock = 0;
1058
1059 /*
1060 * We do not want new connection to be added
1061 * to the connection queues
1062 */
91447636 1063 so->so_options &= ~SO_ACCEPTCONN;
2d21ac55 1064
39236c6e
A
1065 for (sp = TAILQ_FIRST(&so->so_incomp);
1066 sp != NULL; sp = sonext) {
2d21ac55
A
1067 sonext = TAILQ_NEXT(sp, so_list);
1068
39236c6e
A
1069 /*
1070 * Radar 5350314
2d21ac55
A
1071 * skip sockets thrown away by tcpdropdropblreq
1072 * they will get cleanup by the garbage collection.
1073 * otherwise, remove the incomp socket from the queue
1074 * and let soabort trigger the appropriate cleanup.
91447636 1075 */
39236c6e 1076 if (sp->so_flags & SOF_OVERFLOW)
2d21ac55
A
1077 continue;
1078
ff6e181a 1079 if (so->so_proto->pr_getlock != NULL) {
39236c6e
A
1080 /*
1081 * Lock ordering for consistency with the
1082 * rest of the stack, we lock the socket
1083 * first and then grabb the head.
2d21ac55 1084 */
91447636 1085 socket_unlock(so, 0);
ff6e181a 1086 socket_lock(sp, 1);
ff6e181a 1087 socket_lock(so, 0);
39236c6e 1088 socklock = 1;
2d21ac55
A
1089 }
1090
1091 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1092 so->so_incqlen--;
1093
1094 if (sp->so_state & SS_INCOMP) {
1095 sp->so_state &= ~SS_INCOMP;
1096 sp->so_head = NULL;
1097
1098 (void) soabort(sp);
ff6e181a 1099 }
2d21ac55 1100
39236c6e 1101 if (socklock)
2d21ac55 1102 socket_unlock(sp, 1);
91447636
A
1103 }
1104
1105 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
91447636 1106 /* Dequeue from so_comp since sofree() won't do it */
2d21ac55 1107 TAILQ_REMOVE(&so->so_comp, sp, so_list);
91447636 1108 so->so_qlen--;
ff6e181a
A
1109
1110 if (so->so_proto->pr_getlock != NULL) {
1111 socket_unlock(so, 0);
1112 socket_lock(sp, 1);
1113 }
1114
2d21ac55
A
1115 if (sp->so_state & SS_COMP) {
1116 sp->so_state &= ~SS_COMP;
1117 sp->so_head = NULL;
1118
1119 (void) soabort(sp);
1120 }
91447636 1121
ff6e181a 1122 if (so->so_proto->pr_getlock != NULL) {
91447636 1123 socket_unlock(sp, 1);
ff6e181a
A
1124 socket_lock(so, 0);
1125 }
91447636
A
1126 }
1127 }
39236c6e 1128 if (so->so_pcb == NULL) {
91447636
A
1129 /* 3915887: mark the socket as ready for dealloc */
1130 so->so_flags |= SOF_PCBCLEARING;
1c79356b 1131 goto discard;
91447636 1132 }
1c79356b
A
1133 if (so->so_state & SS_ISCONNECTED) {
1134 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
91447636 1135 error = sodisconnectlocked(so);
1c79356b
A
1136 if (error)
1137 goto drop;
1138 }
1139 if (so->so_options & SO_LINGER) {
1140 if ((so->so_state & SS_ISDISCONNECTING) &&
1141 (so->so_state & SS_NBIO))
1142 goto drop;
2d21ac55 1143 if (so->so_proto->pr_getlock != NULL)
91447636 1144 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2d21ac55 1145 else
91447636 1146 mutex_held = so->so_proto->pr_domain->dom_mtx;
1c79356b 1147 while (so->so_state & SS_ISCONNECTED) {
91447636 1148 ts.tv_sec = (so->so_linger/100);
2d21ac55
A
1149 ts.tv_nsec = (so->so_linger % 100) *
1150 NSEC_PER_USEC * 1000 * 10;
1151 error = msleep((caddr_t)&so->so_timeo,
1152 mutex_held, PSOCK | PCATCH, "soclose", &ts);
91447636 1153 if (error) {
2d21ac55
A
1154 /*
1155 * It's OK when the time fires,
1156 * don't report an error
1157 */
91447636
A
1158 if (error == EWOULDBLOCK)
1159 error = 0;
1c79356b 1160 break;
91447636 1161 }
1c79356b
A
1162 }
1163 }
1164 }
1165drop:
39236c6e 1166 if (so->so_usecount == 0) {
2d21ac55 1167 panic("soclose: usecount is zero so=%p\n", so);
39236c6e
A
1168 /* NOTREACHED */
1169 }
1170 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1c79356b
A
1171 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1172 if (error == 0)
1173 error = error2;
1174 }
39236c6e 1175 if (so->so_usecount <= 0) {
2d21ac55 1176 panic("soclose: usecount is zero so=%p\n", so);
39236c6e
A
1177 /* NOTREACHED */
1178 }
1c79356b 1179discard:
39236c6e
A
1180 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1181 (so->so_state & SS_NOFDREF)) {
1c79356b 1182 panic("soclose: NOFDREF");
39236c6e
A
1183 /* NOTREACHED */
1184 }
1c79356b 1185 so->so_state |= SS_NOFDREF;
39236c6e
A
1186
1187 if (so->so_flags & SOF_MP_SUBFLOW)
1188 so->so_flags &= ~SOF_MP_SUBFLOW;
1189
316670eb
A
1190 if ((so->so_flags & SOF_KNOTE) != 0)
1191 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
39236c6e
A
1192
1193 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1c79356b 1194 evsofree(so);
39236c6e 1195
91447636 1196 so->so_usecount--;
1c79356b 1197 sofree(so);
1c79356b
A
1198 return (error);
1199}
1200
91447636 1201int
2d21ac55 1202soclose(struct socket *so)
91447636
A
1203{
1204 int error = 0;
1205 socket_lock(so, 1);
2d21ac55 1206
2d21ac55 1207 if (so->so_retaincnt == 0) {
91447636 1208 error = soclose_locked(so);
2d21ac55
A
1209 } else {
1210 /*
1211 * if the FD is going away, but socket is
1212 * retained in kernel remove its reference
1213 */
91447636
A
1214 so->so_usecount--;
1215 if (so->so_usecount < 2)
2d21ac55
A
1216 panic("soclose: retaincnt non null and so=%p "
1217 "usecount=%d\n", so, so->so_usecount);
91447636
A
1218 }
1219 socket_unlock(so, 1);
1220 return (error);
1221}
1222
1c79356b
A
1223/*
1224 * Must be called at splnet...
1225 */
2d21ac55 1226/* Should already be locked */
1c79356b 1227int
2d21ac55 1228soabort(struct socket *so)
1c79356b 1229{
9bccf70c 1230 int error;
1c79356b 1231
91447636 1232#ifdef MORE_LOCKING_DEBUG
2d21ac55 1233 lck_mtx_t *mutex_held;
91447636 1234
2d21ac55 1235 if (so->so_proto->pr_getlock != NULL)
91447636 1236 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2d21ac55 1237 else
91447636
A
1238 mutex_held = so->so_proto->pr_domain->dom_mtx;
1239 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1240#endif
1241
2d21ac55
A
1242 if ((so->so_flags & SOF_ABORTED) == 0) {
1243 so->so_flags |= SOF_ABORTED;
1244 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1245 if (error) {
1246 sofree(so);
1247 return (error);
1248 }
9bccf70c
A
1249 }
1250 return (0);
1c79356b
A
1251}
1252
1253int
2d21ac55 1254soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
9bccf70c 1255{
1c79356b 1256 int error;
91447636 1257
2d21ac55
A
1258 if (dolock)
1259 socket_lock(so, 1);
1c79356b 1260
39236c6e
A
1261 so_update_last_owner_locked(so, PROC_NULL);
1262 so_update_policy(so);
fe8ab488
A
1263#if NECP
1264 so_update_necp_policy(so, NULL, NULL);
1265#endif /* NECP */
39236c6e 1266
1c79356b
A
1267 if ((so->so_state & SS_NOFDREF) == 0)
1268 panic("soaccept: !NOFDREF");
1269 so->so_state &= ~SS_NOFDREF;
1270 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
2d21ac55
A
1271
1272 if (dolock)
1273 socket_unlock(so, 1);
1c79356b
A
1274 return (error);
1275}
2d21ac55 1276
/*
 * Locking version of soacceptlock(): takes and releases the socket
 * lock around the accept.
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	const int dolock = 1;

	return (soacceptlock(so, nam, dolock));
}
1c79356b
A
1282
1283int
2d21ac55
A
1284soacceptfilter(struct socket *so)
1285{
1286 struct sockaddr *local = NULL, *remote = NULL;
6d2010ae 1287 int error = 0;
2d21ac55
A
1288 struct socket *head = so->so_head;
1289
1290 /*
39236c6e
A
1291 * Hold the lock even if this socket has not been made visible
1292 * to the filter(s). For sockets with global locks, this protects
1293 * against the head or peer going away
2d21ac55 1294 */
b0d623f7
A
1295 socket_lock(so, 1);
1296 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1297 sogetaddr_locked(so, &local, 0) != 0) {
2d21ac55
A
1298 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1299 so->so_head = NULL;
b0d623f7 1300 socket_unlock(so, 1);
2d21ac55
A
1301 soclose(so);
1302 /* Out of resources; try it again next time */
1303 error = ECONNABORTED;
1304 goto done;
1305 }
1306
6d2010ae 1307 error = sflt_accept(head, so, local, remote);
2d21ac55
A
1308
1309 /*
1310 * If we get EJUSTRETURN from one of the filters, mark this socket
1311 * as inactive and return it anyway. This newly accepted socket
1312 * will be disconnected later before we hand it off to the caller.
1313 */
1314 if (error == EJUSTRETURN) {
1315 error = 0;
6d2010ae
A
1316 (void) sosetdefunct(current_proc(), so,
1317 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
2d21ac55
A
1318 }
1319
1320 if (error != 0) {
1321 /*
1322 * This may seem like a duplication to the above error
1323 * handling part when we return ECONNABORTED, except
1324 * the following is done while holding the lock since
1325 * the socket has been exposed to the filter(s) earlier.
1326 */
1327 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1328 so->so_head = NULL;
1329 socket_unlock(so, 1);
1330 soclose(so);
1331 /* Propagate socket filter's error code to the caller */
1332 } else {
1333 socket_unlock(so, 1);
1334 }
1335done:
1336 /* Callee checks for NULL pointer */
1337 sock_freeaddr(remote);
1338 sock_freeaddr(local);
1339 return (error);
1340}
1c79356b 1341
2d21ac55
A
1342/*
1343 * Returns: 0 Success
1344 * EOPNOTSUPP Operation not supported on socket
1345 * EISCONN Socket is connected
1346 * <pru_connect>:EADDRNOTAVAIL Address not available.
1347 * <pru_connect>:EINVAL Invalid argument
1348 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1349 * <pru_connect>:EACCES Permission denied
1350 * <pru_connect>:EADDRINUSE Address in use
1351 * <pru_connect>:EAGAIN Resource unavailable, try again
1352 * <pru_connect>:EPERM Operation not permitted
1353 * <sf_connect_out>:??? [anything a filter writer might set]
1354 */
1355int
1356soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1c79356b 1357{
1c79356b
A
1358 int error;
1359 struct proc *p = current_proc();
1c79356b 1360
2d21ac55
A
1361 if (dolock)
1362 socket_lock(so, 1);
39236c6e
A
1363
1364 so_update_last_owner_locked(so, p);
1365 so_update_policy(so);
1366
fe8ab488
A
1367#if NECP
1368 so_update_necp_policy(so, NULL, nam);
1369#endif /* NECP */
1370
2d21ac55
A
1371 /*
1372 * If this is a listening socket or if this is a previously-accepted
1373 * socket that has been marked as inactive, reject the connect request.
1374 */
1375 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
6d2010ae
A
1376 error = EOPNOTSUPP;
1377 if (so->so_flags & SOF_DEFUNCT) {
39236c6e
A
1378 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1379 "(%d)\n", __func__, proc_pid(p),
1380 (uint64_t)VM_KERNEL_ADDRPERM(so),
1381 SOCK_DOM(so), SOCK_TYPE(so), error));
6d2010ae 1382 }
2d21ac55
A
1383 if (dolock)
1384 socket_unlock(so, 1);
6d2010ae 1385 return (error);
91447636 1386 }
2d21ac55 1387
39236c6e 1388 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
2d21ac55
A
1389 if (dolock)
1390 socket_unlock(so, 1);
1391 return (EPERM);
1392 }
1393
1c79356b
A
1394 /*
1395 * If protocol is connection-based, can only connect once.
1396 * Otherwise, if connected, try to disconnect first.
1397 * This allows user to disconnect by connecting to, e.g.,
1398 * a null address.
1399 */
1400 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1401 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
2d21ac55 1402 (error = sodisconnectlocked(so)))) {
1c79356b 1403 error = EISCONN;
2d21ac55 1404 } else {
91447636
A
1405 /*
1406 * Run connect filter before calling protocol:
1407 * - non-blocking connect returns before completion;
1408 */
6d2010ae 1409 error = sflt_connectout(so, nam);
39236c6e 1410 if (error != 0) {
91447636
A
1411 if (error == EJUSTRETURN)
1412 error = 0;
6d2010ae 1413 } else {
39236c6e
A
1414 error = (*so->so_proto->pr_usrreqs->pru_connect)
1415 (so, nam, p);
91447636 1416 }
1c79356b 1417 }
2d21ac55
A
1418 if (dolock)
1419 socket_unlock(so, 1);
1c79356b
A
1420 return (error);
1421}
1422
/*
 * Locking version of soconnectlock(): takes and releases the socket
 * lock around the connect.
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	const int dolock = 1;

	return (soconnectlock(so, nam, dolock));
}
1428
2d21ac55
A
1429/*
1430 * Returns: 0 Success
1431 * <pru_connect2>:EINVAL[AF_UNIX]
1432 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1433 * <pru_connect2>:??? [other protocol families]
1434 *
1435 * Notes: <pru_connect2> is not supported by [TCP].
1436 */
1c79356b 1437int
2d21ac55 1438soconnect2(struct socket *so1, struct socket *so2)
1c79356b 1439{
1c79356b 1440 int error;
91447636 1441
0c530ab8 1442 socket_lock(so1, 1);
2d21ac55 1443 if (so2->so_proto->pr_lock)
0c530ab8 1444 socket_lock(so2, 1);
1c79356b
A
1445
1446 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
2d21ac55 1447
0c530ab8 1448 socket_unlock(so1, 1);
2d21ac55 1449 if (so2->so_proto->pr_lock)
0c530ab8 1450 socket_unlock(so2, 1);
1c79356b
A
1451 return (error);
1452}
1453
39236c6e
A
1454int
1455soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1456 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1457 associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
1458 uint32_t arglen)
1459{
1460 int error;
1461
fe8ab488
A
1462 so_update_last_owner_locked(so, p);
1463 so_update_policy(so);
1464
39236c6e
A
1465 /*
1466 * If this is a listening socket or if this is a previously-accepted
1467 * socket that has been marked as inactive, reject the connect request.
1468 */
1469 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1470 error = EOPNOTSUPP;
1471 if (so->so_flags & SOF_DEFUNCT) {
1472 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1473 "(%d)\n", __func__, proc_pid(p),
1474 (uint64_t)VM_KERNEL_ADDRPERM(so),
1475 SOCK_DOM(so), SOCK_TYPE(so), error));
1476 }
1477 return (error);
1478 }
1479
1480 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1481 return (EPERM);
1482
1483 /*
1484 * If protocol is connection-based, can only connect once
1485 * unless PR_MULTICONN is set. Otherwise, if connected,
1486 * try to disconnect first. This allows user to disconnect
1487 * by connecting to, e.g., a null address.
1488 */
1489 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1490 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1491 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1492 (error = sodisconnectlocked(so)) != 0)) {
1493 error = EISCONN;
1494 } else {
1495 /*
1496 * Run connect filter before calling protocol:
1497 * - non-blocking connect returns before completion;
1498 */
1499 error = sflt_connectxout(so, dst_sl);
1500 if (error != 0) {
1501 if (error == EJUSTRETURN)
1502 error = 0;
1503 } else {
1504 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1505 (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1506 flags, arg, arglen);
1507 }
1508 }
1509
1510 return (error);
1511}
1512
1c79356b 1513int
2d21ac55 1514sodisconnectlocked(struct socket *so)
1c79356b 1515{
1c79356b 1516 int error;
1c79356b
A
1517
1518 if ((so->so_state & SS_ISCONNECTED) == 0) {
1519 error = ENOTCONN;
1520 goto bad;
1521 }
1522 if (so->so_state & SS_ISDISCONNECTING) {
1523 error = EALREADY;
1524 goto bad;
1525 }
2d21ac55 1526
1c79356b 1527 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
39236c6e 1528 if (error == 0)
91447636 1529 sflt_notify(so, sock_evt_disconnected, NULL);
39236c6e 1530
1c79356b 1531bad:
1c79356b
A
1532 return (error);
1533}
2d21ac55
A
1534
/*
 * Locking version of sodisconnectlocked(): holds the socket lock for
 * the duration of the disconnect.
 */
int
sodisconnect(struct socket *so)
{
	int rv;

	socket_lock(so, 1);
	rv = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return (rv);
}
1c79356b 1546
39236c6e
A
1547int
1548sodisconnectxlocked(struct socket *so, associd_t aid, connid_t cid)
1549{
1550 int error;
1551
1552 /*
1553 * Call the protocol disconnectx handler; let it handle all
1554 * matters related to the connection state of this session.
1555 */
1556 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1557 if (error == 0) {
1558 /*
1559 * The event applies only for the session, not for
1560 * the disconnection of individual subflows.
1561 */
1562 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1563 sflt_notify(so, sock_evt_disconnected, NULL);
1564 }
1565 return (error);
1566}
1567
1568int
1569sodisconnectx(struct socket *so, associd_t aid, connid_t cid)
1570{
1571 int error;
1572
1573 socket_lock(so, 1);
1574 error = sodisconnectxlocked(so, aid, cid);
1575 socket_unlock(so, 1);
1576 return (error);
1577}
1578
1579int
1580sopeelofflocked(struct socket *so, associd_t aid, struct socket **psop)
1581{
1582 return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1583}
1584
/*
 * Translate message flags into a wait argument: 0 when MSG_DONTWAIT is
 * set in (f), SBL_WAIT otherwise.
 */
#define	SBLOCKWAIT(f)	((((f) & MSG_DONTWAIT) != 0) ? 0 : SBL_WAIT)
91447636
A
1586
1587/*
1588 * sosendcheck will lock the socket buffer if it isn't locked and
1589 * verify that there is space for the data being inserted.
2d21ac55
A
1590 *
1591 * Returns: 0 Success
1592 * EPIPE
1593 * sblock:EWOULDBLOCK
1594 * sblock:EINTR
1595 * sbwait:EBADF
1596 * sbwait:EINTR
1597 * [so_error]:???
91447636 1598 */
39236c6e
A
1599int
1600sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1601 int32_t clen, int32_t atomic, int flags, int *sblocked,
1602 struct mbuf *control)
91447636 1603{
39236c6e 1604 int error = 0;
b0d623f7 1605 int32_t space;
3a60a9f5 1606 int assumelock = 0;
91447636
A
1607
1608restart:
1609 if (*sblocked == 0) {
3a60a9f5 1610 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
2d21ac55
A
1611 so->so_send_filt_thread != 0 &&
1612 so->so_send_filt_thread == current_thread()) {
3a60a9f5
A
1613 /*
1614 * We're being called recursively from a filter,
1615 * allow this to continue. Radar 4150520.
1616 * Don't set sblocked because we don't want
1617 * to perform an unlock later.
1618 */
1619 assumelock = 1;
2d21ac55 1620 } else {
3a60a9f5
A
1621 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1622 if (error) {
6d2010ae
A
1623 if (so->so_flags & SOF_DEFUNCT)
1624 goto defunct;
2d21ac55 1625 return (error);
3a60a9f5
A
1626 }
1627 *sblocked = 1;
1628 }
91447636 1629 }
2d21ac55
A
1630
1631 /*
6d2010ae
A
1632 * If a send attempt is made on a socket that has been marked
1633 * as inactive (disconnected), reject the request.
2d21ac55 1634 */
6d2010ae
A
1635 if (so->so_flags & SOF_DEFUNCT) {
1636defunct:
1637 error = EPIPE;
39236c6e
A
1638 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1639 __func__, proc_selfpid(), (uint64_t)VM_KERNEL_ADDRPERM(so),
1640 SOCK_DOM(so), SOCK_TYPE(so), error));
6d2010ae
A
1641 return (error);
1642 }
2d21ac55 1643
fe8ab488
A
1644 if (so->so_state & SS_CANTSENDMORE) {
1645#if CONTENT_FILTER
1646 /*
1647 * Can re-inject data of half closed connections
1648 */
1649 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1650 so->so_snd.sb_cfil_thread == current_thread() &&
1651 cfil_sock_data_pending(&so->so_snd) != 0)
1652 CFIL_LOG(LOG_INFO,
1653 "so %llx ignore SS_CANTSENDMORE",
1654 (uint64_t)VM_KERNEL_ADDRPERM(so));
1655 else
1656#endif /* CONTENT_FILTER */
1657 return (EPIPE);
1658 }
91447636
A
1659 if (so->so_error) {
1660 error = so->so_error;
1661 so->so_error = 0;
2d21ac55 1662 return (error);
91447636 1663 }
2d21ac55 1664
91447636 1665 if ((so->so_state & SS_ISCONNECTED) == 0) {
2d21ac55 1666 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
fe8ab488
A
1667 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1668 (resid != 0 || clen == 0)) {
1669#if MPTCP
1670 /*
1671 * MPTCP Fast Join sends data before the
1672 * socket is truly connected.
1673 */
1674 if ((so->so_flags & (SOF_MP_SUBFLOW |
1675 SOF_MPTCP_FASTJOIN)) !=
1676 (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1677#endif /* MPTCP */
2d21ac55 1678 return (ENOTCONN);
fe8ab488 1679 }
2d21ac55
A
1680 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1681 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1682 ENOTCONN : EDESTADDRREQ);
1683 }
91447636 1684 }
39236c6e
A
1685 if (so->so_flags & SOF_ENABLE_MSGS)
1686 space = msgq_sbspace(so, control);
1687 else
1688 space = sbspace(&so->so_snd);
1689
91447636
A
1690 if (flags & MSG_OOB)
1691 space += 1024;
1692 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2d21ac55
A
1693 clen > so->so_snd.sb_hiwat)
1694 return (EMSGSIZE);
39236c6e 1695
316670eb
A
1696 if ((space < resid + clen &&
1697 (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) ||
1698 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2d21ac55
A
1699 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1700 assumelock) {
1701 return (EWOULDBLOCK);
3a60a9f5 1702 }
39236c6e 1703 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
6d2010ae 1704 *sblocked = 0;
91447636
A
1705 error = sbwait(&so->so_snd);
1706 if (error) {
6d2010ae
A
1707 if (so->so_flags & SOF_DEFUNCT)
1708 goto defunct;
2d21ac55 1709 return (error);
91447636
A
1710 }
1711 goto restart;
1712 }
2d21ac55 1713 return (0);
91447636
A
1714}
1715
1c79356b
A
1716/*
1717 * Send on a socket.
1718 * If send must go all at once and message is larger than
1719 * send buffering, then hard error.
1720 * Lock against other senders.
1721 * If must go all at once and not enough room now, then
1722 * inform user that this would block and do nothing.
1723 * Otherwise, if nonblocking, send as much as possible.
1724 * The data to be sent is described by "uio" if nonzero,
1725 * otherwise by the mbuf chain "top" (which must be null
1726 * if uio is not). Data provided in mbuf chain must be small
1727 * enough to send all at once.
1728 *
1729 * Returns nonzero on error, timeout or signal; callers
1730 * must check for short counts if EINTR/ERESTART are returned.
1731 * Data and control buffers are freed on return.
1732 * Experiment:
1733 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1734 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1735 * point at the mbuf chain being constructed and go from there.
2d21ac55
A
1736 *
1737 * Returns: 0 Success
1738 * EOPNOTSUPP
1739 * EINVAL
1740 * ENOBUFS
1741 * uiomove:EFAULT
1742 * sosendcheck:EPIPE
1743 * sosendcheck:EWOULDBLOCK
1744 * sosendcheck:EINTR
1745 * sosendcheck:EBADF
1746 * sosendcheck:EINTR
1747 * sosendcheck:??? [value from so_error]
1748 * <pru_send>:ECONNRESET[TCP]
1749 * <pru_send>:EINVAL[TCP]
1750 * <pru_send>:ENOBUFS[TCP]
1751 * <pru_send>:EADDRINUSE[TCP]
1752 * <pru_send>:EADDRNOTAVAIL[TCP]
1753 * <pru_send>:EAFNOSUPPORT[TCP]
1754 * <pru_send>:EACCES[TCP]
1755 * <pru_send>:EAGAIN[TCP]
1756 * <pru_send>:EPERM[TCP]
1757 * <pru_send>:EMSGSIZE[TCP]
1758 * <pru_send>:EHOSTUNREACH[TCP]
1759 * <pru_send>:ENETUNREACH[TCP]
1760 * <pru_send>:ENETDOWN[TCP]
1761 * <pru_send>:ENOMEM[TCP]
1762 * <pru_send>:ENOBUFS[TCP]
1763 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1764 * <pru_send>:EINVAL[AF_UNIX]
1765 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1766 * <pru_send>:EPIPE[AF_UNIX]
1767 * <pru_send>:ENOTCONN[AF_UNIX]
1768 * <pru_send>:EISCONN[AF_UNIX]
1769 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1770 * <sf_data_out>:??? [whatever a filter author chooses]
1771 *
1772 * Notes: Other <pru_send> returns depend on the protocol family; all
1773 * <sf_data_out> returns depend on what the filter author causes
1774 * their filter to return.
1c79356b
A
1775 */
1776int
2d21ac55
A
1777sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1778 struct mbuf *top, struct mbuf *control, int flags)
1c79356b
A
1779{
1780 struct mbuf **mp;
39236c6e
A
1781 struct mbuf *m, *freelist = NULL;
1782 user_ssize_t space, len, resid;
91447636 1783 int clen = 0, error, dontroute, mlen, sendflags;
1c79356b 1784 int atomic = sosendallatonce(so) || top;
91447636 1785 int sblocked = 0;
1c79356b 1786 struct proc *p = current_proc();
39236c6e 1787 struct mbuf *control_copy = NULL;
1c79356b 1788
39236c6e 1789 if (uio != NULL)
91447636 1790 resid = uio_resid(uio);
39236c6e 1791 else
1c79356b 1792 resid = top->m_pkthdr.len;
39236c6e 1793
2d21ac55
A
1794 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1795 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1c79356b 1796
91447636 1797 socket_lock(so, 1);
fe8ab488
A
1798
1799 /*
1800 * Re-injection should not affect process accounting
1801 */
1802 if ((flags & MSG_SKIPCFIL) == 0) {
6d2010ae 1803 so_update_last_owner_locked(so, p);
39236c6e 1804 so_update_policy(so);
fe8ab488
A
1805
1806#if NECP
1807 so_update_necp_policy(so, NULL, addr);
1808#endif /* NECP */
1809 }
1810
2d21ac55
A
1811 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1812 error = EOPNOTSUPP;
1813 socket_unlock(so, 1);
1814 goto out;
1815 }
91447636 1816
1c79356b
A
1817 /*
1818 * In theory resid should be unsigned.
1819 * However, space must be signed, as it might be less than 0
1820 * if we over-committed, and we must use a signed comparison
1821 * of space and resid. On the other hand, a negative resid
1822 * causes us to loop sending 0-length segments to the protocol.
1823 *
39236c6e
A
1824 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1825 * But it will be used by sockets doing message delivery.
1826 *
fe8ab488 1827 * Note: We limit resid to be a positive int value as we use
39236c6e 1828 * imin() to set bytes_to_copy -- radr://14558484
1c79356b 1829 */
fe8ab488 1830 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
39236c6e 1831 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1c79356b 1832 error = EINVAL;
91447636 1833 socket_unlock(so, 1);
1c79356b
A
1834 goto out;
1835 }
1836
39236c6e
A
1837 dontroute = (flags & MSG_DONTROUTE) &&
1838 (so->so_options & SO_DONTROUTE) == 0 &&
1c79356b 1839 (so->so_proto->pr_flags & PR_ATOMIC);
b0d623f7 1840 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
39236c6e
A
1841
1842 if (control != NULL)
1c79356b 1843 clen = control->m_len;
1c79356b 1844
1c79356b 1845 do {
2d21ac55 1846 error = sosendcheck(so, addr, resid, clen, atomic, flags,
39236c6e
A
1847 &sblocked, control);
1848 if (error)
3a60a9f5 1849 goto release;
39236c6e 1850
1c79356b 1851 mp = &top;
39236c6e
A
1852 if (so->so_flags & SOF_ENABLE_MSGS)
1853 space = msgq_sbspace(so, control);
1854 else
1855 space = sbspace(&so->so_snd) - clen;
1856 space += ((flags & MSG_OOB) ? 1024 : 0);
fa4905b1 1857
1c79356b 1858 do {
2d21ac55 1859 if (uio == NULL) {
91447636
A
1860 /*
1861 * Data is prepackaged in "top".
1862 */
1863 resid = 0;
1c79356b
A
1864 if (flags & MSG_EOR)
1865 top->m_flags |= M_EOR;
91447636 1866 } else {
2d21ac55
A
1867 int chainlength;
1868 int bytes_to_copy;
1869 boolean_t jumbocl;
fe8ab488 1870 boolean_t bigcl;
2d21ac55 1871
b0d623f7 1872 bytes_to_copy = imin(resid, space);
2d21ac55 1873
39236c6e 1874 if (sosendminchain > 0)
91447636 1875 chainlength = 0;
39236c6e 1876 else
91447636 1877 chainlength = sosendmaxchain;
2d21ac55 1878
fe8ab488
A
1879 /*
1880 * Use big 4 KB cluster only when outgoing
1881 * interface does not want 2 LB clusters
1882 */
1883 bigcl =
1884 !(so->so_flags1 & SOF1_IF_2KCL) ||
1885 sosendbigcl_ignore_capab;
1886
2d21ac55
A
1887 /*
1888 * Attempt to use larger than system page-size
1889 * clusters for large writes only if there is
1890 * a jumbo cluster pool and if the socket is
1891 * marked accordingly.
1892 */
1893 jumbocl = sosendjcl && njcl > 0 &&
1894 ((so->so_flags & SOF_MULTIPAGES) ||
fe8ab488
A
1895 sosendjcl_ignore_capab) &&
1896 bigcl;
2d21ac55 1897
91447636 1898 socket_unlock(so, 0);
2d21ac55 1899
91447636
A
1900 do {
1901 int num_needed;
39236c6e 1902 int hdrs_needed = (top == NULL) ? 1 : 0;
2d21ac55 1903
91447636 1904 /*
2d21ac55
A
1905 * Try to maintain a local cache of mbuf
1906 * clusters needed to complete this
1907 * write.  The list is further limited to
1908 * the number that are currently needed
1909 * to fill the socket.  This mechanism
1910 * allows a large number of mbufs/
1911 * clusters to be grabbed under a single
1912 * mbuf lock.  If we can't get any
1913 * clusters, then fall back to trying
1914 * for mbufs.  If we fail early (or
1915 * miscalculate the number needed), make
1916 * sure to release any clusters we
1917 * haven't yet consumed.
91447636 1918 */
2d21ac55 1919 if (freelist == NULL &&
6d2010ae
A
1920 bytes_to_copy > MBIGCLBYTES &&
1921 jumbocl) {
2d21ac55
A
1922 num_needed =
1923 bytes_to_copy / M16KCLBYTES;
1924
1925 if ((bytes_to_copy -
1926 (num_needed * M16KCLBYTES))
1927 >= MINCLSIZE)
1928 num_needed++;
91447636 1929
2d21ac55
A
1930 freelist =
1931 m_getpackets_internal(
1932 (unsigned int *)&num_needed,
1933 hdrs_needed, M_WAIT, 0,
1934 M16KCLBYTES);
1935 /*
1936 * Fall back to 4K cluster size
1937 * if allocation failed
1938 */
1939 }
1940
1941 if (freelist == NULL &&
fe8ab488
A
1942 bytes_to_copy > MCLBYTES &&
1943 bigcl) {
2d21ac55 1944 num_needed =
6d2010ae 1945 bytes_to_copy / MBIGCLBYTES;
2d21ac55
A
1946
1947 if ((bytes_to_copy -
6d2010ae 1948 (num_needed * MBIGCLBYTES)) >=
2d21ac55 1949 MINCLSIZE)
91447636 1950 num_needed++;
2d21ac55
A
1951
1952 freelist =
1953 m_getpackets_internal(
1954 (unsigned int *)&num_needed,
1955 hdrs_needed, M_WAIT, 0,
6d2010ae 1956 MBIGCLBYTES);
2d21ac55
A
1957 /*
1958 * Fall back to cluster size
1959 * if allocation failed
1960 */
91447636 1961 }
2d21ac55
A
1962
1963 if (freelist == NULL &&
1964 bytes_to_copy > MINCLSIZE) {
1965 num_needed =
1966 bytes_to_copy / MCLBYTES;
1967
1968 if ((bytes_to_copy -
1969 (num_needed * MCLBYTES)) >=
1970 MINCLSIZE)
91447636 1971 num_needed++;
2d21ac55
A
1972
1973 freelist =
1974 m_getpackets_internal(
1975 (unsigned int *)&num_needed,
1976 hdrs_needed, M_WAIT, 0,
1977 MCLBYTES);
1978 /*
1979 * Fall back to a single mbuf
1980 * if allocation failed
1981 */
91447636 1982 }
2d21ac55 1983
91447636 1984 if (freelist == NULL) {
39236c6e 1985 if (top == NULL)
2d21ac55
A
1986 MGETHDR(freelist,
1987 M_WAIT, MT_DATA);
91447636 1988 else
2d21ac55
A
1989 MGET(freelist,
1990 M_WAIT, MT_DATA);
91447636
A
1991
1992 if (freelist == NULL) {
1993 error = ENOBUFS;
1994 socket_lock(so, 0);
3a60a9f5 1995 goto release;
91447636
A
1996 }
1997 /*
2d21ac55
A
1998 * For datagram protocols,
1999 * leave room for protocol
2000 * headers in first mbuf.
91447636 2001 */
39236c6e 2002 if (atomic && top == NULL &&
2d21ac55
A
2003 bytes_to_copy < MHLEN) {
2004 MH_ALIGN(freelist,
2005 bytes_to_copy);
2006 }
91447636
A
2007 }
2008 m = freelist;
2009 freelist = m->m_next;
2010 m->m_next = NULL;
2d21ac55 2011
91447636
A
2012 if ((m->m_flags & M_EXT))
2013 mlen = m->m_ext.ext_size;
2014 else if ((m->m_flags & M_PKTHDR))
2d21ac55
A
2015 mlen =
2016 MHLEN - m_leadingspace(m);
91447636
A
2017 else
2018 mlen = MLEN;
b0d623f7 2019 len = imin(mlen, bytes_to_copy);
91447636
A
2020
2021 chainlength += len;
2d21ac55 2022
91447636 2023 space -= len;
fa4905b1 2024
2d21ac55 2025 error = uiomove(mtod(m, caddr_t),
b0d623f7 2026 len, uio);
2d21ac55 2027
91447636 2028 resid = uio_resid(uio);
2d21ac55 2029
91447636
A
2030 m->m_len = len;
2031 *mp = m;
2032 top->m_pkthdr.len += len;
2d21ac55 2033 if (error)
91447636
A
2034 break;
2035 mp = &m->m_next;
2036 if (resid <= 0) {
2037 if (flags & MSG_EOR)
2038 top->m_flags |= M_EOR;
2039 break;
2040 }
2041 bytes_to_copy = min(resid, space);
2d21ac55
A
2042
2043 } while (space > 0 &&
2044 (chainlength < sosendmaxchain || atomic ||
2045 resid < MINCLSIZE));
2046
91447636 2047 socket_lock(so, 0);
2d21ac55 2048
91447636
A
2049 if (error)
2050 goto release;
2051 }
2d21ac55
A
2052
2053 if (flags & (MSG_HOLD|MSG_SEND)) {
3a60a9f5 2054 /* Enqueue for later, go away if HOLD */
39236c6e 2055 struct mbuf *mb1;
2d21ac55 2056 if (so->so_temp && (flags & MSG_FLUSH)) {
3a60a9f5
A
2057 m_freem(so->so_temp);
2058 so->so_temp = NULL;
2059 }
2060 if (so->so_temp)
2061 so->so_tail->m_next = top;
2062 else
2063 so->so_temp = top;
2064 mb1 = top;
2065 while (mb1->m_next)
2d21ac55 2066 mb1 = mb1->m_next;
3a60a9f5 2067 so->so_tail = mb1;
2d21ac55 2068 if (flags & MSG_HOLD) {
3a60a9f5
A
2069 top = NULL;
2070 goto release;
2071 }
2072 top = so->so_temp;
2d21ac55
A
2073 }
2074 if (dontroute)
2075 so->so_options |= SO_DONTROUTE;
2076
2077 /* Compute flags here, for pru_send and NKEs */
2078 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2079 /*
2080 * If the user set MSG_EOF, the protocol
2081 * understands this flag and nothing left to
2082 * send then use PRU_SEND_EOF instead of PRU_SEND.
2083 */
2084 ((flags & MSG_EOF) &&
2085 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
39236c6e
A
2086 (resid <= 0)) ? PRUS_EOF :
2087 /* If there is more to send set PRUS_MORETOCOME */
2088 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
fe8ab488
A
2089
2090 if ((flags & MSG_SKIPCFIL) == 0) {
2091 /*
2092 * Socket filter processing
2093 */
2094 error = sflt_data_out(so, addr, &top,
2095 &control, (sendflags & MSG_OOB) ?
2096 sock_data_filt_flag_oob : 0);
2097 if (error) {
2098 if (error == EJUSTRETURN) {
2099 error = 0;
2100 clen = 0;
2101 control = NULL;
2102 top = NULL;
2103 }
2104 goto release;
91447636 2105 }
fe8ab488
A
2106#if CONTENT_FILTER
2107 /*
2108 * Content filter processing
2109 */
2110 error = cfil_sock_data_out(so, addr, top,
2111 control, (sendflags & MSG_OOB) ?
2112 sock_data_filt_flag_oob : 0);
2113 if (error) {
2114 if (error == EJUSTRETURN) {
2115 error = 0;
2116 clen = 0;
2117 control = NULL;
2118 top = NULL;
2119 }
2120 goto release;
2121 }
2122#endif /* CONTENT_FILTER */
1c79356b 2123 }
39236c6e
A
2124 if (so->so_flags & SOF_ENABLE_MSGS) {
2125 /*
2126 * Make a copy of control mbuf,
2127 * so that msg priority can be
2128 * passed to subsequent mbufs.
2129 */
2130 control_copy = m_dup(control, M_NOWAIT);
2131 }
6d2010ae 2132 error = (*so->so_proto->pr_usrreqs->pru_send)
39236c6e
A
2133 (so, sendflags, top, addr, control, p);
2134
2d21ac55
A
2135 if (flags & MSG_SEND)
2136 so->so_temp = NULL;
39236c6e 2137
2d21ac55
A
2138 if (dontroute)
2139 so->so_options &= ~SO_DONTROUTE;
2140
2141 clen = 0;
39236c6e
A
2142 control = control_copy;
2143 control_copy = NULL;
2144 top = NULL;
2d21ac55
A
2145 mp = &top;
2146 if (error)
2147 goto release;
1c79356b
A
2148 } while (resid && space > 0);
2149 } while (resid);
2150
2151release:
3a60a9f5 2152 if (sblocked)
39236c6e 2153 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
3a60a9f5
A
2154 else
2155 socket_unlock(so, 1);
1c79356b 2156out:
39236c6e 2157 if (top != NULL)
1c79356b 2158 m_freem(top);
39236c6e 2159 if (control != NULL)
1c79356b 2160 m_freem(control);
39236c6e 2161 if (freelist != NULL)
2d21ac55 2162 m_freem_list(freelist);
39236c6e
A
2163 if (control_copy != NULL)
2164 m_freem(control_copy);
1c79356b 2165
2d21ac55
A
2166 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
2167 space, error);
1c79356b
A
2168
2169 return (error);
2170}
2171
fe8ab488
A
2172int
2173sosend_list(struct socket *so, struct sockaddr *addr, struct uio **uioarray,
2174 u_int uiocnt, struct mbuf *top, struct mbuf *control, int flags)
2175{
2176 struct mbuf *m, *freelist = NULL;
2177 user_ssize_t len, resid;
2178 int clen = 0, error, dontroute, mlen;
2179 int atomic = sosendallatonce(so) || top;
2180 int sblocked = 0;
2181 struct proc *p = current_proc();
2182 u_int uiofirst = 0;
2183 u_int uiolast = 0;
2184
2185 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2186 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2187
2188 if (so->so_type != SOCK_DGRAM) {
2189 error = EINVAL;
2190 goto out;
2191 }
2192 if (atomic == 0) {
2193 error = EINVAL;
2194 goto out;
2195 }
2196 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2197 error = EPROTONOSUPPORT;
2198 goto out;
2199 }
2200 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2201 error = EINVAL;
2202 goto out;
2203 }
2204 if (uioarray != NULL)
2205 resid = uio_array_resid(uioarray, uiocnt);
2206 else
2207 resid = mbuf_pkt_list_len(top);
2208
2209 /*
2210 * In theory resid should be unsigned.
2211 * However, space must be signed, as it might be less than 0
2212 * if we over-committed, and we must use a signed comparison
2213 * of space and resid. On the other hand, a negative resid
2214 * causes us to loop sending 0-length segments to the protocol.
2215 *
2216 * Note: We limit resid to be a positive int value as we use
2217 * imin() to set bytes_to_copy -- radr://14558484
2218 */
2219 if (resid < 0 || resid > INT_MAX) {
2220 error = EINVAL;
2221 goto out;
2222 }
2223 /*
2224 * Disallow functionality not currently supported
2225 * Note: Will need to treat arrays of addresses and controls
2226 */
2227 if (addr != NULL) {
2228 printf("%s addr not supported\n", __func__);
2229 error = EOPNOTSUPP;
2230 goto out;
2231 }
2232 if (control != NULL) {
2233 printf("%s control not supported\n", __func__);
2234 error = EOPNOTSUPP;
2235 goto out;
2236 }
2237
2238 socket_lock(so, 1);
2239 so_update_last_owner_locked(so, p);
2240 so_update_policy(so);
2241
2242#if NECP
2243 so_update_necp_policy(so, NULL, addr);
2244#endif /* NECP */
2245
2246 dontroute = (flags & MSG_DONTROUTE) &&
2247 (so->so_options & SO_DONTROUTE) == 0 &&
2248 (so->so_proto->pr_flags & PR_ATOMIC);
2249 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2250
2251 if (control != NULL)
2252 clen = control->m_len;
2253
2254 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2255 &sblocked, control);
2256 if (error)
2257 goto release;
2258
2259 do {
2260 int i;
2261
2262 if (uioarray == NULL) {
2263 /*
2264 * Data is prepackaged in "top".
2265 */
2266 resid = 0;
2267 } else {
2268 int num_needed = 0;
2269 int chainlength;
2270 size_t maxpktlen = 0;
2271
2272 if (sosendminchain > 0)
2273 chainlength = 0;
2274 else
2275 chainlength = sosendmaxchain;
2276
2277 socket_unlock(so, 0);
2278
2279 /*
2280 * Find a set of uio that fit in a reasonable number
2281 * of mbuf packets
2282 */
2283 for (i = uiofirst; i < uiocnt; i++) {
2284 struct uio *auio = uioarray[i];
2285
2286 len = uio_resid(auio);
2287
2288 /* Do nothing for empty messages */
2289 if (len == 0)
2290 continue;
2291
2292 num_needed += 1;
2293 uiolast += 1;
2294
2295 if (len > maxpktlen)
2296 maxpktlen = len;
2297
2298 chainlength += len;
2299 if (chainlength > sosendmaxchain)
2300 break;
2301 }
2302 /*
2303 * Nothing left to send
2304 */
2305 if (num_needed == 0) {
2306 socket_lock(so, 0);
2307 break;
2308 }
2309 /*
2310 * Allocate the mbuf packets at once
2311 */
2312 freelist = m_allocpacket_internal(
2313 (unsigned int *)&num_needed,
2314 maxpktlen, NULL, M_WAIT, 1, 0);
2315
2316 if (freelist == NULL) {
2317 socket_lock(so, 0);
2318 error = ENOMEM;
2319 goto release;
2320 }
2321 /*
2322 * Copy each uio of the set into its own mbuf packet
2323 */
2324 for (i = uiofirst, m = freelist;
2325 i < uiolast && m != NULL;
2326 i++) {
2327 int bytes_to_copy;
2328 struct mbuf *n;
2329 struct uio *auio = uioarray[i];
2330
2331 bytes_to_copy = uio_resid(auio);
2332
2333 /* Do nothing for empty messages */
2334 if (bytes_to_copy == 0)
2335 continue;
2336
2337 for (n = m; n != NULL; n = n->m_next) {
2338 mlen = mbuf_maxlen(n);
2339
2340 len = imin(mlen, bytes_to_copy);
2341
2342 /*
2343 * Note: uiomove() decrements the iovec
2344 * length
2345 */
2346 error = uiomove(mtod(n, caddr_t),
2347 len, auio);
2348 if (error != 0)
2349 break;
2350 n->m_len = len;
2351 m->m_pkthdr.len += len;
2352
2353 VERIFY(m->m_pkthdr.len <= maxpktlen);
2354
2355 bytes_to_copy -= len;
2356 resid -= len;
2357 }
2358 if (m->m_pkthdr.len == 0) {
2359 printf("%s so %llx pkt %llx len null\n",
2360 __func__,
2361 (uint64_t)VM_KERNEL_ADDRPERM(so),
2362 (uint64_t)VM_KERNEL_ADDRPERM(m));
2363 }
2364 if (error != 0)
2365 break;
2366 m = m->m_nextpkt;
2367 }
2368
2369 socket_lock(so, 0);
2370
2371 if (error)
2372 goto release;
2373 top = freelist;
2374 freelist = NULL;
2375 }
2376
2377 if (dontroute)
2378 so->so_options |= SO_DONTROUTE;
2379
2380 if ((flags & MSG_SKIPCFIL) == 0) {
2381 struct mbuf **prevnextp = NULL;
2382
2383 for (i = uiofirst, m = top;
2384 i < uiolast && m != NULL;
2385 i++) {
2386 struct mbuf *nextpkt = m->m_nextpkt;
2387
2388 /*
2389 * Socket filter processing
2390 */
2391 error = sflt_data_out(so, addr, &m,
2392 &control, 0);
2393 if (error != 0 && error != EJUSTRETURN)
2394 goto release;
2395
2396#if CONTENT_FILTER
2397 if (error == 0) {
2398 /*
2399 * Content filter processing
2400 */
2401 error = cfil_sock_data_out(so, addr, m,
2402 control, 0);
2403 if (error != 0 && error != EJUSTRETURN)
2404 goto release;
2405 }
2406#endif /* CONTENT_FILTER */
2407 /*
2408 * Remove packet from the list when
2409 * swallowed by a filter
2410 */
2411 if (error == EJUSTRETURN) {
2412 error = 0;
2413 if (prevnextp != NULL)
2414 *prevnextp = nextpkt;
2415 else
2416 top = nextpkt;
2417 }
2418
2419 m = nextpkt;
2420 if (m != NULL)
2421 prevnextp = &m->m_nextpkt;
2422 }
2423 }
2424 if (top != NULL)
2425 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2426 (so, 0, top, addr, control, p);
2427
2428 if (dontroute)
2429 so->so_options &= ~SO_DONTROUTE;
2430
2431 clen = 0;
2432 top = NULL;
2433 uiofirst = uiolast;
2434 } while (resid > 0 && error == 0);
2435release:
2436 if (sblocked)
2437 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2438 else
2439 socket_unlock(so, 1);
2440out:
2441 if (top != NULL)
2442 m_freem(top);
2443 if (control != NULL)
2444 m_freem(control);
2445 if (freelist != NULL)
2446 m_freem_list(freelist);
2447
2448 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2449 so->so_snd.sb_cc, 0, error);
2450
2451 return (error);
2452}
2453
1c79356b
A
2454/*
2455 * Implement receive operations on a socket.
2456 * We depend on the way that records are added to the sockbuf
2457 * by sbappend*. In particular, each record (mbufs linked through m_next)
2458 * must begin with an address if the protocol so specifies,
2459 * followed by an optional mbuf or mbufs containing ancillary data,
2460 * and then zero or more mbufs of data.
2461 * In order to avoid blocking network interrupts for the entire time here,
2462 * we splx() while doing the actual copy to user space.
2463 * Although the sockbuf is locked, new data may still be appended,
2464 * and thus we must maintain consistency of the sockbuf during that time.
2465 *
2466 * The caller may receive the data as a single mbuf chain by supplying
2467 * an mbuf **mp0 for use in returning the chain. The uio is then used
2468 * only for the count in uio_resid.
2d21ac55
A
2469 *
2470 * Returns: 0 Success
2471 * ENOBUFS
2472 * ENOTCONN
2473 * EWOULDBLOCK
2474 * uiomove:EFAULT
2475 * sblock:EWOULDBLOCK
2476 * sblock:EINTR
2477 * sbwait:EBADF
2478 * sbwait:EINTR
2479 * sodelayed_copy:EFAULT
2480 * <pru_rcvoob>:EINVAL[TCP]
2481 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2482 * <pru_rcvoob>:???
2483 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2484 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2485 * <pr_domain->dom_externalize>:???
2486 *
2487 * Notes: Additional return values from calls through <pru_rcvoob> and
2488 * <pr_domain->dom_externalize> depend on protocols other than
2489 * TCP or AF_UNIX, which are documented above.
1c79356b
A
2490 */
2491int
2d21ac55
A
2492soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2493 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1c79356b 2494{
39236c6e
A
2495 struct mbuf *m, **mp, *ml = NULL;
2496 struct mbuf *nextrecord, *free_list;
2497 int flags, error, offset;
2498 user_ssize_t len;
1c79356b 2499 struct protosw *pr = so->so_proto;
39236c6e
A
2500 int moff, type =0;
2501 user_ssize_t orig_resid = uio_resid(uio);
2502 user_ssize_t delayed_copy_len;
55e303ae
A
2503 int can_delay;
2504 int need_event;
2505 struct proc *p = current_proc();
2506
2d21ac55
A
2507 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
2508 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
1c79356b 2509
fe8ab488
A
2510 /*
2511 * Sanity check on the length passed by caller as we are making 'int'
2512 * comparisons
2513 */
2514 if (orig_resid < 0 || orig_resid > INT_MAX)
2515 return (EINVAL);
2516
91447636 2517 socket_lock(so, 1);
6d2010ae 2518 so_update_last_owner_locked(so, p);
39236c6e 2519 so_update_policy(so);
1c79356b 2520
91447636 2521#ifdef MORE_LOCKING_DEBUG
39236c6e
A
2522 if (so->so_usecount == 1) {
2523 panic("%s: so=%x no other reference on socket\n", __func__, so);
2524 /* NOTREACHED */
2525 }
91447636 2526#endif
1c79356b 2527 mp = mp0;
39236c6e
A
2528 if (psa != NULL)
2529 *psa = NULL;
2530 if (controlp != NULL)
2531 *controlp = NULL;
2532 if (flagsp != NULL)
1c79356b
A
2533 flags = *flagsp &~ MSG_EOR;
2534 else
2535 flags = 0;
2d21ac55
A
2536
2537 /*
2538 * If a recv attempt is made on a previously-accepted socket
2539 * that has been marked as inactive (disconnected), reject
2540 * the request.
2541 */
2542 if (so->so_flags & SOF_DEFUNCT) {
2543 struct sockbuf *sb = &so->so_rcv;
2544
6d2010ae 2545 error = ENOTCONN;
39236c6e
A
2546 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2547 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
2548 SOCK_DOM(so), SOCK_TYPE(so), error));
2d21ac55
A
2549 /*
2550 * This socket should have been disconnected and flushed
6d2010ae
A
2551 * prior to being returned from sodefunct(); there should
2552 * be no data on its receive list, so panic otherwise.
2d21ac55 2553 */
6d2010ae
A
2554 if (so->so_state & SS_DEFUNCT)
2555 sb_empty_assert(sb, __func__);
2d21ac55 2556 socket_unlock(so, 1);
6d2010ae 2557 return (error);
2d21ac55
A
2558 }
2559
2560 /*
2561 * When SO_WANTOOBFLAG is set we try to get out-of-band data
2562 * regardless of the flags argument. Here is the case where
2563 * out-of-band data is not inline.
2564 */
2565 if ((flags & MSG_OOB) ||
2566 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2567 (so->so_options & SO_OOBINLINE) == 0 &&
2568 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1c79356b 2569 m = m_get(M_WAIT, MT_DATA);
55e303ae 2570 if (m == NULL) {
91447636 2571 socket_unlock(so, 1);
2d21ac55
A
2572 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
2573 ENOBUFS, 0, 0, 0, 0);
9bccf70c 2574 return (ENOBUFS);
55e303ae 2575 }
1c79356b
A
2576 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
2577 if (error)
2578 goto bad;
91447636 2579 socket_unlock(so, 0);
1c79356b
A
2580 do {
2581 error = uiomove(mtod(m, caddr_t),
b0d623f7 2582 imin(uio_resid(uio), m->m_len), uio);
1c79356b 2583 m = m_free(m);
39236c6e 2584 } while (uio_resid(uio) && error == 0 && m != NULL);
91447636 2585 socket_lock(so, 0);
1c79356b 2586bad:
39236c6e 2587 if (m != NULL)
1c79356b 2588 m_freem(m);
39236c6e 2589
9bccf70c
A
2590 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
2591 if (error == EWOULDBLOCK || error == EINVAL) {
2d21ac55 2592 /*
9bccf70c 2593 * Let's try to get normal data:
2d21ac55
A
2594 * EWOULDBLOCK: out-of-band data not
2595 * received yet. EINVAL: out-of-band data
2596 * already read.
9bccf70c
A
2597 */
2598 error = 0;
2599 goto nooob;
39236c6e 2600 } else if (error == 0 && flagsp != NULL) {
9bccf70c 2601 *flagsp |= MSG_OOB;
2d21ac55
A
2602 }
2603 }
91447636 2604 socket_unlock(so, 1);
2d21ac55
A
2605 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2606 0, 0, 0, 0);
39236c6e 2607
1c79356b
A
2608 return (error);
2609 }
2610nooob:
39236c6e
A
2611 if (mp != NULL)
2612 *mp = NULL;
fe8ab488
A
2613
2614 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
1c79356b 2615 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
fe8ab488 2616 }
1c79356b 2617
39236c6e 2618 free_list = NULL;
55e303ae 2619 delayed_copy_len = 0;
1c79356b 2620restart:
91447636
A
2621#ifdef MORE_LOCKING_DEBUG
2622 if (so->so_usecount <= 1)
fe8ab488
A
2623 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
2624 (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_usecount);
91447636 2625#endif
6601e61a
A
2626 /*
2627 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2628 * and if so just return to the caller. This could happen when
2629 * soreceive() is called by a socket upcall function during the
2630 * time the socket is freed. The socket buffer would have been
2631 * locked across the upcall, therefore we cannot put this thread
2632 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2633 * we may livelock), because the lock on the socket buffer will
2634 * only be released when the upcall routine returns to its caller.
2635 * Because the socket has been officially closed, there can be
2636 * no further read on it.
39236c6e
A
2637 *
2638 * A multipath subflow socket would have its SS_NOFDREF set by
2639 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2640 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
6601e61a
A
2641 */
2642 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
39236c6e 2643 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
6601e61a
A
2644 socket_unlock(so, 1);
2645 return (0);
2646 }
2647
9bccf70c
A
2648 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2649 if (error) {
91447636 2650 socket_unlock(so, 1);
2d21ac55
A
2651 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2652 0, 0, 0, 0);
1c79356b
A
2653 return (error);
2654 }
1c79356b
A
2655
2656 m = so->so_rcv.sb_mb;
2657 /*
2658 * If we have less data than requested, block awaiting more
2659 * (subject to any timeout) if:
2660 * 1. the current count is less than the low water mark, or
2661 * 2. MSG_WAITALL is set, and it is possible to do the entire
2662 * receive operation at once if we block (resid <= hiwat).
2663 * 3. MSG_DONTWAIT is not set
2664 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2665 * we have to do the receive in sections, and thus risk returning
2666 * a short count if a timeout or signal occurs after we start.
2667 */
39236c6e 2668 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
91447636 2669 so->so_rcv.sb_cc < uio_resid(uio)) &&
2d21ac55 2670 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
91447636 2671 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
39236c6e 2672 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2d21ac55
A
2673 /*
2674 * Panic if we notice inconsistencies in the socket's
2675 * receive list; both sb_mb and sb_cc should correctly
2676 * reflect the contents of the list, otherwise we may
2677 * end up with false positives during select() or poll()
2678 * which could put the application in a bad state.
2679 */
316670eb 2680 SB_MB_CHECK(&so->so_rcv);
55e303ae 2681
1c79356b 2682 if (so->so_error) {
39236c6e 2683 if (m != NULL)
1c79356b
A
2684 goto dontblock;
2685 error = so->so_error;
2686 if ((flags & MSG_PEEK) == 0)
2687 so->so_error = 0;
2688 goto release;
2689 }
2690 if (so->so_state & SS_CANTRCVMORE) {
fe8ab488
A
2691#if CONTENT_FILTER
2692 /*
2693 * Deal with half closed connections
2694 */
2695 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2696 cfil_sock_data_pending(&so->so_rcv) != 0)
2697 CFIL_LOG(LOG_INFO,
2698 "so %llx ignore SS_CANTRCVMORE",
2699 (uint64_t)VM_KERNEL_ADDRPERM(so));
2700 else
2701#endif /* CONTENT_FILTER */
39236c6e 2702 if (m != NULL)
1c79356b
A
2703 goto dontblock;
2704 else
2705 goto release;
2706 }
39236c6e 2707 for (; m != NULL; m = m->m_next)
2d21ac55 2708 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
1c79356b
A
2709 m = so->so_rcv.sb_mb;
2710 goto dontblock;
2711 }
2712 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2713 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2714 error = ENOTCONN;
2715 goto release;
2716 }
91447636 2717 if (uio_resid(uio) == 0)
1c79356b 2718 goto release;
2d21ac55
A
2719 if ((so->so_state & SS_NBIO) ||
2720 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
1c79356b
A
2721 error = EWOULDBLOCK;
2722 goto release;
2723 }
2d21ac55
A
2724 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2725 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
39236c6e 2726 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
2d21ac55 2727#if EVEN_MORE_LOCKING_DEBUG
1c79356b 2728 if (socket_debug)
2d21ac55 2729 printf("Waiting for socket data\n");
91447636 2730#endif
55e303ae 2731
1c79356b 2732 error = sbwait(&so->so_rcv);
2d21ac55 2733#if EVEN_MORE_LOCKING_DEBUG
1c79356b 2734 if (socket_debug)
2d21ac55 2735 printf("SORECEIVE - sbwait returned %d\n", error);
91447636 2736#endif
39236c6e
A
2737 if (so->so_usecount < 1) {
2738 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
2739 __func__, so, so->so_usecount);
2740 /* NOTREACHED */
2741 }
9bccf70c 2742 if (error) {
91447636 2743 socket_unlock(so, 1);
2d21ac55
A
2744 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2745 0, 0, 0, 0);
2746 return (error);
1c79356b
A
2747 }
2748 goto restart;
2749 }
2750dontblock:
b0d623f7 2751 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2d21ac55
A
2752 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2753 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
1c79356b
A
2754 nextrecord = m->m_nextpkt;
2755 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2756 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2d21ac55
A
2757#if CONFIG_MACF_SOCKET_SUBSET
2758 /*
2759 * Call the MAC framework for policy checking if we're in
2760 * the user process context and the socket isn't connected.
2761 */
2762 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2763 struct mbuf *m0 = m;
2764 /*
2765 * Dequeue this record (temporarily) from the receive
2766 * list since we're about to drop the socket's lock
2767 * where a new record may arrive and be appended to
2768 * the list. Upon MAC policy failure, the record
2769 * will be freed. Otherwise, we'll add it back to
2770 * the head of the list. We cannot rely on SB_LOCK
2771 * because append operation uses the socket's lock.
2772 */
2773 do {
2774 m->m_nextpkt = NULL;
2775 sbfree(&so->so_rcv, m);
2776 m = m->m_next;
2777 } while (m != NULL);
2778 m = m0;
2779 so->so_rcv.sb_mb = nextrecord;
2780 SB_EMPTY_FIXUP(&so->so_rcv);
2781 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2782 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2783 socket_unlock(so, 0);
fe8ab488 2784
2d21ac55
A
2785 if (mac_socket_check_received(proc_ucred(p), so,
2786 mtod(m, struct sockaddr *)) != 0) {
2787 /*
2788 * MAC policy failure; free this record and
2789 * process the next record (or block until
2790 * one is available). We have adjusted sb_cc
2791 * and sb_mbcnt above so there is no need to
2792 * call sbfree() again.
2793 */
2794 do {
2795 m = m_free(m);
2796 } while (m != NULL);
2797 /*
2798 * Clear SB_LOCK but don't unlock the socket.
2799 * Process the next record or wait for one.
2800 */
2801 socket_lock(so, 0);
39236c6e 2802 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2d21ac55
A
2803 goto restart;
2804 }
2805 socket_lock(so, 0);
6d2010ae
A
2806 /*
2807 * If the socket has been defunct'd, drop it.
2808 */
2809 if (so->so_flags & SOF_DEFUNCT) {
2810 m_freem(m);
2811 error = ENOTCONN;
2812 goto release;
2813 }
2d21ac55
A
2814 /*
2815 * Re-adjust the socket receive list and re-enqueue
2816 * the record in front of any packets which may have
2817 * been appended while we dropped the lock.
2818 */
2819 for (m = m0; m->m_next != NULL; m = m->m_next)
2820 sballoc(&so->so_rcv, m);
2821 sballoc(&so->so_rcv, m);
2822 if (so->so_rcv.sb_mb == NULL) {
2823 so->so_rcv.sb_lastrecord = m0;
2824 so->so_rcv.sb_mbtail = m;
2825 }
2826 m = m0;
2827 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2828 so->so_rcv.sb_mb = m;
2829 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2830 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2831 }
2832#endif /* CONFIG_MACF_SOCKET_SUBSET */
1c79356b 2833 orig_resid = 0;
39236c6e 2834 if (psa != NULL) {
1c79356b 2835 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
39236c6e
A
2836 mp0 == NULL);
2837 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
4a249263
A
2838 error = EWOULDBLOCK;
2839 goto release;
2840 }
2841 }
1c79356b
A
2842 if (flags & MSG_PEEK) {
2843 m = m->m_next;
2844 } else {
2845 sbfree(&so->so_rcv, m);
39236c6e
A
2846 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2847 panic("%s: about to create invalid socketbuf",
2848 __func__);
2849 /* NOTREACHED */
2850 }
1c79356b
A
2851 MFREE(m, so->so_rcv.sb_mb);
2852 m = so->so_rcv.sb_mb;
2d21ac55
A
2853 if (m != NULL) {
2854 m->m_nextpkt = nextrecord;
2855 } else {
2856 so->so_rcv.sb_mb = nextrecord;
2857 SB_EMPTY_FIXUP(&so->so_rcv);
2858 }
1c79356b
A
2859 }
2860 }
2d21ac55
A
2861
2862 /*
2863 * Process one or more MT_CONTROL mbufs present before any data mbufs
2864 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2865 * just copy the data; if !MSG_PEEK, we call into the protocol to
2866 * perform externalization.
2867 */
2868 if (m != NULL && m->m_type == MT_CONTROL) {
2869 struct mbuf *cm = NULL, *cmn;
2870 struct mbuf **cme = &cm;
2871 struct sockbuf *sb_rcv = &so->so_rcv;
6d2010ae 2872 struct mbuf **msgpcm = NULL;
2d21ac55
A
2873
2874 /*
2875 * Externalizing the control messages would require us to
2876 * drop the socket's lock below. Once we re-acquire the
2877 * lock, the mbuf chain might change. In order to preserve
2878 * consistency, we unlink all control messages from the
2879 * first mbuf chain in one shot and link them separately
2880 * onto a different chain.
2881 */
2882 do {
2883 if (flags & MSG_PEEK) {
2884 if (controlp != NULL) {
6d2010ae
A
2885 if (*controlp == NULL) {
2886 msgpcm = controlp;
2887 }
2d21ac55 2888 *controlp = m_copy(m, 0, m->m_len);
6d2010ae 2889
39236c6e
A
2890 /*
2891 * If we failed to allocate an mbuf,
6d2010ae 2892 * release any previously allocated
39236c6e 2893 * mbufs for control data. Return
6d2010ae 2894 * an error. Keep the mbufs in the
39236c6e 2895 * socket as this is using
6d2010ae
A
2896 * MSG_PEEK flag.
2897 */
2898 if (*controlp == NULL) {
2899 m_freem(*msgpcm);
2900 error = ENOBUFS;
2901 goto release;
2902 }
2d21ac55 2903 controlp = &(*controlp)->m_next;
91447636 2904 }
2d21ac55 2905 m = m->m_next;
1c79356b 2906 } else {
2d21ac55
A
2907 m->m_nextpkt = NULL;
2908 sbfree(sb_rcv, m);
2909 sb_rcv->sb_mb = m->m_next;
2910 m->m_next = NULL;
2911 *cme = m;
2912 cme = &(*cme)->m_next;
2913 m = sb_rcv->sb_mb;
2914 }
2915 } while (m != NULL && m->m_type == MT_CONTROL);
2916
2917 if (!(flags & MSG_PEEK)) {
2918 if (sb_rcv->sb_mb != NULL) {
2919 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2920 } else {
2921 sb_rcv->sb_mb = nextrecord;
2922 SB_EMPTY_FIXUP(sb_rcv);
1c79356b 2923 }
2d21ac55
A
2924 if (nextrecord == NULL)
2925 sb_rcv->sb_lastrecord = m;
1c79356b 2926 }
2d21ac55
A
2927
2928 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2929 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2930
2931 while (cm != NULL) {
2932 int cmsg_type;
2933
2934 cmn = cm->m_next;
2935 cm->m_next = NULL;
2936 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2937
2938 /*
2939 * Call the protocol to externalize SCM_RIGHTS message
2940 * and return the modified message to the caller upon
2941 * success. Otherwise, all other control messages are
2942 * returned unmodified to the caller. Note that we
2943 * only get into this loop if MSG_PEEK is not set.
2944 */
2945 if (pr->pr_domain->dom_externalize != NULL &&
2946 cmsg_type == SCM_RIGHTS) {
2947 /*
2948 * Release socket lock: see 3903171. This
2949 * would also allow more records to be appended
2950 * to the socket buffer. We still have SB_LOCK
2951 * set on it, so we can be sure that the head
2952 * of the mbuf chain won't change.
2953 */
2954 socket_unlock(so, 0);
2955 error = (*pr->pr_domain->dom_externalize)(cm);
2956 socket_lock(so, 0);
2957 } else {
2958 error = 0;
2959 }
2960
2961 if (controlp != NULL && error == 0) {
2962 *controlp = cm;
2963 controlp = &(*controlp)->m_next;
2964 orig_resid = 0;
2965 } else {
2966 (void) m_free(cm);
2967 }
2968 cm = cmn;
1c79356b 2969 }
39236c6e 2970 /*
316670eb 2971 * Update the value of nextrecord in case we received new
39236c6e 2972 * records when the socket was unlocked above for
316670eb
A
2973 * externalizing SCM_RIGHTS.
2974 */
2975 if (m != NULL)
2d21ac55
A
2976 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2977 else
316670eb
A
2978 nextrecord = sb_rcv->sb_mb;
2979 orig_resid = 0;
1c79356b 2980 }
2d21ac55 2981
39236c6e
A
2982 /*
2983 * If the socket is a TCP socket with message delivery
2984 * enabled, then create a control msg to deliver the
2985 * relative TCP sequence number for this data. Waiting
2986 * until this point will protect against failures to
2987 * allocate an mbuf for control msgs.
2988 */
2989 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
2990 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
2991 struct mbuf *seq_cm;
2992
2993 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
2994 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
2995 if (seq_cm == NULL) {
2996 /* unable to allocate a control mbuf */
2997 error = ENOBUFS;
2998 goto release;
2999 }
3000 *controlp = seq_cm;
3001 controlp = &seq_cm->m_next;
3002 }
3003
2d21ac55
A
3004 if (m != NULL) {
3005 if (!(flags & MSG_PEEK)) {
3006 /*
3007 * We get here because m points to an mbuf following
3008 * any MT_SONAME or MT_CONTROL mbufs which have been
3009 * processed above. In any case, m should be pointing
3010 * to the head of the mbuf chain, and the nextrecord
3011 * should be either NULL or equal to m->m_nextpkt.
3012 * See comments above about SB_LOCK.
3013 */
39236c6e
A
3014 if (m != so->so_rcv.sb_mb ||
3015 m->m_nextpkt != nextrecord) {
3016 panic("%s: post-control !sync so=%p m=%p "
3017 "nextrecord=%p\n", __func__, so, m,
3018 nextrecord);
3019 /* NOTREACHED */
3020 }
2d21ac55
A
3021 if (nextrecord == NULL)
3022 so->so_rcv.sb_lastrecord = m;
3023 }
1c79356b
A
3024 type = m->m_type;
3025 if (type == MT_OOBDATA)
3026 flags |= MSG_OOB;
2d21ac55
A
3027 } else {
3028 if (!(flags & MSG_PEEK)) {
2d21ac55
A
3029 SB_EMPTY_FIXUP(&so->so_rcv);
3030 }
1c79356b 3031 }
2d21ac55
A
3032 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3033 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3034
1c79356b
A
3035 moff = 0;
3036 offset = 0;
fa4905b1 3037
91447636 3038 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2d21ac55 3039 can_delay = 1;
55e303ae 3040 else
2d21ac55 3041 can_delay = 0;
55e303ae
A
3042
3043 need_event = 0;
fa4905b1 3044
39236c6e
A
3045 while (m != NULL &&
3046 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
1c79356b
A
3047 if (m->m_type == MT_OOBDATA) {
3048 if (type != MT_OOBDATA)
3049 break;
2d21ac55 3050 } else if (type == MT_OOBDATA) {
1c79356b 3051 break;
2d21ac55 3052 }
9bccf70c 3053 /*
2d21ac55 3054 * Make sure to always set MSG_OOB event when getting
9bccf70c
A
3055 * out of band data inline.
3056 */
1c79356b 3057 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2d21ac55
A
3058 (so->so_options & SO_OOBINLINE) != 0 &&
3059 (so->so_state & SS_RCVATMARK) != 0) {
9bccf70c
A
3060 flags |= MSG_OOB;
3061 }
1c79356b 3062 so->so_state &= ~SS_RCVATMARK;
91447636 3063 len = uio_resid(uio) - delayed_copy_len;
1c79356b
A
3064 if (so->so_oobmark && len > so->so_oobmark - offset)
3065 len = so->so_oobmark - offset;
3066 if (len > m->m_len - moff)
3067 len = m->m_len - moff;
3068 /*
3069 * If mp is set, just pass back the mbufs.
3070 * Otherwise copy them out via the uio, then free.
3071 * Sockbuf must be consistent here (points to current mbuf,
3072 * it points to next record) when we drop priority;
3073 * we must note any additions to the sockbuf when we
3074 * block interrupts again.
3075 */
39236c6e 3076 if (mp == NULL) {
2d21ac55
A
3077 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3078 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
55e303ae 3079 if (can_delay && len == m->m_len) {
2d21ac55 3080 /*
55e303ae
A
3081 * only delay the copy if we're consuming the
3082 * mbuf and we're NOT in MSG_PEEK mode
3083 * and we have enough data to make it worthwhile
2d21ac55
A
3084 * to drop and retake the lock... can_delay
3085 * reflects the state of the 2 latter
3086 * constraints moff should always be zero
3087 * in these cases
55e303ae 3088 */
2d21ac55 3089 delayed_copy_len += len;
55e303ae 3090 } else {
2d21ac55
A
3091 if (delayed_copy_len) {
3092 error = sodelayed_copy(so, uio,
3093 &free_list, &delayed_copy_len);
55e303ae
A
3094
3095 if (error) {
55e303ae
A
3096 goto release;
3097 }
2d21ac55
A
3098 /*
3099 * can only get here if MSG_PEEK is not
3100 * set therefore, m should point at the
3101 * head of the rcv queue; if it doesn't,
3102 * it means something drastically
3103 * changed while we were out from behind
3104 * the lock in sodelayed_copy. perhaps
3105 * a RST on the stream. in any event,
3106 * the stream has been interrupted. it's
3107 * probably best just to return whatever
3108 * data we've moved and let the caller
3109 * sort it out...
3110 */
55e303ae 3111 if (m != so->so_rcv.sb_mb) {
2d21ac55 3112 break;
55e303ae
A
3113 }
3114 }
91447636 3115 socket_unlock(so, 0);
2d21ac55
A
3116 error = uiomove(mtod(m, caddr_t) + moff,
3117 (int)len, uio);
91447636 3118 socket_lock(so, 0);
55e303ae 3119
55e303ae 3120 if (error)
2d21ac55 3121 goto release;
55e303ae 3122 }
2d21ac55 3123 } else {
91447636 3124 uio_setresid(uio, (uio_resid(uio) - len));
2d21ac55 3125 }
1c79356b
A
3126 if (len == m->m_len - moff) {
3127 if (m->m_flags & M_EOR)
3128 flags |= MSG_EOR;
3129 if (flags & MSG_PEEK) {
3130 m = m->m_next;
3131 moff = 0;
3132 } else {
3133 nextrecord = m->m_nextpkt;
3134 sbfree(&so->so_rcv, m);
91447636 3135 m->m_nextpkt = NULL;
55e303ae 3136
39236c6e
A
3137 /*
3138 * If this packet is an unordered packet
3139 * (indicated by M_UNORDERED_DATA flag), remove
3140 * the additional bytes added to the
3141 * receive socket buffer size.
3142 */
3143 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3144 m->m_len &&
3145 (m->m_flags & M_UNORDERED_DATA) &&
3146 sbreserve(&so->so_rcv,
3147 so->so_rcv.sb_hiwat - m->m_len)) {
3148 if (so->so_msg_state->msg_uno_bytes >
3149 m->m_len) {
3150 so->so_msg_state->
3151 msg_uno_bytes -= m->m_len;
3152 } else {
3153 so->so_msg_state->
3154 msg_uno_bytes = 0;
3155 }
3156 m->m_flags &= ~M_UNORDERED_DATA;
3157 }
3158
3159 if (mp != NULL) {
1c79356b
A
3160 *mp = m;
3161 mp = &m->m_next;
3162 so->so_rcv.sb_mb = m = m->m_next;
39236c6e 3163 *mp = NULL;
1c79356b 3164 } else {
55e303ae 3165 if (free_list == NULL)
2d21ac55
A
3166 free_list = m;
3167 else
3168 ml->m_next = m;
3169 ml = m;
14353aa8 3170 so->so_rcv.sb_mb = m = m->m_next;
39236c6e 3171 ml->m_next = NULL;
1c79356b 3172 }
2d21ac55 3173 if (m != NULL) {
1c79356b 3174 m->m_nextpkt = nextrecord;
2d21ac55
A
3175 if (nextrecord == NULL)
3176 so->so_rcv.sb_lastrecord = m;
3177 } else {
3178 so->so_rcv.sb_mb = nextrecord;
3179 SB_EMPTY_FIXUP(&so->so_rcv);
3180 }
3181 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3182 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
1c79356b
A
3183 }
3184 } else {
2d21ac55 3185 if (flags & MSG_PEEK) {
1c79356b 3186 moff += len;
2d21ac55 3187 } else {
6d2010ae
A
3188 if (mp != NULL) {
3189 int copy_flag;
3190
3191 if (flags & MSG_DONTWAIT)
3192 copy_flag = M_DONTWAIT;
3193 else
3194 copy_flag = M_WAIT;
3195 *mp = m_copym(m, 0, len, copy_flag);
39236c6e
A
3196 /*
3197 * Failed to allocate an mbuf?
3198 * Adjust uio_resid back, it was
3199 * adjusted down by len bytes which
3200 * we didn't copy over.
3201 */
6d2010ae 3202 if (*mp == NULL) {
39236c6e
A
3203 uio_setresid(uio,
3204 (uio_resid(uio) + len));
6d2010ae
A
3205 break;
3206 }
3207 }
1c79356b
A
3208 m->m_data += len;
3209 m->m_len -= len;
3210 so->so_rcv.sb_cc -= len;
3211 }
3212 }
3213 if (so->so_oobmark) {
3214 if ((flags & MSG_PEEK) == 0) {
3215 so->so_oobmark -= len;
3216 if (so->so_oobmark == 0) {
2d21ac55
A
3217 so->so_state |= SS_RCVATMARK;
3218 /*
3219 * delay posting the actual event until
3220 * after any delayed copy processing
3221 * has finished
3222 */
3223 need_event = 1;
3224 break;
1c79356b
A
3225 }
3226 } else {
3227 offset += len;
3228 if (offset == so->so_oobmark)
3229 break;
3230 }
3231 }
2d21ac55 3232 if (flags & MSG_EOR)
1c79356b
A
3233 break;
3234 /*
2d21ac55
A
3235 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3236 * (for non-atomic socket), we must not quit until
3237 * "uio->uio_resid == 0" or an error termination.
3238 * If a signal/timeout occurs, return with a short
3239 * count but without error. Keep sockbuf locked
3240 * against other readers.
1c79356b 3241 */
39236c6e 3242 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
2d21ac55 3243 (uio_resid(uio) - delayed_copy_len) > 0 &&
1c79356b 3244 !sosendallatonce(so) && !nextrecord) {
fe8ab488
A
3245 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3246#if CONTENT_FILTER
3247 && cfil_sock_data_pending(&so->so_rcv) == 0
3248#endif /* CONTENT_FILTER */
3249 ))
2d21ac55 3250 goto release;
fa4905b1 3251
2d21ac55
A
3252 /*
3253 * Depending on the protocol (e.g. TCP), the following
3254 * might cause the socket lock to be dropped and later
3255 * be reacquired, and more data could have arrived and
3256 * have been appended to the receive socket buffer by
3257 * the time it returns. Therefore, we only sleep in
3258 * sbwait() below if and only if the socket buffer is
3259 * empty, in order to avoid a false sleep.
3260 */
3261 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3262 (((struct inpcb *)so->so_pcb)->inp_state !=
3263 INPCB_STATE_DEAD))
3264 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3265
3266 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3267 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3268
3269 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3270 error = 0;
55e303ae 3271 goto release;
fa4905b1 3272 }
55e303ae 3273 /*
2d21ac55
A
3274 * have to wait until after we get back from the sbwait
3275 * to do the copy because we will drop the lock if we
3276 * have enough data that has been delayed... by dropping
3277 * the lock we open up a window allowing the netisr
3278 * thread to process the incoming packets and to change
3279 * the state of this socket... we're issuing the sbwait
3280 * because the socket is empty and we're expecting the
3281 * netisr thread to wake us up when more packets arrive;
3282 * if we allow that processing to happen and then sbwait
3283 * we could stall forever with packets sitting in the
3284 * socket if no further packets arrive from the remote
3285 * side.
55e303ae 3286 *
2d21ac55
A
3287 * we want to copy before we've collected all the data
3288 * to satisfy this request to allow the copy to overlap
3289 * the incoming packet processing on an MP system
55e303ae 3290 */
2d21ac55
A
3291 if (delayed_copy_len > sorecvmincopy &&
3292 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3293 error = sodelayed_copy(so, uio,
3294 &free_list, &delayed_copy_len);
55e303ae
A
3295
3296 if (error)
2d21ac55 3297 goto release;
1c79356b
A
3298 }
3299 m = so->so_rcv.sb_mb;
39236c6e 3300 if (m != NULL) {
1c79356b 3301 nextrecord = m->m_nextpkt;
fa4905b1 3302 }
316670eb 3303 SB_MB_CHECK(&so->so_rcv);
1c79356b
A
3304 }
3305 }
91447636 3306#ifdef MORE_LOCKING_DEBUG
39236c6e
A
3307 if (so->so_usecount <= 1) {
3308 panic("%s: after big while so=%p ref=%d on socket\n",
3309 __func__, so, so->so_usecount);
3310 /* NOTREACHED */
3311 }
91447636 3312#endif
1c79356b 3313
39236c6e 3314 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2d21ac55 3315 if (so->so_options & SO_DONTTRUNC) {
1c79356b 3316 flags |= MSG_RCVMORE;
2d21ac55 3317 } else {
9bccf70c 3318 flags |= MSG_TRUNC;
1c79356b
A
3319 if ((flags & MSG_PEEK) == 0)
3320 (void) sbdroprecord(&so->so_rcv);
3321 }
3322 }
2d21ac55
A
3323
3324 /*
3325 * pru_rcvd below (for TCP) may cause more data to be received
3326 * if the socket lock is dropped prior to sending the ACK; some
3327 * legacy OpenTransport applications don't handle this well
3328 * (if it receives less data than requested while MSG_HAVEMORE
3329 * is set), and so we set the flag now based on what we know
3330 * prior to calling pru_rcvd.
3331 */
3332 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3333 flags |= MSG_HAVEMORE;
3334
1c79356b 3335 if ((flags & MSG_PEEK) == 0) {
39236c6e 3336 if (m == NULL) {
1c79356b 3337 so->so_rcv.sb_mb = nextrecord;
2d21ac55
A
3338 /*
3339 * First part is an inline SB_EMPTY_FIXUP(). Second
3340 * part makes sure sb_lastrecord is up-to-date if
3341 * there is still data in the socket buffer.
3342 */
3343 if (so->so_rcv.sb_mb == NULL) {
3344 so->so_rcv.sb_mbtail = NULL;
3345 so->so_rcv.sb_lastrecord = NULL;
3346 } else if (nextrecord->m_nextpkt == NULL) {
3347 so->so_rcv.sb_lastrecord = nextrecord;
3348 }
316670eb 3349 SB_MB_CHECK(&so->so_rcv);
2d21ac55
A
3350 }
3351 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3352 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
1c79356b
A
3353 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3354 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3355 }
39236c6e 3356
55e303ae 3357 if (delayed_copy_len) {
91447636 3358 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
55e303ae 3359 if (error)
2d21ac55 3360 goto release;
55e303ae 3361 }
39236c6e
A
3362 if (free_list != NULL) {
3363 m_freem_list(free_list);
3364 free_list = NULL;
55e303ae
A
3365 }
3366 if (need_event)
2d21ac55 3367 postevent(so, 0, EV_OOB);
39236c6e 3368
91447636 3369 if (orig_resid == uio_resid(uio) && orig_resid &&
1c79356b 3370 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
39236c6e 3371 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
1c79356b
A
3372 goto restart;
3373 }
3374
39236c6e 3375 if (flagsp != NULL)
1c79356b
A
3376 *flagsp |= flags;
3377release:
91447636 3378#ifdef MORE_LOCKING_DEBUG
39236c6e
A
3379 if (so->so_usecount <= 1) {
3380 panic("%s: release so=%p ref=%d on socket\n", __func__,
2d21ac55 3381 so, so->so_usecount);
39236c6e
A
3382 /* NOTREACHED */
3383 }
91447636 3384#endif
39236c6e 3385 if (delayed_copy_len)
2d21ac55 3386 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
1c79356b 3387
39236c6e
A
3388 if (free_list != NULL)
3389 m_freem_list(free_list);
3390
3391 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3392
2d21ac55
A
3393 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3394 so->so_rcv.sb_cc, 0, error);
1c79356b
A
3395
3396 return (error);
3397}
3398
2d21ac55
A
3399/*
3400 * Returns: 0 Success
3401 * uiomove:EFAULT
3402 */
3403static int
3404sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
39236c6e 3405 user_ssize_t *resid)
55e303ae 3406{
2d21ac55 3407 int error = 0;
55e303ae
A
3408 struct mbuf *m;
3409
3410 m = *free_list;
3411
91447636 3412 socket_unlock(so, 0);
55e303ae 3413
39236c6e 3414 while (m != NULL && error == 0) {
2d21ac55 3415 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2d21ac55
A
3416 m = m->m_next;
3417 }
3418 m_freem_list(*free_list);
3419
39236c6e 3420 *free_list = NULL;
2d21ac55
A
3421 *resid = 0;
3422
3423 socket_lock(so, 0);
55e303ae 3424
2d21ac55
A
3425 return (error);
3426}
3427
2d21ac55 3428int
fe8ab488
A
3429soreceive_list(struct socket *so, struct sockaddr **psa, struct uio **uioarray,
3430 u_int uiocnt, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2d21ac55 3431{
fe8ab488
A
3432 struct mbuf *m, **mp;
3433 struct mbuf *nextrecord;
3434 struct mbuf *ml = NULL, *free_list = NULL;
3435 int flags, error, offset;
3436 user_ssize_t len;
3437 struct protosw *pr = so->so_proto;
3438 user_ssize_t orig_resid, resid;
3439 struct proc *p = current_proc();
3440 struct uio *auio = NULL;
3441 int i = 0;
3442 int sblocked = 0;
55e303ae 3443
fe8ab488
A
3444 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3445 so, uiocnt,
3446 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3447
3448 mp = mp0;
3449 if (psa != NULL)
3450 *psa = NULL;
3451 if (controlp != NULL)
3452 *controlp = NULL;
3453 if (flagsp != NULL)
3454 flags = *flagsp &~ MSG_EOR;
3455 else
3456 flags = 0;
3457 /*
3458 * Disallow functionality not currently supported
3459 */
3460 if (mp0 != NULL) {
3461 printf("%s mp0 not supported\n", __func__);
3462 error = EOPNOTSUPP;
3463 goto out;
3464 }
3465 if (psa != NULL) {
3466 printf("%s sockaddr not supported\n", __func__);
3467 error = EOPNOTSUPP;
3468 goto out;
3469 }
3470 if (controlp != NULL) {
3471 printf("%s control not supported\n", __func__);
3472 error = EOPNOTSUPP;
3473 goto out;
3474 }
3475
3476 /*
3477 * Sanity checks:
3478 * - Only supports don't wait flags
3479 * - Only support datagram sockets (could be extended to raw)
3480 * - Must be atomic
3481 * - Protocol must support packet chains
3482 * - The uio array is NULL (should we panic?)
3483 */
3484 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
3485 printf("%s flags not supported\n", __func__);
3486 error = EOPNOTSUPP;
3487 goto out;
3488 }
3489 if (so->so_type != SOCK_DGRAM) {
3490 error = EINVAL;
3491 goto out;
3492 }
3493 if (sosendallatonce(so) == 0) {
3494 error = EINVAL;
3495 goto out;
3496 }
3497 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3498 error = EPROTONOSUPPORT;
3499 goto out;
3500 }
3501 if (uioarray == NULL) {
3502 printf("%s uioarray is NULL\n", __func__);
3503 error = EINVAL;
3504 goto out;
3505 }
3506 if (uiocnt == 0) {
3507 printf("%s uiocnt is 0\n", __func__);
3508 error = EINVAL;
3509 goto out;
3510 }
3511 /*
3512 * Sanity check on the length passed by caller as we are making 'int'
3513 * comparisons
3514 */
3515 resid = orig_resid = uio_array_resid(uioarray, uiocnt);
3516 if (orig_resid < 0 || orig_resid > INT_MAX) {
3517 error = EINVAL;
3518 goto out;
3519 }
3520
3521 socket_lock(so, 1);
3522 so_update_last_owner_locked(so, p);
3523 so_update_policy(so);
3524
3525#if NECP
3526 so_update_necp_policy(so, NULL, NULL);
3527#endif /* NECP */
3528
3529 /*
3530 * If a recv attempt is made on a previously-accepted socket
3531 * that has been marked as inactive (disconnected), reject
3532 * the request.
3533 */
3534 if (so->so_flags & SOF_DEFUNCT) {
3535 struct sockbuf *sb = &so->so_rcv;
3536
3537 error = ENOTCONN;
3538 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
3539 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
3540 SOCK_DOM(so), SOCK_TYPE(so), error));
3541 /*
3542 * This socket should have been disconnected and flushed
3543 * prior to being returned from sodefunct(); there should
3544 * be no data on its receive list, so panic otherwise.
3545 */
3546 if (so->so_state & SS_DEFUNCT)
3547 sb_empty_assert(sb, __func__);
3548 goto release;
3549 }
3550 if (mp != NULL)
3551 *mp = NULL;
3552restart:
3553 /*
3554 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3555 * and if so just return to the caller. This could happen when
3556 * soreceive() is called by a socket upcall function during the
3557 * time the socket is freed. The socket buffer would have been
3558 * locked across the upcall, therefore we cannot put this thread
3559 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3560 * we may livelock), because the lock on the socket buffer will
3561 * only be released when the upcall routine returns to its caller.
3562 * Because the socket has been officially closed, there can be
3563 * no further read on it.
3564 */
3565 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3566 (SS_NOFDREF | SS_CANTRCVMORE)) {
3567 error = 0;
3568 goto release;
3569 }
3570
3571 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3572 if (error) {
3573 goto release;
3574 }
3575 sblocked = 1;
3576
3577 /*
3578 * Skip empty uio
3579 */
3580 auio = uioarray[i];
3581 while (uio_resid(auio) == 0) {
3582 i++;
3583 if (i >= uiocnt) {
3584 error = 0;
3585 goto release;
3586 }
3587 }
3588
3589 m = so->so_rcv.sb_mb;
3590 /*
3591 * Block awaiting more datagram if needed
3592 */
3593 if (m == NULL) {
3594 /*
3595 * Panic if we notice inconsistencies in the socket's
3596 * receive list; both sb_mb and sb_cc should correctly
3597 * reflect the contents of the list, otherwise we may
3598 * end up with false positives during select() or poll()
3599 * which could put the application in a bad state.
3600 */
3601 SB_MB_CHECK(&so->so_rcv);
3602
3603 if (so->so_error) {
3604 error = so->so_error;
3605 goto release;
3606 }
3607 if (so->so_state & SS_CANTRCVMORE) {
3608 goto release;
3609 }
3610 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3611 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3612 error = ENOTCONN;
3613 goto release;
3614 }
3615 if ((so->so_state & SS_NBIO) ||
3616 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3617 error = EWOULDBLOCK;
3618 goto release;
3619 }
3620 /*
3621 * Do not block if we got some data
3622 * Note: We could use MSG_WAITALL to wait
3623 */
3624 resid = uio_array_resid(uioarray, uiocnt);
3625 if (resid != orig_resid) {
3626 error = 0;
3627 goto release;
3628 }
3629
3630 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3631 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3632
3633 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3634 sblocked = 0;
3635
3636 error = sbwait(&so->so_rcv);
3637 if (error) {
3638 goto release;
3639 }
3640 goto restart;
3641 }
3642
3643 if (m->m_pkthdr.len == 0) {
3644 printf("%s so %llx pkt %llx len is null\n",
3645 __func__,
3646 (uint64_t)VM_KERNEL_ADDRPERM(so),
3647 (uint64_t)VM_KERNEL_ADDRPERM(m));
3648 goto restart;
3649 }
3650 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3651 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3652 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3653
3654 /*
3655 * Consume the current uio index as we have a datagram
3656 */
3657 i += 1;
3658 nextrecord = m->m_nextpkt;
3659
3660#if SO_RECEIVE_LIST_SOCKADDR_NOT_YET
3661 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3662 /*
3663 * to be adapted from soreceive()
3664 */
3665 }
3666#endif /* SO_RECEIVE_LIST_SOCKADDR_NOT_YET */
3667
3668#if SO_RECEIVE_LIST_CONTROL_NOT_YET
3669 /*
3670 * Process one or more MT_CONTROL mbufs present before any data mbufs
3671 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3672 * just copy the data; if !MSG_PEEK, we call into the protocol to
3673 * perform externalization.
3674 */
3675 if (m != NULL && m->m_type == MT_CONTROL) {
3676 /*
3677 * to be adapted from soreceive()
3678 */
3679 }
3680#endif /* SO_RECEIVE_LIST_CONTROL_NOT_YET */
3681
3682 offset = 0;
3683
3684 /*
3685 * Loop to copy out the mbufs of the current record
3686 */
3687 while (m != NULL && uio_resid(auio) > 0 && error == 0) {
3688 len = uio_resid(auio);
3689
3690 if (m->m_len == 0)
3691 printf("%s: so %llx m %llx m_len is 0\n",
3692 __func__,
3693 (uint64_t)VM_KERNEL_ADDRPERM(so),
3694 (uint64_t)VM_KERNEL_ADDRPERM(m));
3695
3696 /*
3697 * Clip to the residual length
3698 */
3699 if (len > m->m_len)
3700 len = m->m_len;
3701 /*
3702 * If mp is set, just pass back the mbufs.
3703 * Otherwise copy them out via the uio, then free.
3704 * Sockbuf must be consistent here (points to current mbuf,
3705 * it points to next record) when we drop priority;
3706 * we must note any additions to the sockbuf when we
3707 * block interrupts again.
3708 */
3709 if (mp != NULL) {
3710 uio_setresid(auio, (uio_resid(auio) - len));
3711 } else {
3712 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3713 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3714
3715 socket_unlock(so, 0);
3716 error = uiomove(mtod(m, caddr_t), (int)len, auio);
3717 socket_lock(so, 0);
3718
3719 if (error)
3720 goto release;
3721 }
3722 if (len == m->m_len) {
3723 /*
3724 * m was entirely copied
3725 */
3726 nextrecord = m->m_nextpkt;
3727 sbfree(&so->so_rcv, m);
3728 m->m_nextpkt = NULL;
3729
3730 /*
3731 * Move to m_next
3732 */
3733 if (mp != NULL) {
3734 *mp = m;
3735 mp = &m->m_next;
3736 so->so_rcv.sb_mb = m = m->m_next;
3737 *mp = NULL;
3738 } else {
3739 if (free_list == NULL)
3740 free_list = m;
3741 else
3742 ml->m_next = m;
3743 ml = m;
3744 so->so_rcv.sb_mb = m = m->m_next;
3745 ml->m_next = NULL;
3746 ml->m_nextpkt = NULL;
3747 }
3748 if (m != NULL) {
3749 m->m_nextpkt = nextrecord;
3750 if (nextrecord == NULL)
3751 so->so_rcv.sb_lastrecord = m;
3752 } else {
3753 so->so_rcv.sb_mb = nextrecord;
3754 SB_EMPTY_FIXUP(&so->so_rcv);
3755 }
3756 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3757 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3758 } else {
3759 /*
3760 * Stop the loop on partial copy
3761 */
3762 if (mp != NULL) {
3763 int copy_flag;
3764
3765 if (flags & MSG_DONTWAIT)
3766 copy_flag = M_DONTWAIT;
3767 else
3768 copy_flag = M_WAIT;
3769 *mp = m_copym(m, 0, len, copy_flag);
3770 /*
3771 * Failed to allocate an mbuf?
3772 * Adjust uio_resid back, it was
3773 * adjusted down by len bytes which
3774 * we didn't copy over.
3775 */
3776 if (*mp == NULL) {
3777 uio_setresid(auio,
3778 (uio_resid(auio) + len));
3779 error = ENOMEM;
3780 break;
3781 }
3782 }
3783 break;
3784 }
3785 }
3786#ifdef MORE_LOCKING_DEBUG
3787 if (so->so_usecount <= 1) {
3788 panic("%s: after big while so=%llx ref=%d on socket\n",
3789 __func__,
3790 (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_usecount);
3791 /* NOTREACHED */
3792 }
3793#endif
3794 /*
3795 * Tell the caller we made a partial copy
3796 */
3797 if (m != NULL) {
3798 if (so->so_options & SO_DONTTRUNC) {
3799 m->m_data += len;
3800 m->m_len -= len;
3801 so->so_rcv.sb_cc -= len;
3802 flags |= MSG_RCVMORE;
3803 } else {
3804 (void) sbdroprecord(&so->so_rcv);
3805 nextrecord = so->so_rcv.sb_mb;
3806 m = NULL;
3807 flags |= MSG_TRUNC;
3808 }
3809 }
3810
3811 if (m == NULL) {
3812 so->so_rcv.sb_mb = nextrecord;
3813 /*
3814 * First part is an inline SB_EMPTY_FIXUP(). Second
3815 * part makes sure sb_lastrecord is up-to-date if
3816 * there is still data in the socket buffer.
3817 */
3818 if (so->so_rcv.sb_mb == NULL) {
3819 so->so_rcv.sb_mbtail = NULL;
3820 so->so_rcv.sb_lastrecord = NULL;
3821 } else if (nextrecord->m_nextpkt == NULL) {
3822 so->so_rcv.sb_lastrecord = nextrecord;
3823 }
3824 SB_MB_CHECK(&so->so_rcv);
3825 }
3826 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3827 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3828
3829 /*
3830 * We can continue to the next packet as long as:
3831 * - We haven't exhausted the uio array
3832 * - There was no error
3833 * - A packet was not truncated
3834 * - We can still receive more data
3835 */
3836 if (i < uiocnt && error == 0 &&
3837 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0
3838 && (so->so_state & SS_CANTRCVMORE) == 0) {
3839 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3840 sblocked = 0;
3841
3842 goto restart;
3843 }
3844
3845release:
3846 /*
3847 * pru_rcvd may cause more data to be received if the socket lock
3848 * is dropped so we set MSG_HAVEMORE now based on what we know.
3849 * That way the caller won't be surprised if it receives less data than requested.
3850 */
3851 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3852 flags |= MSG_HAVEMORE;
3853
3854 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3855 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3856
3857 if (flagsp != NULL)
3858 *flagsp |= flags;
3859 if (sblocked)
3860 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3861 else
3862 socket_unlock(so, 1);
3863out:
3864 /*
3865 * Amortize the cost
3866 */
3867 if (free_list != NULL)
3868 m_freem_list(free_list);
3869
3870 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
3871 0, 0, 0, 0);
3872 return (error);
3873}
3874
3875/*
3876 * Returns: 0 Success
3877 * EINVAL
3878 * ENOTCONN
3879 * <pru_shutdown>:EINVAL
3880 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
3881 * <pru_shutdown>:ENOBUFS[TCP]
3882 * <pru_shutdown>:EMSGSIZE[TCP]
3883 * <pru_shutdown>:EHOSTUNREACH[TCP]
3884 * <pru_shutdown>:ENETUNREACH[TCP]
3885 * <pru_shutdown>:ENETDOWN[TCP]
3886 * <pru_shutdown>:ENOMEM[TCP]
3887 * <pru_shutdown>:EACCES[TCP]
3888 * <pru_shutdown>:EMSGSIZE[TCP]
3889 * <pru_shutdown>:ENOBUFS[TCP]
3890 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
3891 * <pru_shutdown>:??? [other protocol families]
3892 */
3893int
3894soshutdown(struct socket *so, int how)
3895{
3896 int error;
3897
3898 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
3899
3900 switch (how) {
3901 case SHUT_RD:
3902 case SHUT_WR:
3903 case SHUT_RDWR:
3904 socket_lock(so, 1);
3905 if ((so->so_state &
3906 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
3907 error = ENOTCONN;
2d21ac55
A
3908 } else {
3909 error = soshutdownlock(so, how);
3910 }
3911 socket_unlock(so, 1);
3912 break;
3913 default:
3914 error = EINVAL;
3915 break;
55e303ae 3916 }
55e303ae 3917
fe8ab488
A
3918 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
3919
55e303ae
A
3920 return (error);
3921}
3922
1c79356b 3923int
fe8ab488 3924soshutdownlock_final(struct socket *so, int how)
1c79356b 3925{
2d21ac55
A
3926 struct protosw *pr = so->so_proto;
3927 int error = 0;
1c79356b 3928
91447636 3929 sflt_notify(so, sock_evt_shutdown, &how);
1c79356b 3930
9bccf70c 3931 if (how != SHUT_WR) {
2d21ac55
A
3932 if ((so->so_state & SS_CANTRCVMORE) != 0) {
3933 /* read already shut down */
3934 error = ENOTCONN;
3935 goto done;
3936 }
1c79356b
A
3937 sorflush(so);
3938 postevent(so, 0, EV_RCLOSED);
3939 }
9bccf70c 3940 if (how != SHUT_RD) {
2d21ac55
A
3941 if ((so->so_state & SS_CANTSENDMORE) != 0) {
3942 /* write already shut down */
3943 error = ENOTCONN;
3944 goto done;
3945 }
3946 error = (*pr->pr_usrreqs->pru_shutdown)(so);
3947 postevent(so, 0, EV_WCLOSED);
1c79356b 3948 }
2d21ac55 3949done:
fe8ab488
A
3950 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
3951 return (error);
3952}
3953
/*
 * Shut down a socket, giving an attached content filter the first say.
 *
 * When built with CONTENT_FILTER and the socket has SOF_CONTENT_FILTER
 * set, cfil_sock_shutdown() is consulted first: EJUSTRETURN from it is
 * reported to the caller as success without shutting down here, and any
 * other error is returned as is.  Otherwise the work is done by
 * soshutdownlock_final().
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		int cfil_error = cfil_sock_shutdown(so, &how);

		if (cfil_error == EJUSTRETURN)
			return (0);
		if (cfil_error != 0)
			return (cfil_error);
	}
#endif /* CONTENT_FILTER */

	return (soshutdownlock_final(so, how));
}
3980
39236c6e
A
3981void
3982sowflush(struct socket *so)
3983{
3984 struct sockbuf *sb = &so->so_snd;
3985#ifdef notyet
3986 lck_mtx_t *mutex_held;
3987 /*
3988 * XXX: This code is currently commented out, because we may get here
3989 * as part of sofreelastref(), and at that time, pr_getlock() may no
3990 * longer be able to return us the lock; this will be fixed in future.
3991 */
3992 if (so->so_proto->pr_getlock != NULL)
3993 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3994 else
3995 mutex_held = so->so_proto->pr_domain->dom_mtx;
3996
3997 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3998#endif /* notyet */
3999
4000 /*
4001 * Obtain lock on the socket buffer (SB_LOCK). This is required
4002 * to prevent the socket buffer from being unexpectedly altered
4003 * while it is used by another thread in socket send/receive.
4004 *
4005 * sblock() must not fail here, hence the assertion.
4006 */
4007 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4008 VERIFY(sb->sb_flags & SB_LOCK);
4009
4010 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4011 sb->sb_flags |= SB_DROP;
4012 sb->sb_upcall = NULL;
4013 sb->sb_upcallarg = NULL;
4014
4015 sbunlock(sb, TRUE); /* keep socket locked */
4016
4017 selthreadclear(&sb->sb_sel);
4018 sbrelease(sb);
4019}
4020
1c79356b 4021void
2d21ac55 4022sorflush(struct socket *so)
1c79356b 4023{
39236c6e
A
4024 struct sockbuf *sb = &so->so_rcv;
4025 struct protosw *pr = so->so_proto;
1c79356b 4026 struct sockbuf asb;
39236c6e 4027#ifdef notyet
2d21ac55 4028 lck_mtx_t *mutex_held;
39236c6e
A
4029 /*
4030 * XXX: This code is currently commented out, because we may get here
4031 * as part of sofreelastref(), and at that time, pr_getlock() may no
4032 * longer be able to return us the lock; this will be fixed in future.
4033 */
2d21ac55 4034 if (so->so_proto->pr_getlock != NULL)
91447636 4035 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2d21ac55 4036 else
91447636 4037 mutex_held = so->so_proto->pr_domain->dom_mtx;
39236c6e 4038
91447636 4039 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
39236c6e 4040#endif /* notyet */
91447636
A
4041
4042 sflt_notify(so, sock_evt_flush_read, NULL);
1c79356b 4043
1c79356b 4044 socantrcvmore(so);
39236c6e
A
4045
4046 /*
4047 * Obtain lock on the socket buffer (SB_LOCK). This is required
4048 * to prevent the socket buffer from being unexpectedly altered
4049 * while it is used by another thread in socket send/receive.
4050 *
4051 * sblock() must not fail here, hence the assertion.
4052 */
4053 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4054 VERIFY(sb->sb_flags & SB_LOCK);
4055
4056 /*
4057 * Copy only the relevant fields from "sb" to "asb" which we
4058 * need for sbrelease() to function. In particular, skip
4059 * sb_sel as it contains the wait queue linkage, which would
4060 * wreak havoc if we were to issue selthreadclear() on "asb".
4061 * Make sure to not carry over SB_LOCK in "asb", as we need
4062 * to acquire it later as part of sbrelease().
4063 */
4064 bzero(&asb, sizeof (asb));
4065 asb.sb_cc = sb->sb_cc;
4066 asb.sb_hiwat = sb->sb_hiwat;
4067 asb.sb_mbcnt = sb->sb_mbcnt;
4068 asb.sb_mbmax = sb->sb_mbmax;
4069 asb.sb_ctl = sb->sb_ctl;
4070 asb.sb_lowat = sb->sb_lowat;
4071 asb.sb_mb = sb->sb_mb;
4072 asb.sb_mbtail = sb->sb_mbtail;
4073 asb.sb_lastrecord = sb->sb_lastrecord;
4074 asb.sb_so = sb->sb_so;
4075 asb.sb_flags = sb->sb_flags;
4076 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4077 asb.sb_flags |= SB_DROP;
4078
4079 /*
4080 * Ideally we'd bzero() these and preserve the ones we need;
4081 * but to do that we'd need to shuffle things around in the
4082 * sockbuf, and we can't do it now because there are KEXTS
4083 * that are directly referring to the socket structure.
4084 *
4085 * Setting SB_DROP acts as a barrier to prevent further appends.
4086 * Clearing SB_SEL is done for selthreadclear() below.
4087 */
4088 sb->sb_cc = 0;
4089 sb->sb_hiwat = 0;
4090 sb->sb_mbcnt = 0;
4091 sb->sb_mbmax = 0;
4092 sb->sb_ctl = 0;
4093 sb->sb_lowat = 0;
4094 sb->sb_mb = NULL;
4095 sb->sb_mbtail = NULL;
4096 sb->sb_lastrecord = NULL;
4097 sb->sb_timeo.tv_sec = 0;
4098 sb->sb_timeo.tv_usec = 0;
4099 sb->sb_upcall = NULL;
4100 sb->sb_upcallarg = NULL;
4101 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4102 sb->sb_flags |= SB_DROP;
4103
4104 sbunlock(sb, TRUE); /* keep socket locked */
4105
4106 /*
4107 * Note that selthreadclear() is called on the original "sb" and
4108 * not the local "asb" because of the way wait queue linkage is
4109 * implemented. Given that selwakeup() may be triggered, SB_SEL
4110 * should no longer be set (cleared above.)
4111 */
0b4e3aa0 4112 selthreadclear(&sb->sb_sel);
39236c6e
A
4113
4114 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
1c79356b 4115 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
39236c6e 4116
1c79356b
A
4117 sbrelease(&asb);
4118}
4119
4120/*
4121 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4122 * an additional variant to handle the case where the option value needs
4123 * to be some kind of integer, but not a specific size.
4124 * In addition to their use here, these functions are also called by the
4125 * protocol-level pr_ctloutput() routines.
2d21ac55
A
4126 *
4127 * Returns: 0 Success
4128 * EINVAL
4129 * copyin:EFAULT
1c79356b
A
4130 */
4131int
2d21ac55 4132sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
1c79356b
A
4133{
4134 size_t valsize;
4135
4136 /*
4137 * If the user gives us more than we wanted, we ignore it,
4138 * but if we don't get the minimum length the caller
4139 * wants, we return EINVAL. On success, sopt->sopt_valsize
4140 * is set to however much we actually retrieved.
4141 */
4142 if ((valsize = sopt->sopt_valsize) < minlen)
2d21ac55 4143 return (EINVAL);
1c79356b
A
4144 if (valsize > len)
4145 sopt->sopt_valsize = valsize = len;
4146
b0d623f7 4147 if (sopt->sopt_p != kernproc)
1c79356b
A
4148 return (copyin(sopt->sopt_val, buf, valsize));
4149
91447636 4150 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
2d21ac55
A
4151 return (0);
4152}
4153
4154/*
4155 * sooptcopyin_timeval
4156 * Copy in a timeval value into tv_p, and take into account whether the
4157 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4158 * code here so that we can verify the 64-bit tv_sec value before we lose
4159 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4160 */
4161static int
39236c6e 4162sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
2d21ac55
A
4163{
4164 int error;
b0d623f7 4165
2d21ac55 4166 if (proc_is64bit(sopt->sopt_p)) {
b0d623f7 4167 struct user64_timeval tv64;
2d21ac55 4168
39236c6e 4169 if (sopt->sopt_valsize < sizeof (tv64))
2d21ac55 4170 return (EINVAL);
39236c6e
A
4171
4172 sopt->sopt_valsize = sizeof (tv64);
b0d623f7 4173 if (sopt->sopt_p != kernproc) {
39236c6e 4174 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
b0d623f7
A
4175 if (error != 0)
4176 return (error);
4177 } else {
4178 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
39236c6e 4179 sizeof (tv64));
2d21ac55 4180 }
39236c6e
A
4181 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4182 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
2d21ac55 4183 return (EDOM);
39236c6e 4184
2d21ac55
A
4185 tv_p->tv_sec = tv64.tv_sec;
4186 tv_p->tv_usec = tv64.tv_usec;
4187 } else {
b0d623f7
A
4188 struct user32_timeval tv32;
4189
39236c6e 4190 if (sopt->sopt_valsize < sizeof (tv32))
2d21ac55 4191 return (EINVAL);
39236c6e
A
4192
4193 sopt->sopt_valsize = sizeof (tv32);
b0d623f7 4194 if (sopt->sopt_p != kernproc) {
39236c6e 4195 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
2d21ac55
A
4196 if (error != 0) {
4197 return (error);
4198 }
4199 } else {
b0d623f7 4200 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
39236c6e 4201 sizeof (tv32));
2d21ac55 4202 }
39236c6e
A
4203#ifndef __LP64__
4204 /*
4205 * K64todo "comparison is always false due to
4206 * limited range of data type"
4207 */
4208 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4209 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
2d21ac55 4210 return (EDOM);
b0d623f7
A
4211#endif
4212 tv_p->tv_sec = tv32.tv_sec;
4213 tv_p->tv_usec = tv32.tv_usec;
2d21ac55
A
4214 }
4215 return (0);
1c79356b
A
4216}
4217
2d21ac55
A
4218/*
4219 * Returns: 0 Success
4220 * EINVAL
4221 * ENOPROTOOPT
4222 * ENOBUFS
4223 * EDOM
4224 * sooptcopyin:EINVAL
4225 * sooptcopyin:EFAULT
4226 * sooptcopyin_timeval:EINVAL
4227 * sooptcopyin_timeval:EFAULT
4228 * sooptcopyin_timeval:EDOM
4229 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4230 * <pr_ctloutput>:???w
4231 * sflt_attach_private:??? [whatever a filter author chooses]
4232 * <sf_setoption>:??? [whatever a filter author chooses]
4233 *
4234 * Notes: Other <pru_listen> returns depend on the protocol family; all
4235 * <sf_listen> returns depend on what the filter author causes
4236 * their filter to return.
4237 */
1c79356b 4238int
39236c6e 4239sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
1c79356b
A
4240{
4241 int error, optval;
4242 struct linger l;
4243 struct timeval tv;
2d21ac55
A
4244#if CONFIG_MACF_SOCKET
4245 struct mac extmac;
4246#endif /* MAC_SOCKET */
91447636 4247
39236c6e
A
4248 if (sopt->sopt_dir != SOPT_SET)
4249 sopt->sopt_dir = SOPT_SET;
4250
4251 if (dolock)
4252 socket_lock(so, 1);
4253
4254 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4255 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
b0d623f7 4256 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
2d21ac55
A
4257 /* the socket has been shutdown, no more sockopt's */
4258 error = EINVAL;
39236c6e 4259 goto out;
9bccf70c
A
4260 }
4261
6d2010ae 4262 error = sflt_setsockopt(so, sopt);
39236c6e 4263 if (error != 0) {
6d2010ae
A
4264 if (error == EJUSTRETURN)
4265 error = 0;
39236c6e 4266 goto out;
1c79356b
A
4267 }
4268
1c79356b 4269 if (sopt->sopt_level != SOL_SOCKET) {
39236c6e
A
4270 if (so->so_proto != NULL &&
4271 so->so_proto->pr_ctloutput != NULL) {
2d21ac55 4272 error = (*so->so_proto->pr_ctloutput)(so, sopt);
39236c6e 4273 goto out;
91447636 4274 }
1c79356b
A
4275 error = ENOPROTOOPT;
4276 } else {
39236c6e
A
4277 /*
4278 * Allow socket-level (SOL_SOCKET) options to be filtered by
4279 * the protocol layer, if needed. A zero value returned from
4280 * the handler means use default socket-level processing as
4281 * done by the rest of this routine. Otherwise, any other
4282 * return value indicates that the option is unsupported.
4283 */
4284 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4285 pru_socheckopt(so, sopt)) != 0)
4286 goto out;
4287
4288 error = 0;
1c79356b
A
4289 switch (sopt->sopt_name) {
4290 case SO_LINGER:
91447636 4291 case SO_LINGER_SEC:
2d21ac55 4292 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
39236c6e
A
4293 if (error != 0)
4294 goto out;
1c79356b 4295
2d21ac55
A
4296 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4297 l.l_linger : l.l_linger * hz;
39236c6e 4298 if (l.l_onoff != 0)
1c79356b
A
4299 so->so_options |= SO_LINGER;
4300 else
4301 so->so_options &= ~SO_LINGER;
4302 break;
4303
4304 case SO_DEBUG:
4305 case SO_KEEPALIVE:
4306 case SO_DONTROUTE:
4307 case SO_USELOOPBACK:
4308 case SO_BROADCAST:
4309 case SO_REUSEADDR:
4310 case SO_REUSEPORT:
4311 case SO_OOBINLINE:
4312 case SO_TIMESTAMP:
6d2010ae 4313 case SO_TIMESTAMP_MONOTONIC:
1c79356b
A
4314 case SO_DONTTRUNC:
4315 case SO_WANTMORE:
9bccf70c 4316 case SO_WANTOOBFLAG:
fe8ab488 4317 case SO_NOWAKEFROMSLEEP:
2d21ac55
A
4318 error = sooptcopyin(sopt, &optval, sizeof (optval),
4319 sizeof (optval));
39236c6e
A
4320 if (error != 0)
4321 goto out;
1c79356b
A
4322 if (optval)
4323 so->so_options |= sopt->sopt_name;
4324 else
4325 so->so_options &= ~sopt->sopt_name;
4326 break;
4327
4328 case SO_SNDBUF:
4329 case SO_RCVBUF:
4330 case SO_SNDLOWAT:
4331 case SO_RCVLOWAT:
2d21ac55
A
4332 error = sooptcopyin(sopt, &optval, sizeof (optval),
4333 sizeof (optval));
39236c6e
A
4334 if (error != 0)
4335 goto out;
1c79356b
A
4336
4337 /*
4338 * Values < 1 make no sense for any of these
4339 * options, so disallow them.
4340 */
4341 if (optval < 1) {
4342 error = EINVAL;
39236c6e 4343 goto out;
1c79356b
A
4344 }
4345
4346 switch (sopt->sopt_name) {
4347 case SO_SNDBUF:
39236c6e
A
4348 case SO_RCVBUF: {
4349 struct sockbuf *sb =
4350 (sopt->sopt_name == SO_SNDBUF) ?
4351 &so->so_snd : &so->so_rcv;
4352 if (sbreserve(sb, (u_int32_t)optval) == 0) {
1c79356b 4353 error = ENOBUFS;
39236c6e 4354 goto out;
1c79356b 4355 }
316670eb
A
4356 sb->sb_flags |= SB_USRSIZE;
4357 sb->sb_flags &= ~SB_AUTOSIZE;
4358 sb->sb_idealsize = (u_int32_t)optval;
1c79356b 4359 break;
316670eb 4360 }
1c79356b
A
4361 /*
4362 * Make sure the low-water is never greater than
4363 * the high-water.
4364 */
fe8ab488
A
4365 case SO_SNDLOWAT: {
4366 int space = sbspace(&so->so_snd);
4367 u_int32_t hiwat = so->so_snd.sb_hiwat;
4368
4369 if (so->so_snd.sb_flags & SB_UNIX) {
4370 struct unpcb *unp =
4371 (struct unpcb *)(so->so_pcb);
4372 if (unp != NULL && unp->unp_conn != NULL) {
4373 hiwat += unp->unp_conn->unp_cc;
4374 }
4375 }
4376
1c79356b 4377 so->so_snd.sb_lowat =
fe8ab488
A
4378 (optval > hiwat) ?
4379 hiwat : optval;
4380
4381 if (space >= so->so_snd.sb_lowat) {
4382 sowwakeup(so);
4383 }
1c79356b 4384 break;
fe8ab488
A
4385 }
4386 case SO_RCVLOWAT: {
4387 int64_t data_len;
1c79356b
A
4388 so->so_rcv.sb_lowat =
4389 (optval > so->so_rcv.sb_hiwat) ?
4390 so->so_rcv.sb_hiwat : optval;
fe8ab488
A
4391 data_len = so->so_rcv.sb_cc
4392 - so->so_rcv.sb_ctl;
4393 if (data_len >= so->so_rcv.sb_lowat)
4394 sorwakeup(so);
1c79356b
A
4395 break;
4396 }
fe8ab488 4397 }
1c79356b
A
4398 break;
4399
4400 case SO_SNDTIMEO:
4401 case SO_RCVTIMEO:
2d21ac55 4402 error = sooptcopyin_timeval(sopt, &tv);
39236c6e
A
4403 if (error != 0)
4404 goto out;
1c79356b 4405
1c79356b
A
4406 switch (sopt->sopt_name) {
4407 case SO_SNDTIMEO:
91447636 4408 so->so_snd.sb_timeo = tv;
1c79356b
A
4409 break;
4410 case SO_RCVTIMEO:
91447636 4411 so->so_rcv.sb_timeo = tv;
1c79356b
A
4412 break;
4413 }
4414 break;
4415
39236c6e 4416 case SO_NKE: {
9bccf70c 4417 struct so_nke nke;
1c79356b 4418
2d21ac55
A
4419 error = sooptcopyin(sopt, &nke, sizeof (nke),
4420 sizeof (nke));
39236c6e
A
4421 if (error != 0)
4422 goto out;
1c79356b 4423
6d2010ae 4424 error = sflt_attach_internal(so, nke.nke_handle);
1c79356b
A
4425 break;
4426 }
4427
9bccf70c 4428 case SO_NOSIGPIPE:
2d21ac55
A
4429 error = sooptcopyin(sopt, &optval, sizeof (optval),
4430 sizeof (optval));
39236c6e
A
4431 if (error != 0)
4432 goto out;
4433 if (optval != 0)
2d21ac55
A
4434 so->so_flags |= SOF_NOSIGPIPE;
4435 else
4436 so->so_flags &= ~SOF_NOSIGPIPE;
9bccf70c
A
4437 break;
4438
55e303ae 4439 case SO_NOADDRERR:
2d21ac55
A
4440 error = sooptcopyin(sopt, &optval, sizeof (optval),
4441 sizeof (optval));
39236c6e
A
4442 if (error != 0)
4443 goto out;
4444 if (optval != 0)
2d21ac55
A
4445 so->so_flags |= SOF_NOADDRAVAIL;
4446 else
4447 so->so_flags &= ~SOF_NOADDRAVAIL;
2d21ac55
A
4448 break;
4449
4450 case SO_REUSESHAREUID:
4451 error = sooptcopyin(sopt, &optval, sizeof (optval),
4452 sizeof (optval));
39236c6e
A
4453 if (error != 0)
4454 goto out;
4455 if (optval != 0)
2d21ac55
A
4456 so->so_flags |= SOF_REUSESHAREUID;
4457 else
4458 so->so_flags &= ~SOF_REUSESHAREUID;
4459 break;
39236c6e 4460
2d21ac55
A
4461 case SO_NOTIFYCONFLICT:
4462 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4463 error = EPERM;
39236c6e 4464 goto out;
2d21ac55
A
4465 }
4466 error = sooptcopyin(sopt, &optval, sizeof (optval),
4467 sizeof (optval));
39236c6e
A
4468 if (error != 0)
4469 goto out;
4470 if (optval != 0)
2d21ac55
A
4471 so->so_flags |= SOF_NOTIFYCONFLICT;
4472 else
4473 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4474 break;
39236c6e 4475
2d21ac55 4476 case SO_RESTRICTIONS:
2d21ac55
A
4477 error = sooptcopyin(sopt, &optval, sizeof (optval),
4478 sizeof (optval));
39236c6e
A
4479 if (error != 0)
4480 goto out;
4481
4482 error = so_set_restrictions(so, optval);
2d21ac55
A
4483 break;
4484
fe8ab488
A
4485 case SO_AWDL_UNRESTRICTED:
4486 if (SOCK_DOM(so) != PF_INET &&
4487 SOCK_DOM(so) != PF_INET6) {
4488 error = EOPNOTSUPP;
4489 goto out;
4490 }
4491 error = sooptcopyin(sopt, &optval, sizeof(optval),
4492 sizeof(optval));
4493 if (error != 0)
4494 goto out;
4495 if (optval != 0) {
4496 kauth_cred_t cred = NULL;
4497 proc_t ep = PROC_NULL;
4498
4499 if (so->so_flags & SOF_DELEGATED) {
4500 ep = proc_find(so->e_pid);
4501 if (ep)
4502 cred = kauth_cred_proc_ref(ep);
4503 }
4504 error = priv_check_cred(
4505 cred ? cred : so->so_cred,
4506 PRIV_NET_RESTRICTED_AWDL, 0);
4507 if (error == 0)
4508 inp_set_awdl_unrestricted(
4509 sotoinpcb(so));
4510 if (cred)
4511 kauth_cred_unref(&cred);
4512 if (ep != PROC_NULL)
4513 proc_rele(ep);
4514 } else
4515 inp_clear_awdl_unrestricted(sotoinpcb(so));
4516 break;
4517
2d21ac55
A
4518 case SO_LABEL:
4519#if CONFIG_MACF_SOCKET
4520 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4521 sizeof (extmac))) != 0)
39236c6e 4522 goto out;
2d21ac55
A
4523
4524 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
4525 so, &extmac);
4526#else
4527 error = EOPNOTSUPP;
4528#endif /* MAC_SOCKET */
55e303ae
A
4529 break;
4530
4a3eedf9
A
4531 case SO_UPCALLCLOSEWAIT:
4532 error = sooptcopyin(sopt, &optval, sizeof (optval),
4533 sizeof (optval));
39236c6e
A
4534 if (error != 0)
4535 goto out;
4536 if (optval != 0)
4a3eedf9
A
4537 so->so_flags |= SOF_UPCALLCLOSEWAIT;
4538 else
4539 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
4540 break;
4a3eedf9 4541
b0d623f7
A
4542 case SO_RANDOMPORT:
4543 error = sooptcopyin(sopt, &optval, sizeof (optval),
4544 sizeof (optval));
39236c6e
A
4545 if (error != 0)
4546 goto out;
4547 if (optval != 0)
b0d623f7
A
4548 so->so_flags |= SOF_BINDRANDOMPORT;
4549 else
4550 so->so_flags &= ~SOF_BINDRANDOMPORT;
4551 break;
4552
4553 case SO_NP_EXTENSIONS: {
4554 struct so_np_extensions sonpx;
4555
39236c6e
A
4556 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
4557 sizeof (sonpx));
4558 if (error != 0)
4559 goto out;
b0d623f7
A
4560 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
4561 error = EINVAL;
39236c6e 4562 goto out;
b0d623f7
A
4563 }
4564 /*
4565 * Only one bit defined for now
4566 */
4567 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
4568 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
4569 so->so_flags |= SOF_NPX_SETOPTSHUT;
4570 else
4571 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
4572 }
4573 break;
4574 }
4575
d41d1dae
A
4576 case SO_TRAFFIC_CLASS: {
4577 error = sooptcopyin(sopt, &optval, sizeof (optval),
39236c6e
A
4578 sizeof (optval));
4579 if (error != 0)
4580 goto out;
6d2010ae 4581 error = so_set_traffic_class(so, optval);
39236c6e
A
4582 if (error != 0)
4583 goto out;
6d2010ae 4584 break;
d41d1dae 4585 }
6d2010ae
A
4586
4587 case SO_RECV_TRAFFIC_CLASS: {
4588 error = sooptcopyin(sopt, &optval, sizeof (optval),
39236c6e
A
4589 sizeof (optval));
4590 if (error != 0)
4591 goto out;
6d2010ae
A
4592 if (optval == 0)
4593 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
4594 else
4595 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
4596 break;
4597 }
316670eb 4598
6d2010ae
A
4599 case SO_TRAFFIC_CLASS_DBG: {
4600 struct so_tcdbg so_tcdbg;
316670eb
A
4601
4602 error = sooptcopyin(sopt, &so_tcdbg,
4603 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
39236c6e
A
4604 if (error != 0)
4605 goto out;
6d2010ae 4606 error = so_set_tcdbg(so, &so_tcdbg);
39236c6e
A
4607 if (error != 0)
4608 goto out;
6d2010ae
A
4609 break;
4610 }
316670eb
A
4611
4612 case SO_PRIVILEGED_TRAFFIC_CLASS:
4613 error = priv_check_cred(kauth_cred_get(),
4614 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
39236c6e
A
4615 if (error != 0)
4616 goto out;
316670eb 4617 error = sooptcopyin(sopt, &optval, sizeof (optval),
39236c6e
A
4618 sizeof (optval));
4619 if (error != 0)
4620 goto out;
316670eb
A
4621 if (optval == 0)
4622 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
4623 else
4624 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
4625 break;
4626
6d2010ae
A
4627 case SO_DEFUNCTOK:
4628 error = sooptcopyin(sopt, &optval, sizeof (optval),
4629 sizeof (optval));
4630 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
4631 if (error == 0)
4632 error = EBADF;
39236c6e 4633 goto out;
6d2010ae
A
4634 }
4635 /*
4636 * Any process can set SO_DEFUNCTOK (clear
4637 * SOF_NODEFUNCT), but only root can clear
4638 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
4639 */
4640 if (optval == 0 &&
4641 kauth_cred_issuser(kauth_cred_get()) == 0) {
4642 error = EPERM;
39236c6e 4643 goto out;
6d2010ae
A
4644 }
4645 if (optval)
4646 so->so_flags &= ~SOF_NODEFUNCT;
4647 else
4648 so->so_flags |= SOF_NODEFUNCT;
4649
39236c6e
A
4650 if (SOCK_DOM(so) == PF_INET ||
4651 SOCK_DOM(so) == PF_INET6) {
4652 char s[MAX_IPv6_STR_LEN];
4653 char d[MAX_IPv6_STR_LEN];
4654 struct inpcb *inp = sotoinpcb(so);
4655
4656 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
4657 "%s:%d] is now marked as %seligible for "
4658 "defunct\n", __func__, proc_selfpid(),
4659 (uint64_t)VM_KERNEL_ADDRPERM(so),
4660 (SOCK_TYPE(so) == SOCK_STREAM) ?
4661 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
4662 ((SOCK_DOM(so) == PF_INET) ?
4663 (void *)&inp->inp_laddr.s_addr :
4664 (void *)&inp->in6p_laddr), s, sizeof (s)),
4665 ntohs(inp->in6p_lport),
4666 inet_ntop(SOCK_DOM(so),
4667 (SOCK_DOM(so) == PF_INET) ?
4668 (void *)&inp->inp_faddr.s_addr :
4669 (void *)&inp->in6p_faddr, d, sizeof (d)),
4670 ntohs(inp->in6p_fport),
4671 (so->so_flags & SOF_NODEFUNCT) ?
4672 "not " : ""));
4673 } else {
4674 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
4675 "now marked as %seligible for defunct\n",
4676 __func__, proc_selfpid(),
4677 (uint64_t)VM_KERNEL_ADDRPERM(so),
4678 SOCK_DOM(so), SOCK_TYPE(so),
4679 (so->so_flags & SOF_NODEFUNCT) ?
4680 "not " : ""));
4681 }
6d2010ae
A
4682 break;
4683
4684 case SO_ISDEFUNCT:
4685 /* This option is not settable */
4686 error = EINVAL;
4687 break;
d41d1dae 4688
316670eb
A
4689 case SO_OPPORTUNISTIC:
4690 error = sooptcopyin(sopt, &optval, sizeof (optval),
4691 sizeof (optval));
4692 if (error == 0)
4693 error = so_set_opportunistic(so, optval);
4694 break;
4695
4696 case SO_FLUSH:
4697 /* This option is handled by lower layer(s) */
4698 error = 0;
4699 break;
4700
4701 case SO_RECV_ANYIF:
4702 error = sooptcopyin(sopt, &optval, sizeof (optval),
4703 sizeof (optval));
4704 if (error == 0)
4705 error = so_set_recv_anyif(so, optval);
4706 break;
4707
39236c6e
A
4708 case SO_TRAFFIC_MGT_BACKGROUND: {
4709 /* This option is handled by lower layer(s) */
4710 error = 0;
4711 break;
4712 }
4713
4714#if FLOW_DIVERT
4715 case SO_FLOW_DIVERT_TOKEN:
4716 error = flow_divert_token_set(so, sopt);
4717 break;
4718#endif /* FLOW_DIVERT */
4719
4720
4721 case SO_DELEGATED:
4722 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
4723 sizeof (optval))) != 0)
4724 break;
4725
4726 error = so_set_effective_pid(so, optval, sopt->sopt_p);
4727 break;
4728
4729 case SO_DELEGATED_UUID: {
4730 uuid_t euuid;
4731
4732 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
4733 sizeof (euuid))) != 0)
4734 break;
4735
4736 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
4737 break;
4738 }
fe8ab488
A
4739
4740#if NECP
4741 case SO_NECP_ATTRIBUTES:
4742 error = necp_set_socket_attributes(so, sopt);
4743 break;
4744#endif /* NECP */
4745
4746#if MPTCP
4747 case SO_MPTCP_FASTJOIN:
4748 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
4749 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
4750 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
4751 error = ENOPROTOOPT;
4752 break;
4753 }
4754
4755 error = sooptcopyin(sopt, &optval, sizeof (optval),
4756 sizeof (optval));
4757 if (error != 0)
4758 goto out;
4759 if (optval == 0)
4760 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
4761 else
4762 so->so_flags |= SOF_MPTCP_FASTJOIN;
4763 break;
4764#endif /* MPTCP */
39236c6e 4765
1c79356b
A
4766 default:
4767 error = ENOPROTOOPT;
4768 break;
4769 }
39236c6e
A
4770 if (error == 0 && so->so_proto != NULL &&
4771 so->so_proto->pr_ctloutput != NULL) {
4772 (void) so->so_proto->pr_ctloutput(so, sopt);
1c79356b
A
4773 }
4774 }
39236c6e
A
4775out:
4776 if (dolock)
4777 socket_unlock(so, 1);
1c79356b
A
4778 return (error);
4779}
4780
2d21ac55 4781/* Helper routines for getsockopt */
1c79356b 4782int
2d21ac55 4783sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
1c79356b
A
4784{
4785 int error;
4786 size_t valsize;
4787
4788 error = 0;
4789
4790 /*
4791 * Documented get behavior is that we always return a value,
4792 * possibly truncated to fit in the user's buffer.
4793 * Traditional behavior is that we always tell the user
4794 * precisely how much we copied, rather than something useful
4795 * like the total amount we had available for her.
4796 * Note that this interface is not idempotent; the entire answer must
4797 * generated ahead of time.
4798 */
4799 valsize = min(len, sopt->sopt_valsize);
4800 sopt->sopt_valsize = valsize;
91447636 4801 if (sopt->sopt_val != USER_ADDR_NULL) {
b0d623f7 4802 if (sopt->sopt_p != kernproc)
1c79356b
A
4803 error = copyout(buf, sopt->sopt_val, valsize);
4804 else
91447636 4805 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
1c79356b 4806 }
2d21ac55
A
4807 return (error);
4808}
4809
4810static int
39236c6e 4811sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
2d21ac55
A
4812{
4813 int error;
4814 size_t len;
b0d623f7
A
4815 struct user64_timeval tv64;
4816 struct user32_timeval tv32;
2d21ac55
A
4817 const void * val;
4818 size_t valsize;
b0d623f7 4819
2d21ac55
A
4820 error = 0;
4821 if (proc_is64bit(sopt->sopt_p)) {
39236c6e 4822 len = sizeof (tv64);
2d21ac55
A
4823 tv64.tv_sec = tv_p->tv_sec;
4824 tv64.tv_usec = tv_p->tv_usec;
4825 val = &tv64;
4826 } else {
39236c6e 4827 len = sizeof (tv32);
b0d623f7
A
4828 tv32.tv_sec = tv_p->tv_sec;
4829 tv32.tv_usec = tv_p->tv_usec;
4830 val = &tv32;
2d21ac55
A
4831 }
4832 valsize = min(len, sopt->sopt_valsize);
4833 sopt->sopt_valsize = valsize;
4834 if (sopt->sopt_val != USER_ADDR_NULL) {
b0d623f7 4835 if (sopt->sopt_p != kernproc)
2d21ac55
A
4836 error = copyout(val, sopt->sopt_val, valsize);
4837 else
4838 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
4839 }
4840 return (error);
1c79356b
A
4841}
4842
2d21ac55
A
4843/*
4844 * Return: 0 Success
4845 * ENOPROTOOPT
4846 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4847 * <pr_ctloutput>:???
4848 * <sf_getoption>:???
4849 */
1c79356b 4850int
39236c6e 4851sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
1c79356b
A
4852{
4853 int error, optval;
4854 struct linger l;
4855 struct timeval tv;
2d21ac55
A
4856#if CONFIG_MACF_SOCKET
4857 struct mac extmac;
4858#endif /* MAC_SOCKET */
1c79356b 4859
39236c6e 4860 if (sopt->sopt_dir != SOPT_GET)
2d21ac55 4861 sopt->sopt_dir = SOPT_GET;
9bccf70c 4862
39236c6e
A
4863 if (dolock)
4864 socket_lock(so, 1);
2d21ac55 4865
6d2010ae 4866 error = sflt_getsockopt(so, sopt);
39236c6e 4867 if (error != 0) {
6d2010ae
A
4868 if (error == EJUSTRETURN)
4869 error = 0;
39236c6e 4870 goto out;
1c79356b 4871 }
39236c6e 4872
1c79356b 4873 if (sopt->sopt_level != SOL_SOCKET) {
39236c6e
A
4874 if (so->so_proto != NULL &&
4875 so->so_proto->pr_ctloutput != NULL) {
2d21ac55 4876 error = (*so->so_proto->pr_ctloutput)(so, sopt);
39236c6e 4877 goto out;
91447636 4878 }
39236c6e 4879 error = ENOPROTOOPT;
1c79356b 4880 } else {
39236c6e
A
4881 /*
4882 * Allow socket-level (SOL_SOCKET) options to be filtered by
4883 * the protocol layer, if needed. A zero value returned from
4884 * the handler means use default socket-level processing as
4885 * done by the rest of this routine. Otherwise, any other
4886 * return value indicates that the option is unsupported.
4887 */
4888 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4889 pru_socheckopt(so, sopt)) != 0)
4890 goto out;
4891
4892 error = 0;
1c79356b
A
4893 switch (sopt->sopt_name) {
4894 case SO_LINGER:
91447636 4895 case SO_LINGER_SEC:
39236c6e 4896 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
2d21ac55
A
4897 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
4898 so->so_linger : so->so_linger / hz;
4899 error = sooptcopyout(sopt, &l, sizeof (l));
1c79356b
A
4900 break;
4901
4902 case SO_USELOOPBACK:
4903 case SO_DONTROUTE:
4904 case SO_DEBUG:
4905 case SO_KEEPALIVE:
4906 case SO_REUSEADDR:
4907 case SO_REUSEPORT:
4908 case SO_BROADCAST:
4909 case SO_OOBINLINE:
4910 case SO_TIMESTAMP:
6d2010ae 4911 case SO_TIMESTAMP_MONOTONIC:
1c79356b
A
4912 case SO_DONTTRUNC:
4913 case SO_WANTMORE:
9bccf70c 4914 case SO_WANTOOBFLAG:
fe8ab488 4915 case SO_NOWAKEFROMSLEEP:
1c79356b
A
4916 optval = so->so_options & sopt->sopt_name;
4917integer:
2d21ac55 4918 error = sooptcopyout(sopt, &optval, sizeof (optval));
1c79356b
A
4919 break;
4920
4921 case SO_TYPE:
4922 optval = so->so_type;
4923 goto integer;
4924
4925 case SO_NREAD:
2d21ac55
A
4926 if (so->so_proto->pr_flags & PR_ATOMIC) {
4927 int pkt_total;
4928 struct mbuf *m1;
1c79356b 4929
2d21ac55
A
4930 pkt_total = 0;
4931 m1 = so->so_rcv.sb_mb;
39236c6e
A
4932 while (m1 != NULL) {
4933 if (m1->m_type == MT_DATA ||
4934 m1->m_type == MT_HEADER ||
4935 m1->m_type == MT_OOBDATA)
1c79356b 4936 pkt_total += m1->m_len;
1c79356b
A
4937 m1 = m1->m_next;
4938 }
4939 optval = pkt_total;
2d21ac55
A
4940 } else {
4941 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
4942 }
1c79356b 4943 goto integer;
39236c6e 4944
fe8ab488
A
4945 case SO_NUMRCVPKT:
4946 if (so->so_proto->pr_flags & PR_ATOMIC) {
4947 int cnt = 0;
4948 struct mbuf *m1;
4949
4950 m1 = so->so_rcv.sb_mb;
4951 while (m1 != NULL) {
4952 if (m1->m_type == MT_DATA ||
4953 m1->m_type == MT_HEADER ||
4954 m1->m_type == MT_OOBDATA)
4955 cnt += 1;
4956 m1 = m1->m_nextpkt;
4957 }
4958 optval = cnt;
4959 goto integer;
4960 } else {
4961 error = EINVAL;
4962 break;
4963 }
4964
91447636
A
4965 case SO_NWRITE:
4966 optval = so->so_snd.sb_cc;
2d21ac55 4967 goto integer;
39236c6e 4968
1c79356b
A
4969 case SO_ERROR:
4970 optval = so->so_error;
4971 so->so_error = 0;
4972 goto integer;
4973
fe8ab488
A
4974 case SO_SNDBUF: {
4975 u_int32_t hiwat = so->so_snd.sb_hiwat;
1c79356b 4976
fe8ab488
A
4977 if (so->so_snd.sb_flags & SB_UNIX) {
4978 struct unpcb *unp =
4979 (struct unpcb *)(so->so_pcb);
4980 if (unp != NULL && unp->unp_conn != NULL) {
4981 hiwat += unp->unp_conn->unp_cc;
4982 }
4983 }
4984
4985 optval = hiwat;
4986 goto integer;
4987 }
1c79356b
A
4988 case SO_RCVBUF:
4989 optval = so->so_rcv.sb_hiwat;
4990 goto integer;
4991
4992 case SO_SNDLOWAT:
4993 optval = so->so_snd.sb_lowat;
4994 goto integer;
4995
4996 case SO_RCVLOWAT:
4997 optval = so->so_rcv.sb_lowat;
4998 goto integer;
4999
5000 case SO_SNDTIMEO:
5001 case SO_RCVTIMEO:
91447636 5002 tv = (sopt->sopt_name == SO_SNDTIMEO ?
2d21ac55 5003 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1c79356b 5004
2d21ac55
A
5005 error = sooptcopyout_timeval(sopt, &tv);
5006 break;
1c79356b 5007
91447636
A
5008 case SO_NOSIGPIPE:
5009 optval = (so->so_flags & SOF_NOSIGPIPE);
5010 goto integer;
9bccf70c 5011
55e303ae 5012 case SO_NOADDRERR:
91447636
A
5013 optval = (so->so_flags & SOF_NOADDRAVAIL);
5014 goto integer;
55e303ae 5015
2d21ac55
A
5016 case SO_REUSESHAREUID:
5017 optval = (so->so_flags & SOF_REUSESHAREUID);
5018 goto integer;
5019
39236c6e 5020
2d21ac55
A
5021 case SO_NOTIFYCONFLICT:
5022 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5023 goto integer;
39236c6e 5024
2d21ac55 5025 case SO_RESTRICTIONS:
39236c6e 5026 optval = so_get_restrictions(so);
2d21ac55
A
5027 goto integer;
5028
fe8ab488
A
5029 case SO_AWDL_UNRESTRICTED:
5030 if (SOCK_DOM(so) == PF_INET ||
5031 SOCK_DOM(so) == PF_INET6) {
5032 optval = inp_get_awdl_unrestricted(
5033 sotoinpcb(so));
5034 goto integer;
5035 } else
5036 error = EOPNOTSUPP;
5037 break;
5038
2d21ac55
A
5039 case SO_LABEL:
5040#if CONFIG_MACF_SOCKET
5041 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5042 sizeof (extmac))) != 0 ||
5043 (error = mac_socket_label_get(proc_ucred(
5044 sopt->sopt_p), so, &extmac)) != 0)
5045 break;
5046
5047 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5048#else
5049 error = EOPNOTSUPP;
5050#endif /* MAC_SOCKET */
5051 break;
5052
5053 case SO_PEERLABEL:
5054#if CONFIG_MACF_SOCKET
5055 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5056 sizeof (extmac))) != 0 ||
5057 (error = mac_socketpeer_label_get(proc_ucred(
5058 sopt->sopt_p), so, &extmac)) != 0)
5059 break;
5060
5061 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5062#else
5063 error = EOPNOTSUPP;
5064#endif /* MAC_SOCKET */
5065 break;
5066
4a3eedf9
A
5067#ifdef __APPLE_API_PRIVATE
5068 case SO_UPCALLCLOSEWAIT:
5069 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5070 goto integer;
5071#endif
b0d623f7
A
5072 case SO_RANDOMPORT:
5073 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5074 goto integer;
5075
5076 case SO_NP_EXTENSIONS: {
5077 struct so_np_extensions sonpx;
5078
39236c6e
A
5079 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5080 SONPX_SETOPTSHUT : 0;
b0d623f7 5081 sonpx.npx_mask = SONPX_MASK_VALID;
4a3eedf9 5082
39236c6e
A
5083 error = sooptcopyout(sopt, &sonpx,
5084 sizeof (struct so_np_extensions));
5085 break;
b0d623f7 5086 }
6d2010ae 5087
d41d1dae
A
5088 case SO_TRAFFIC_CLASS:
5089 optval = so->so_traffic_class;
5090 goto integer;
316670eb 5091
6d2010ae
A
5092 case SO_RECV_TRAFFIC_CLASS:
5093 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5094 goto integer;
5095
5096 case SO_TRAFFIC_CLASS_STATS:
39236c6e
A
5097 error = sooptcopyout(sopt, &so->so_tc_stats,
5098 sizeof (so->so_tc_stats));
316670eb 5099 break;
6d2010ae 5100
39236c6e 5101 case SO_TRAFFIC_CLASS_DBG:
6d2010ae
A
5102 error = sogetopt_tcdbg(so, sopt);
5103 break;
316670eb
A
5104
5105 case SO_PRIVILEGED_TRAFFIC_CLASS:
5106 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5107 goto integer;
5108
6d2010ae
A
5109 case SO_DEFUNCTOK:
5110 optval = !(so->so_flags & SOF_NODEFUNCT);
5111 goto integer;
5112
5113 case SO_ISDEFUNCT:
5114 optval = (so->so_flags & SOF_DEFUNCT);
5115 goto integer;
d41d1dae 5116
316670eb
A
5117 case SO_OPPORTUNISTIC:
5118 optval = so_get_opportunistic(so);
5119 goto integer;
5120
5121 case SO_FLUSH:
5122 /* This option is not gettable */
5123 error = EINVAL;
5124 break;
5125
5126 case SO_RECV_ANYIF:
5127 optval = so_get_recv_anyif(so);
5128 goto integer;
5129
39236c6e
A
5130 case SO_TRAFFIC_MGT_BACKGROUND:
5131 /* This option is handled by lower layer(s) */
5132 if (so->so_proto != NULL &&
5133 so->so_proto->pr_ctloutput != NULL) {
5134 (void) so->so_proto->pr_ctloutput(so, sopt);
5135 }
5136 break;
5137
5138#if FLOW_DIVERT
5139 case SO_FLOW_DIVERT_TOKEN:
5140 error = flow_divert_token_get(so, sopt);
5141 break;
5142#endif /* FLOW_DIVERT */
fe8ab488
A
5143
5144#if NECP
5145 case SO_NECP_ATTRIBUTES:
5146 error = necp_get_socket_attributes(so, sopt);
5147 break;
5148#endif /* NECP */
5149
5150#if CONTENT_FILTER
5151 case SO_CFIL_SOCK_ID: {
5152 cfil_sock_id_t sock_id;
5153
5154 sock_id = cfil_sock_id_from_socket(so);
5155
5156 error = sooptcopyout(sopt, &sock_id,
5157 sizeof(cfil_sock_id_t));
5158 break;
5159 }
5160#endif /* CONTENT_FILTER */
5161
5162#if MPTCP
5163 case SO_MPTCP_FASTJOIN:
5164 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5165 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5166 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5167 error = ENOPROTOOPT;
5168 break;
5169 }
5170 optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5171 break;
5172#endif /* MPTCP */
39236c6e 5173
1c79356b
A
5174 default:
5175 error = ENOPROTOOPT;
5176 break;
5177 }
1c79356b 5178 }
39236c6e
A
5179out:
5180 if (dolock)
5181 socket_unlock(so, 1);
5182 return (error);
1c79356b 5183}
39236c6e
A
5184
5185/*
5186 * The size limits on our soopt_getm is different from that on FreeBSD.
6d2010ae
A
5187 * We limit the size of options to MCLBYTES. This will have to change
5188 * if we need to define options that need more space than MCLBYTES.
5189 */
1c79356b 5190int
9bccf70c 5191soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1c79356b
A
5192{
5193 struct mbuf *m, *m_prev;
5194 int sopt_size = sopt->sopt_valsize;
b0d623f7 5195 int how;
1c79356b 5196
6d2010ae 5197 if (sopt_size <= 0 || sopt_size > MCLBYTES)
2d21ac55 5198 return (EMSGSIZE);
a3d08fcd 5199
b0d623f7
A
5200 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5201 MGET(m, how, MT_DATA);
39236c6e 5202 if (m == NULL)
2d21ac55 5203 return (ENOBUFS);
1c79356b 5204 if (sopt_size > MLEN) {
b0d623f7 5205 MCLGET(m, how);
1c79356b
A
5206 if ((m->m_flags & M_EXT) == 0) {
5207 m_free(m);
2d21ac55 5208 return (ENOBUFS);
1c79356b
A
5209 }
5210 m->m_len = min(MCLBYTES, sopt_size);
5211 } else {
5212 m->m_len = min(MLEN, sopt_size);
5213 }
5214 sopt_size -= m->m_len;
5215 *mp = m;
5216 m_prev = m;
5217
6d2010ae 5218 while (sopt_size > 0) {
b0d623f7 5219 MGET(m, how, MT_DATA);
39236c6e 5220 if (m == NULL) {
1c79356b 5221 m_freem(*mp);
2d21ac55 5222 return (ENOBUFS);
1c79356b
A
5223 }
5224 if (sopt_size > MLEN) {
b0d623f7 5225 MCLGET(m, how);
1c79356b
A
5226 if ((m->m_flags & M_EXT) == 0) {
5227 m_freem(*mp);
6d2010ae 5228 m_freem(m);
2d21ac55 5229 return (ENOBUFS);
1c79356b
A
5230 }
5231 m->m_len = min(MCLBYTES, sopt_size);
5232 } else {
5233 m->m_len = min(MLEN, sopt_size);
5234 }
5235 sopt_size -= m->m_len;
5236 m_prev->m_next = m;
5237 m_prev = m;
5238 }
2d21ac55 5239 return (0);
1c79356b
A
5240}
5241
6d2010ae 5242/* copyin sopt data into mbuf chain */
1c79356b 5243int
9bccf70c 5244soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
1c79356b
A
5245{
5246 struct mbuf *m0 = m;
5247
91447636 5248 if (sopt->sopt_val == USER_ADDR_NULL)
2d21ac55 5249 return (0);
1c79356b 5250 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
b0d623f7 5251 if (sopt->sopt_p != kernproc) {
1c79356b
A
5252 int error;
5253
2d21ac55
A
5254 error = copyin(sopt->sopt_val, mtod(m, char *),
5255 m->m_len);
1c79356b
A
5256 if (error != 0) {
5257 m_freem(m0);
2d21ac55 5258 return (error);
1c79356b 5259 }
2d21ac55
A
5260 } else {
5261 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5262 mtod(m, char *), m->m_len);
5263 }
1c79356b 5264 sopt->sopt_valsize -= m->m_len;
2d21ac55 5265 sopt->sopt_val += m->m_len;
1c79356b
A
5266 m = m->m_next;
5267 }
39236c6e
A
5268 /* should be allocated enoughly at ip6_sooptmcopyin() */
5269 if (m != NULL) {
9bccf70c 5270 panic("soopt_mcopyin");
39236c6e
A
5271 /* NOTREACHED */
5272 }
2d21ac55 5273 return (0);
1c79356b
A
5274}
5275
6d2010ae 5276/* copyout mbuf chain data into soopt */
1c79356b 5277int
9bccf70c 5278soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
1c79356b
A
5279{
5280 struct mbuf *m0 = m;
5281 size_t valsize = 0;
5282
91447636 5283 if (sopt->sopt_val == USER_ADDR_NULL)
2d21ac55 5284 return (0);
1c79356b 5285 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
b0d623f7 5286 if (sopt->sopt_p != kernproc) {
1c79356b
A
5287 int error;
5288
2d21ac55
A
5289 error = copyout(mtod(m, char *), sopt->sopt_val,
5290 m->m_len);
1c79356b
A
5291 if (error != 0) {
5292 m_freem(m0);
2d21ac55 5293 return (error);
1c79356b 5294 }
2d21ac55
A
5295 } else {
5296 bcopy(mtod(m, char *),
5297 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5298 }
5299 sopt->sopt_valsize -= m->m_len;
5300 sopt->sopt_val += m->m_len;
5301 valsize += m->m_len;
5302 m = m->m_next;
1c79356b
A
5303 }
5304 if (m != NULL) {
5305 /* enough soopt buffer should be given from user-land */
5306 m_freem(m0);
2d21ac55 5307 return (EINVAL);
1c79356b
A
5308 }
5309 sopt->sopt_valsize = valsize;
2d21ac55 5310 return (0);
1c79356b
A
5311}
5312
9bccf70c 5313void
2d21ac55 5314sohasoutofband(struct socket *so)
9bccf70c 5315{
9bccf70c
A
5316 if (so->so_pgid < 0)
5317 gsignal(-so->so_pgid, SIGURG);
2d21ac55
A
5318 else if (so->so_pgid > 0)
5319 proc_signal(so->so_pgid, SIGURG);
9bccf70c
A
5320 selwakeup(&so->so_rcv.sb_sel);
5321}
5322
5323int
39236c6e 5324sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
9bccf70c 5325{
39236c6e 5326#pragma unused(cred)
9bccf70c
A
5327 struct proc *p = current_proc();
5328 int revents = 0;
91447636
A
5329
5330 socket_lock(so, 1);
39236c6e
A
5331 so_update_last_owner_locked(so, PROC_NULL);
5332 so_update_policy(so);
9bccf70c
A
5333
5334 if (events & (POLLIN | POLLRDNORM))
5335 if (soreadable(so))
5336 revents |= events & (POLLIN | POLLRDNORM);
5337
5338 if (events & (POLLOUT | POLLWRNORM))
5339 if (sowriteable(so))
5340 revents |= events & (POLLOUT | POLLWRNORM);
5341
5342 if (events & (POLLPRI | POLLRDBAND))
5343 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5344 revents |= events & (POLLPRI | POLLRDBAND);
5345
5346 if (revents == 0) {
5347 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2d21ac55
A
5348 /*
5349 * Darwin sets the flag first,
5350 * BSD calls selrecord first
5351 */
9bccf70c
A
5352 so->so_rcv.sb_flags |= SB_SEL;
5353 selrecord(p, &so->so_rcv.sb_sel, wql);
5354 }
5355
5356 if (events & (POLLOUT | POLLWRNORM)) {
2d21ac55
A
5357 /*
5358 * Darwin sets the flag first,
5359 * BSD calls selrecord first
5360 */
9bccf70c
A
5361 so->so_snd.sb_flags |= SB_SEL;
5362 selrecord(p, &so->so_snd.sb_sel, wql);
5363 }
5364 }
5365
91447636 5366 socket_unlock(so, 1);
9bccf70c
A
5367 return (revents);
5368}
55e303ae 5369
55e303ae 5370int
39236c6e 5371soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
55e303ae 5372{
39236c6e
A
5373#pragma unused(fp)
5374#if !CONFIG_MACF_SOCKET
5375#pragma unused(ctx)
5376#endif /* MAC_SOCKET */
91447636 5377 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
316670eb 5378 struct klist *skl;
2d21ac55 5379
91447636 5380 socket_lock(so, 1);
39236c6e
A
5381 so_update_last_owner_locked(so, PROC_NULL);
5382 so_update_policy(so);
55e303ae 5383
2d21ac55 5384#if CONFIG_MACF_SOCKET
39236c6e
A
5385 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5386 kn, so) != 0) {
2d21ac55
A
5387 socket_unlock(so, 1);
5388 return (1);
5389 }
5390#endif /* MAC_SOCKET */
5391
55e303ae
A
5392 switch (kn->kn_filter) {
5393 case EVFILT_READ:
b0d623f7 5394 kn->kn_fop = &soread_filtops;
04b8595b
A
5395 /*
5396 * If the caller explicitly asked for OOB results (e.g. poll()),
5397 * save that off in the hookid field and reserve the kn_flags
5398 * EV_OOBAND bit for output only).
5399 */
5400 if (kn->kn_flags & EV_OOBAND) {
5401 kn->kn_flags &= ~EV_OOBAND;
5402 kn->kn_hookid = EV_OOBAND;
5403 } else {
5404 kn->kn_hookid = 0;
5405 }
316670eb 5406 skl = &so->so_rcv.sb_sel.si_note;
55e303ae
A
5407 break;
5408 case EVFILT_WRITE:
5409 kn->kn_fop = &sowrite_filtops;
316670eb
A
5410 skl = &so->so_snd.sb_sel.si_note;
5411 break;
5412 case EVFILT_SOCK:
5413 kn->kn_fop = &sock_filtops;
5414 skl = &so->so_klist;
55e303ae
A
5415 break;
5416 default:
91447636 5417 socket_unlock(so, 1);
55e303ae
A
5418 return (1);
5419 }
5420
316670eb 5421 if (KNOTE_ATTACH(skl, kn)) {
39236c6e 5422 switch (kn->kn_filter) {
316670eb
A
5423 case EVFILT_READ:
5424 so->so_rcv.sb_flags |= SB_KNOTE;
5425 break;
5426 case EVFILT_WRITE:
5427 so->so_snd.sb_flags |= SB_KNOTE;
5428 break;
5429 case EVFILT_SOCK:
5430 so->so_flags |= SOF_KNOTE;
5431 break;
5432 default:
5433 socket_unlock(so, 1);
5434 return (1);
5435 }
5436 }
91447636 5437 socket_unlock(so, 1);
55e303ae
A
5438 return (0);
5439}
5440
5441static void
5442filt_sordetach(struct knote *kn)
5443{
91447636 5444 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
55e303ae 5445
91447636
A
5446 socket_lock(so, 1);
5447 if (so->so_rcv.sb_flags & SB_KNOTE)
55e303ae
A
5448 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
5449 so->so_rcv.sb_flags &= ~SB_KNOTE;
91447636 5450 socket_unlock(so, 1);
55e303ae
A
5451}
5452
5453/*ARGSUSED*/
5454static int
5455filt_soread(struct knote *kn, long hint)
5456{
91447636 5457 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
55e303ae 5458
91447636
A
5459 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5460 socket_lock(so, 1);
5461
b0d623f7
A
5462 if (so->so_options & SO_ACCEPTCONN) {
5463 int isempty;
5464
39236c6e
A
5465 /*
5466 * Radar 6615193 handle the listen case dynamically
5467 * for kqueue read filter. This allows to call listen()
5468 * after registering the kqueue EVFILT_READ.
b0d623f7
A
5469 */
5470
5471 kn->kn_data = so->so_qlen;
5472 isempty = ! TAILQ_EMPTY(&so->so_comp);
5473
5474 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5475 socket_unlock(so, 1);
5476
5477 return (isempty);
5478 }
5479
5480 /* socket isn't a listener */
2d21ac55 5481 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
04b8595b
A
5482 /*
5483 * Clear out EV_OOBAND that filt_soread may have set in the
5484 * past.
5485 */
5486 kn->kn_flags &= ~EV_OOBAND;
2d21ac55 5487
04b8595b
A
5488 if ((so->so_oobmark) || (so->so_state & SS_RCVATMARK)){
5489 kn->kn_flags |= EV_OOBAND;
5490 /*
5491 * If caller registered explicit interest in OOB data,
5492 * return immediately (data == amount beyond mark, for
5493 * legacy reasons - that should be changed later).
5494 */
5495 if (kn->kn_hookid == EV_OOBAND) {
5496 /*
5497 * When so_state is SS_RCVATMARK, so_oobmark
5498 * is 0.
5499 */
2d21ac55 5500 kn->kn_data -= so->so_oobmark;
91447636
A
5501 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5502 socket_unlock(so, 1);
5503 return (1);
5504 }
04b8595b
A
5505 }
5506
5507 if ((so->so_state & SS_CANTRCVMORE)
fe8ab488 5508#if CONTENT_FILTER
04b8595b 5509 && cfil_sock_data_pending(&so->so_rcv) == 0
fe8ab488 5510#endif /* CONTENT_FILTER */
04b8595b
A
5511 ) {
5512 kn->kn_flags |= EV_EOF;
5513 kn->kn_fflags = so->so_error;
91447636
A
5514 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5515 socket_unlock(so, 1);
04b8595b 5516 return (1);
91447636
A
5517 }
5518
5519 if (so->so_error) { /* temporary udp error */
5520 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5521 socket_unlock(so, 1);
55e303ae 5522 return (1);
91447636
A
5523 }
5524
6d2010ae 5525 int64_t lowwat = so->so_rcv.sb_lowat;
39236c6e 5526 if (kn->kn_sfflags & NOTE_LOWAT) {
6d2010ae
A
5527 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
5528 lowwat = so->so_rcv.sb_hiwat;
5529 else if (kn->kn_sdata > lowwat)
5530 lowwat = kn->kn_sdata;
5531 }
39236c6e 5532
91447636
A
5533 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5534 socket_unlock(so, 1);
39236c6e 5535
04b8595b 5536 return (kn->kn_data >= lowwat);
55e303ae
A
5537}
5538
5539static void
5540filt_sowdetach(struct knote *kn)
5541{
91447636
A
5542 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5543 socket_lock(so, 1);
55e303ae 5544
2d21ac55 5545 if (so->so_snd.sb_flags & SB_KNOTE)
55e303ae
A
5546 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
5547 so->so_snd.sb_flags &= ~SB_KNOTE;
91447636 5548 socket_unlock(so, 1);
55e303ae
A
5549}
5550
316670eb
A
5551int
5552so_wait_for_if_feedback(struct socket *so)
5553{
39236c6e 5554 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
316670eb
A
5555 (so->so_state & SS_ISCONNECTED)) {
5556 struct inpcb *inp = sotoinpcb(so);
5557 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
5558 return (1);
5559 }
5560 return (0);
5561}
5562
55e303ae
A
5563/*ARGSUSED*/
5564static int
5565filt_sowrite(struct knote *kn, long hint)
5566{
91447636 5567 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
316670eb 5568 int ret = 0;
91447636
A
5569
5570 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5571 socket_lock(so, 1);
55e303ae
A
5572
5573 kn->kn_data = sbspace(&so->so_snd);
5574 if (so->so_state & SS_CANTSENDMORE) {
2d21ac55 5575 kn->kn_flags |= EV_EOF;
55e303ae 5576 kn->kn_fflags = so->so_error;
316670eb
A
5577 ret = 1;
5578 goto out;
55e303ae 5579 }
91447636 5580 if (so->so_error) { /* temporary udp error */
316670eb
A
5581 ret = 1;
5582 goto out;
91447636 5583 }
55e303ae 5584 if (((so->so_state & SS_ISCONNECTED) == 0) &&
91447636 5585 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
316670eb
A
5586 ret = 0;
5587 goto out;
91447636 5588 }
6d2010ae 5589 int64_t lowwat = so->so_snd.sb_lowat;
39236c6e 5590 if (kn->kn_sfflags & NOTE_LOWAT) {
6d2010ae
A
5591 if (kn->kn_sdata > so->so_snd.sb_hiwat)
5592 lowwat = so->so_snd.sb_hiwat;
5593 else if (kn->kn_sdata > lowwat)
5594 lowwat = kn->kn_sdata;
5595 }
316670eb 5596 if (kn->kn_data >= lowwat) {
fe8ab488
A
5597 if (so->so_flags & SOF_NOTSENT_LOWAT) {
5598 if ((SOCK_DOM(so) == PF_INET
5599 || SOCK_DOM(so) == PF_INET6)
5600 && so->so_type == SOCK_STREAM) {
5601 ret = tcp_notsent_lowat_check(so);
5602 }
5603#if MPTCP
5604 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
5605 (SOCK_PROTO(so) == IPPROTO_TCP)) {
5606 ret = mptcp_notsent_lowat_check(so);
5607 }
5608#endif
5609 else {
5610 return (1);
5611 }
316670eb
A
5612 } else {
5613 ret = 1;
5614 }
5615 }
5616 if (so_wait_for_if_feedback(so))
5617 ret = 0;
5618out:
91447636
A
5619 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5620 socket_unlock(so, 1);
39236c6e 5621 return (ret);
316670eb
A
5622}
5623
5624static void
5625filt_sockdetach(struct knote *kn)
5626{
5627 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5628 socket_lock(so, 1);
39236c6e 5629
316670eb
A
5630 if ((so->so_flags & SOF_KNOTE) != 0)
5631 if (KNOTE_DETACH(&so->so_klist, kn))
5632 so->so_flags &= ~SOF_KNOTE;
5633 socket_unlock(so, 1);
5634}
5635
5636static int
5637filt_sockev(struct knote *kn, long hint)
5638{
5639 int ret = 0, locked = 0;
5640 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
39236c6e 5641 long ev_hint = (hint & SO_FILT_HINT_EV);
316670eb
A
5642
5643 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
5644 socket_lock(so, 1);
5645 locked = 1;
5646 }
5647
39236c6e 5648 if (ev_hint & SO_FILT_HINT_CONNRESET) {
316670eb
A
5649 if (kn->kn_sfflags & NOTE_CONNRESET)
5650 kn->kn_fflags |= NOTE_CONNRESET;
39236c6e
A
5651 }
5652 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
316670eb
A
5653 if (kn->kn_sfflags & NOTE_TIMEOUT)
5654 kn->kn_fflags |= NOTE_TIMEOUT;
39236c6e
A
5655 }
5656 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
316670eb
A
5657 if (kn->kn_sfflags & NOTE_NOSRCADDR)
5658 kn->kn_fflags |= NOTE_NOSRCADDR;
39236c6e
A
5659 }
5660 if (ev_hint & SO_FILT_HINT_IFDENIED) {
316670eb
A
5661 if ((kn->kn_sfflags & NOTE_IFDENIED))
5662 kn->kn_fflags |= NOTE_IFDENIED;
39236c6e
A
5663 }
5664 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
316670eb
A
5665 if (kn->kn_sfflags & NOTE_KEEPALIVE)
5666 kn->kn_fflags |= NOTE_KEEPALIVE;
5667 }
39236c6e
A
5668 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
5669 if (kn->kn_sfflags & NOTE_ADAPTIVE_WTIMO)
5670 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
5671 }
5672 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
5673 if (kn->kn_sfflags & NOTE_ADAPTIVE_RTIMO)
5674 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
5675 }
5676 if (ev_hint & SO_FILT_HINT_CONNECTED) {
5677 if (kn->kn_sfflags & NOTE_CONNECTED)
5678 kn->kn_fflags |= NOTE_CONNECTED;
5679 }
5680 if (ev_hint & SO_FILT_HINT_DISCONNECTED) {
5681 if (kn->kn_sfflags & NOTE_DISCONNECTED)
5682 kn->kn_fflags |= NOTE_DISCONNECTED;
5683 }
5684 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
5685 if (so->so_proto != NULL &&
5686 (so->so_proto->pr_flags & PR_EVCONNINFO) &&
5687 (kn->kn_sfflags & NOTE_CONNINFO_UPDATED))
5688 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
5689 }
316670eb
A
5690
5691 if ((kn->kn_sfflags & NOTE_READCLOSED) &&
fe8ab488
A
5692 (so->so_state & SS_CANTRCVMORE)
5693#if CONTENT_FILTER
5694 && cfil_sock_data_pending(&so->so_rcv) == 0
5695#endif /* CONTENT_FILTER */
5696 )
316670eb
A
5697 kn->kn_fflags |= NOTE_READCLOSED;
5698
5699 if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
39236c6e 5700 (so->so_state & SS_CANTSENDMORE))
316670eb
A
5701 kn->kn_fflags |= NOTE_WRITECLOSED;
5702
5703 if ((kn->kn_sfflags & NOTE_SUSPEND) &&
39236c6e 5704 ((ev_hint & SO_FILT_HINT_SUSPEND) ||
316670eb 5705 (so->so_flags & SOF_SUSPENDED))) {
39236c6e 5706 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
316670eb
A
5707 kn->kn_fflags |= NOTE_SUSPEND;
5708 }
5709
5710 if ((kn->kn_sfflags & NOTE_RESUME) &&
39236c6e 5711 ((ev_hint & SO_FILT_HINT_RESUME) ||
316670eb 5712 (so->so_flags & SOF_SUSPENDED) == 0)) {
39236c6e 5713 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
316670eb
A
5714 kn->kn_fflags |= NOTE_RESUME;
5715 }
5716
5717 if (so->so_error != 0) {
5718 ret = 1;
5719 kn->kn_data = so->so_error;
5720 kn->kn_flags |= EV_EOF;
5721 } else {
5722 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
5723 }
5724
5725 if (kn->kn_fflags != 0)
5726 ret = 1;
5727
5728 if (locked)
5729 socket_unlock(so, 1);
5730
39236c6e 5731 return (ret);
316670eb
A
5732}
5733
5734void
39236c6e
A
5735get_sockev_state(struct socket *so, u_int32_t *statep)
5736{
316670eb
A
5737 u_int32_t state = *(statep);
5738
39236c6e 5739 if (so->so_state & SS_ISCONNECTED)
316670eb 5740 state |= SOCKEV_CONNECTED;
39236c6e 5741 else
316670eb 5742 state &= ~(SOCKEV_CONNECTED);
39236c6e 5743 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
316670eb 5744 *(statep) = state;
55e303ae
A
5745}
5746
39236c6e
A
5747#define SO_LOCK_HISTORY_STR_LEN \
5748 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
b0d623f7 5749
39236c6e
A
5750__private_extern__ const char *
5751solockhistory_nr(struct socket *so)
55e303ae 5752{
39236c6e
A
5753 size_t n = 0;
5754 int i;
5755 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
5756
5757 bzero(lock_history_str, sizeof (lock_history_str));
5758 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
5759 n += snprintf(lock_history_str + n,
5760 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
5761 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
5762 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
b0d623f7 5763 }
39236c6e 5764 return (lock_history_str);
55e303ae
A
5765}
5766
91447636 5767int
2d21ac55 5768socket_lock(struct socket *so, int refcount)
91447636 5769{
b0d623f7
A
5770 int error = 0;
5771 void *lr_saved;
0c530ab8 5772
b0d623f7 5773 lr_saved = __builtin_return_address(0);
91447636
A
5774
5775 if (so->so_proto->pr_lock) {
5776 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
2d21ac55 5777 } else {
91447636 5778#ifdef MORE_LOCKING_DEBUG
2d21ac55
A
5779 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
5780 LCK_MTX_ASSERT_NOTOWNED);
91447636
A
5781#endif
5782 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
5783 if (refcount)
5784 so->so_usecount++;
b0d623f7 5785 so->lock_lr[so->next_lock_lr] = lr_saved;
0c530ab8 5786 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
91447636
A
5787 }
5788
2d21ac55 5789 return (error);
91447636
A
5790}
5791
5792int
2d21ac55 5793socket_unlock(struct socket *so, int refcount)
91447636 5794{
b0d623f7
A
5795 int error = 0;
5796 void *lr_saved;
2d21ac55 5797 lck_mtx_t *mutex_held;
91447636 5798
b0d623f7 5799 lr_saved = __builtin_return_address(0);
91447636 5800
39236c6e
A
5801 if (so->so_proto == NULL) {
5802 panic("%s: null so_proto so=%p\n", __func__, so);
5803 /* NOTREACHED */
5804 }
91447636 5805
2d21ac55 5806 if (so && so->so_proto->pr_unlock) {
91447636 5807 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
2d21ac55 5808 } else {
91447636
A
5809 mutex_held = so->so_proto->pr_domain->dom_mtx;
5810#ifdef MORE_LOCKING_DEBUG
5811 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
5812#endif
b0d623f7 5813 so->unlock_lr[so->next_unlock_lr] = lr_saved;
0c530ab8
A
5814 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
5815
91447636 5816 if (refcount) {
39236c6e
A
5817 if (so->so_usecount <= 0) {
5818 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
5819 "lrh=%s", __func__, so->so_usecount, so,
5820 SOCK_DOM(so), so->so_type,
5821 SOCK_PROTO(so), solockhistory_nr(so));
5822 /* NOTREACHED */
5823 }
5824
91447636 5825 so->so_usecount--;
39236c6e 5826 if (so->so_usecount == 0)
91447636 5827 sofreelastref(so, 1);
91447636
A
5828 }
5829 lck_mtx_unlock(mutex_held);
5830 }
5831
2d21ac55 5832 return (error);
91447636 5833}
2d21ac55
A
5834
5835/* Called with socket locked, will unlock socket */
91447636 5836void
2d21ac55 5837sofree(struct socket *so)
91447636 5838{
2d21ac55 5839 lck_mtx_t *mutex_held;
39236c6e 5840
2d21ac55 5841 if (so->so_proto->pr_getlock != NULL)
91447636 5842 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2d21ac55 5843 else
91447636
A
5844 mutex_held = so->so_proto->pr_domain->dom_mtx;
5845 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2d21ac55 5846
91447636
A
5847 sofreelastref(so, 0);
5848}
5849
/* Take one use-count reference on the socket. */
void
soreference(struct socket *so)
{
	/* lock and bump the use count ... */
	(void) socket_lock(so, 1);
	/* ... then unlock while keeping the reference */
	(void) socket_unlock(so, 0);
}
5856
/* Drop one use-count reference on the socket. */
void
sodereference(struct socket *so)
{
	/* lock without touching the use count ... */
	(void) socket_lock(so, 0);
	/* ... then unlock and release one reference */
	(void) socket_unlock(so, 1);
}
2d21ac55
A
5863
5864/*
5865 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
5866 * possibility of using jumbo clusters. Caller must ensure to hold
5867 * the socket lock.
5868 */
5869void
5870somultipages(struct socket *so, boolean_t set)
5871{
5872 if (set)
5873 so->so_flags |= SOF_MULTIPAGES;
5874 else
5875 so->so_flags &= ~SOF_MULTIPAGES;
5876}
b0d623f7 5877
fe8ab488
A
5878void
5879soif2kcl(struct socket *so, boolean_t set)
5880{
5881 if (set)
5882 so->so_flags1 |= SOF1_IF_2KCL;
5883 else
5884 so->so_flags1 &= ~SOF1_IF_2KCL;
5885}
5886
b0d623f7
A
5887int
5888so_isdstlocal(struct socket *so) {
5889
5890 struct inpcb *inp = (struct inpcb *)so->so_pcb;
5891
39236c6e
A
5892 if (SOCK_DOM(so) == PF_INET)
5893 return (inaddr_local(inp->inp_faddr));
5894 else if (SOCK_DOM(so) == PF_INET6)
5895 return (in6addr_local(&inp->in6p_faddr));
5896
5897 return (0);
b0d623f7 5898}
6d2010ae
A
5899
5900int
5901sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
5902{
39236c6e 5903 struct sockbuf *rcv, *snd;
6d2010ae
A
5904 int err = 0, defunct;
5905
39236c6e
A
5906 rcv = &so->so_rcv;
5907 snd = &so->so_snd;
5908
6d2010ae
A
5909 defunct = (so->so_flags & SOF_DEFUNCT);
5910 if (defunct) {
39236c6e 5911 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6d2010ae 5912 panic("%s: SB_DROP not set", __func__);
39236c6e
A
5913 /* NOTREACHED */
5914 }
6d2010ae
A
5915 goto done;
5916 }
5917
5918 if (so->so_flags & SOF_NODEFUNCT) {
5919 if (noforce) {
5920 err = EOPNOTSUPP;
39236c6e
A
5921 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
5922 "so 0x%llx [%d,%d] is not eligible for defunct "
5923 "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
5924 level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5925 SOCK_DOM(so), SOCK_TYPE(so), err));
6d2010ae
A
5926 return (err);
5927 }
5928 so->so_flags &= ~SOF_NODEFUNCT;
39236c6e
A
5929 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
5930 "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
5931 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5932 SOCK_DOM(so), SOCK_TYPE(so)));
6d2010ae
A
5933 }
5934
5935 so->so_flags |= SOF_DEFUNCT;
39236c6e 5936
6d2010ae 5937 /* Prevent further data from being appended to the socket buffers */
39236c6e
A
5938 snd->sb_flags |= SB_DROP;
5939 rcv->sb_flags |= SB_DROP;
5940
5941 /* Flush any existing data in the socket buffers */
5942 if (rcv->sb_cc != 0) {
5943 rcv->sb_flags &= ~SB_SEL;
5944 selthreadclear(&rcv->sb_sel);
5945 sbrelease(rcv);
5946 }
5947 if (snd->sb_cc != 0) {
5948 snd->sb_flags &= ~SB_SEL;
5949 selthreadclear(&snd->sb_sel);
5950 sbrelease(snd);
5951 }
6d2010ae
A
5952
5953done:
39236c6e
A
5954 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
5955 "defunct\n", __func__, proc_selfpid(), proc_pid(p), level,
5956 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
6d2010ae
A
5957 defunct ? "is already" : "marked as"));
5958
5959 return (err);
5960}
5961
5962int
5963sodefunct(struct proc *p, struct socket *so, int level)
5964{
5965 struct sockbuf *rcv, *snd;
5966
39236c6e 5967 if (!(so->so_flags & SOF_DEFUNCT)) {
6d2010ae 5968 panic("%s improperly called", __func__);
39236c6e
A
5969 /* NOTREACHED */
5970 }
6d2010ae
A
5971 if (so->so_state & SS_DEFUNCT)
5972 goto done;
5973
5974 rcv = &so->so_rcv;
5975 snd = &so->so_snd;
5976
39236c6e
A
5977 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5978 char s[MAX_IPv6_STR_LEN];
5979 char d[MAX_IPv6_STR_LEN];
5980 struct inpcb *inp = sotoinpcb(so);
5981
5982 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
5983 "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
5984 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
5985 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5986 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
5987 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
5988 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
5989 s, sizeof (s)), ntohs(inp->in6p_lport),
5990 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
5991 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
5992 d, sizeof (d)), ntohs(inp->in6p_fport),
5993 (uint32_t)rcv->sb_sel.si_flags,
5994 (uint32_t)snd->sb_sel.si_flags,
5995 rcv->sb_flags, snd->sb_flags));
5996 } else {
5997 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
5998 "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
5999 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
6000 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
6001 SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
6002 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6003 snd->sb_flags));
6004 }
6d2010ae
A
6005
6006 /*
6007 * Unwedge threads blocked on sbwait() and sb_lock().
6008 */
6009 sbwakeup(rcv);
6010 sbwakeup(snd);
6011
fe8ab488 6012 so->so_flags1 |= SOF1_DEFUNCTINPROG;
6d2010ae 6013 if (rcv->sb_flags & SB_LOCK)
39236c6e 6014 sbunlock(rcv, TRUE); /* keep socket locked */
6d2010ae 6015 if (snd->sb_flags & SB_LOCK)
39236c6e 6016 sbunlock(snd, TRUE); /* keep socket locked */
6d2010ae
A
6017
6018 /*
6019 * Flush the buffers and disconnect. We explicitly call shutdown
6020 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6021 * states are set for the socket. This would also flush out data
6022 * hanging off the receive list of this socket.
6023 */
fe8ab488
A
6024 (void) soshutdownlock_final(so, SHUT_RD);
6025 (void) soshutdownlock_final(so, SHUT_WR);
6d2010ae
A
6026 (void) sodisconnectlocked(so);
6027
6028 /*
6029 * Explicitly handle connectionless-protocol disconnection
6030 * and release any remaining data in the socket buffers.
6031 */
6032 if (!(so->so_flags & SS_ISDISCONNECTED))
6033 (void) soisdisconnected(so);
6034
6035 if (so->so_error == 0)
6036 so->so_error = EBADF;
6037
39236c6e
A
6038 if (rcv->sb_cc != 0) {
6039 rcv->sb_flags &= ~SB_SEL;
6040 selthreadclear(&rcv->sb_sel);
6d2010ae 6041 sbrelease(rcv);
39236c6e
A
6042 }
6043 if (snd->sb_cc != 0) {
6044 snd->sb_flags &= ~SB_SEL;
6045 selthreadclear(&snd->sb_sel);
6d2010ae 6046 sbrelease(snd);
39236c6e 6047 }
6d2010ae
A
6048 so->so_state |= SS_DEFUNCT;
6049
6050done:
6051 return (0);
6052}
316670eb
A
6053
6054__private_extern__ int
6055so_set_recv_anyif(struct socket *so, int optval)
6056{
6057 int ret = 0;
6058
6059#if INET6
39236c6e 6060 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
316670eb 6061#else
39236c6e 6062 if (SOCK_DOM(so) == PF_INET) {
316670eb
A
6063#endif /* !INET6 */
6064 if (optval)
6065 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
6066 else
6067 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
316670eb
A
6068 }
6069
6070 return (ret);
6071}
6072
6073__private_extern__ int
6074so_get_recv_anyif(struct socket *so)
6075{
6076 int ret = 0;
6077
6078#if INET6
39236c6e 6079 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
316670eb 6080#else
39236c6e 6081 if (SOCK_DOM(so) == PF_INET) {
316670eb
A
6082#endif /* !INET6 */
6083 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
6084 }
6085
6086 return (ret);
6087}
39236c6e
A
6088
6089int
6090so_set_restrictions(struct socket *so, uint32_t vals)
6091{
6092 int nocell_old, nocell_new;
fe8ab488 6093 int noexpensive_old, noexpensive_new;
39236c6e
A
6094
6095 /*
6096 * Deny-type restrictions are trapdoors; once set they cannot be
6097 * unset for the lifetime of the socket. This allows them to be
6098 * issued by a framework on behalf of the application without
6099 * having to worry that they can be undone.
6100 *
6101 * Note here that socket-level restrictions overrides any protocol
6102 * level restrictions. For instance, SO_RESTRICT_DENY_CELLULAR
6103 * socket restriction issued on the socket has a higher precendence
6104 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
6105 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
6106 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
6107 */
6108 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
fe8ab488 6109 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
39236c6e 6110 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
fe8ab488
A
6111 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
6112 SO_RESTRICT_DENY_EXPENSIVE));
39236c6e 6113 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
fe8ab488 6114 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
39236c6e
A
6115
6116 /* we can only set, not clear restrictions */
fe8ab488
A
6117 if ((nocell_new - nocell_old) == 0 &&
6118 (noexpensive_new - noexpensive_old) == 0)
6119 return (0);
39236c6e
A
6120#if INET6
6121 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6122#else
6123 if (SOCK_DOM(so) == PF_INET) {
6124#endif /* !INET6 */
fe8ab488
A
6125 if (nocell_new - nocell_old != 0) {
6126 /* if deny cellular is now set, do what's needed for INPCB */
6127 inp_set_nocellular(sotoinpcb(so));
6128 }
6129 if (noexpensive_new - noexpensive_old != 0) {
6130 inp_set_noexpensive(sotoinpcb(so));
6131 }
39236c6e
A
6132 }
6133
fe8ab488 6134 return (0);
39236c6e
A
6135}
6136
6137uint32_t
6138so_get_restrictions(struct socket *so)
6139{
6140 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
fe8ab488
A
6141 SO_RESTRICT_DENY_OUT |
6142 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
39236c6e
A
6143}
6144
6145struct sockaddr_entry *
6146sockaddrentry_alloc(int how)
6147{
6148 struct sockaddr_entry *se;
6149
6150 se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
6151 if (se != NULL)
6152 bzero(se, se_zone_size);
6153
6154 return (se);
6155}
6156
6157void
6158sockaddrentry_free(struct sockaddr_entry *se)
6159{
6160 if (se->se_addr != NULL) {
6161 FREE(se->se_addr, M_SONAME);
6162 se->se_addr = NULL;
6163 }
6164 zfree(se_zone, se);
6165}
6166
6167struct sockaddr_entry *
6168sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
6169{
6170 struct sockaddr_entry *dst_se;
6171
6172 dst_se = sockaddrentry_alloc(how);
6173 if (dst_se != NULL) {
6174 int len = src_se->se_addr->sa_len;
6175
6176 MALLOC(dst_se->se_addr, struct sockaddr *,
6177 len, M_SONAME, how | M_ZERO);
6178 if (dst_se->se_addr != NULL) {
6179 bcopy(src_se->se_addr, dst_se->se_addr, len);
6180 } else {
6181 sockaddrentry_free(dst_se);
6182 dst_se = NULL;
6183 }
6184 }
6185
6186 return (dst_se);
6187}
6188
6189struct sockaddr_list *
6190sockaddrlist_alloc(int how)
6191{
6192 struct sockaddr_list *sl;
6193
6194 sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
6195 if (sl != NULL) {
6196 bzero(sl, sl_zone_size);
6197 TAILQ_INIT(&sl->sl_head);
6198 }
6199 return (sl);
6200}
6201
6202void
6203sockaddrlist_free(struct sockaddr_list *sl)
6204{
6205 struct sockaddr_entry *se, *tse;
6206
6207 TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
6208 sockaddrlist_remove(sl, se);
6209 sockaddrentry_free(se);
6210 }
6211 VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
6212 zfree(sl_zone, sl);
6213}
6214
6215void
6216sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
6217{
6218 VERIFY(!(se->se_flags & SEF_ATTACHED));
6219 se->se_flags |= SEF_ATTACHED;
6220 TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
6221 sl->sl_cnt++;
6222 VERIFY(sl->sl_cnt != 0);
6223}
6224
6225void
6226sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
6227{
6228 VERIFY(se->se_flags & SEF_ATTACHED);
6229 se->se_flags &= ~SEF_ATTACHED;
6230 VERIFY(sl->sl_cnt != 0);
6231 sl->sl_cnt--;
6232 TAILQ_REMOVE(&sl->sl_head, se, se_link);
6233}
6234
6235struct sockaddr_list *
6236sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
6237{
6238 struct sockaddr_entry *src_se, *tse;
6239 struct sockaddr_list *dst_sl;
6240
6241 dst_sl = sockaddrlist_alloc(how);
6242 if (dst_sl == NULL)
6243 return (NULL);
6244
6245 TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
6246 struct sockaddr_entry *dst_se;
6247
6248 if (src_se->se_addr == NULL)
6249 continue;
6250
6251 dst_se = sockaddrentry_dup(src_se, how);
6252 if (dst_se == NULL) {
6253 sockaddrlist_free(dst_sl);
6254 return (NULL);
6255 }
6256
6257 sockaddrlist_insert(dst_sl, dst_se);
6258 }
6259 VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
6260
6261 return (dst_sl);
6262}
6263
6264int
6265so_set_effective_pid(struct socket *so, int epid, struct proc *p)
6266{
6267 struct proc *ep = PROC_NULL;
6268 int error = 0;
6269
6270 /* pid 0 is reserved for kernel */
6271 if (epid == 0) {
6272 error = EINVAL;
6273 goto done;
6274 }
6275
6276 /*
6277 * If this is an in-kernel socket, prevent its delegate
6278 * association from changing unless the socket option is
6279 * coming from within the kernel itself.
6280 */
6281 if (so->last_pid == 0 && p != kernproc) {
6282 error = EACCES;
6283 goto done;
6284 }
6285
6286 /*
6287 * If this is issued by a process that's recorded as the
6288 * real owner of the socket, or if the pid is the same as
6289 * the process's own pid, then proceed. Otherwise ensure
6290 * that the issuing process has the necessary privileges.
6291 */
6292 if (epid != so->last_pid || epid != proc_pid(p)) {
6293 if ((error = priv_check_cred(kauth_cred_get(),
6294 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
6295 error = EACCES;
6296 goto done;
6297 }
6298 }
6299
6300 /* Find the process that corresponds to the effective pid */
6301 if ((ep = proc_find(epid)) == PROC_NULL) {
6302 error = ESRCH;
6303 goto done;
6304 }
6305
6306 /*
6307 * If a process tries to delegate the socket to itself, then
6308 * there's really nothing to do; treat it as a way for the
6309 * delegate association to be cleared. Note that we check
6310 * the passed-in proc rather than calling proc_selfpid(),
6311 * as we need to check the process issuing the socket option
6312 * which could be kernproc. Given that we don't allow 0 for
6313 * effective pid, it means that a delegated in-kernel socket
6314 * stays delegated during its lifetime (which is probably OK.)
6315 */
6316 if (epid == proc_pid(p)) {
6317 so->so_flags &= ~SOF_DELEGATED;
6318 so->e_upid = 0;
6319 so->e_pid = 0;
6320 uuid_clear(so->e_uuid);
6321 } else {
6322 so->so_flags |= SOF_DELEGATED;
6323 so->e_upid = proc_uniqueid(ep);
6324 so->e_pid = proc_pid(ep);
6325 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
6326 }
39236c6e
A
6327done:
6328 if (error == 0 && net_io_policy_log) {
6329 uuid_string_t buf;
6330
6331 uuid_unparse(so->e_uuid, buf);
6332 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6333 "euuid %s%s\n", __func__, proc_name_address(p),
6334 proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6335 SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf,
6336 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
6337 } else if (error != 0 && net_io_policy_log) {
6338 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6339 "ERROR (%d)\n", __func__, proc_name_address(p),
6340 proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6341 SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" :
6342 proc_name_address(ep), error);
6343 }
6344
fe8ab488
A
6345 /* Update this socket's policy upon success */
6346 if (error == 0) {
6347 so->so_policy_gencnt *= -1;
6348 so_update_policy(so);
6349#if NECP
6350 so_update_necp_policy(so, NULL, NULL);
6351#endif /* NECP */
6352 }
6353
39236c6e
A
6354 if (ep != PROC_NULL)
6355 proc_rele(ep);
6356
6357 return (error);
6358}
6359
6360int
6361so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
6362{
6363 uuid_string_t buf;
6364 uuid_t uuid;
6365 int error = 0;
6366
6367 /* UUID must not be all-zeroes (reserved for kernel) */
6368 if (uuid_is_null(euuid)) {
6369 error = EINVAL;
6370 goto done;;
6371 }
6372
6373 /*
6374 * If this is an in-kernel socket, prevent its delegate
6375 * association from changing unless the socket option is
6376 * coming from within the kernel itself.
6377 */
6378 if (so->last_pid == 0 && p != kernproc) {
6379 error = EACCES;
6380 goto done;
6381 }
6382
6383 /* Get the UUID of the issuing process */
6384 proc_getexecutableuuid(p, uuid, sizeof (uuid));
6385
6386 /*
6387 * If this is issued by a process that's recorded as the
6388 * real owner of the socket, or if the uuid is the same as
6389 * the process's own uuid, then proceed. Otherwise ensure
6390 * that the issuing process has the necessary privileges.
6391 */
6392 if (uuid_compare(euuid, so->last_uuid) != 0 ||
6393 uuid_compare(euuid, uuid) != 0) {
6394 if ((error = priv_check_cred(kauth_cred_get(),
6395 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
6396 error = EACCES;
6397 goto done;
6398 }
6399 }
6400
6401 /*
6402 * If a process tries to delegate the socket to itself, then
6403 * there's really nothing to do; treat it as a way for the
6404 * delegate association to be cleared. Note that we check
6405 * the uuid of the passed-in proc rather than that of the
6406 * current process, as we need to check the process issuing
6407 * the socket option which could be kernproc itself. Given
6408 * that we don't allow 0 for effective uuid, it means that
6409 * a delegated in-kernel socket stays delegated during its
6410 * lifetime (which is okay.)
6411 */
6412 if (uuid_compare(euuid, uuid) == 0) {
6413 so->so_flags &= ~SOF_DELEGATED;
6414 so->e_upid = 0;
6415 so->e_pid = 0;
6416 uuid_clear(so->e_uuid);
6417 } else {
6418 so->so_flags |= SOF_DELEGATED;
6419 /*
6420 * Unlike so_set_effective_pid(), we only have the UUID
6421 * here and the process ID is not known. Inherit the
6422 * real {pid,upid} of the socket.
6423 */
6424 so->e_upid = so->last_upid;
6425 so->e_pid = so->last_pid;
6426 uuid_copy(so->e_uuid, euuid);
6427 }
6428
6429done:
6430 if (error == 0 && net_io_policy_log) {
6431 uuid_unparse(so->e_uuid, buf);
6432 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
6433 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
6434 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6435 SOCK_TYPE(so), so->e_pid, buf,
6436 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
6437 } else if (error != 0 && net_io_policy_log) {
6438 uuid_unparse(euuid, buf);
6439 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
6440 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
6441 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6442 SOCK_TYPE(so), buf, error);
6443 }
6444
fe8ab488
A
6445 /* Update this socket's policy upon success */
6446 if (error == 0) {
6447 so->so_policy_gencnt *= -1;
6448 so_update_policy(so);
6449#if NECP
6450 so_update_necp_policy(so, NULL, NULL);
6451#endif /* NECP */
6452 }
6453
39236c6e
A
6454 return (error);
6455}
6456
6457void
6458netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
6459 uint32_t ev_datalen)
6460{
6461 struct kev_msg ev_msg;
6462
6463 /*
6464 * A netpolicy event always starts with a netpolicy_event_data
6465 * structure, but the caller can provide for a longer event
6466 * structure to post, depending on the event code.
6467 */
6468 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
6469
6470 bzero(&ev_msg, sizeof (ev_msg));
6471 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6472 ev_msg.kev_class = KEV_NETWORK_CLASS;
6473 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
6474 ev_msg.event_code = ev_code;
6475
6476 ev_msg.dv[0].data_ptr = ev_data;
6477 ev_msg.dv[0].data_length = ev_datalen;
6478
6479 kev_post_msg(&ev_msg);
6480}
fe8ab488
A
6481
6482void
6483socket_post_kev_msg(uint32_t ev_code,
6484 struct kev_socket_event_data *ev_data,
6485 uint32_t ev_datalen)
6486{
6487 struct kev_msg ev_msg;
6488
6489 bzero(&ev_msg, sizeof(ev_msg));
6490 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6491 ev_msg.kev_class = KEV_NETWORK_CLASS;
6492 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
6493 ev_msg.event_code = ev_code;
6494
6495 ev_msg.dv[0].data_ptr = ev_data;
6496 ev_msg.dv[0]. data_length = ev_datalen;
6497
6498 kev_post_msg(&ev_msg);
6499}
6500
6501void
6502socket_post_kev_msg_closed(struct socket *so)
6503{
6504 struct kev_socket_closed ev;
6505 struct sockaddr *socksa = NULL, *peersa = NULL;
6506 int err;
6507 bzero(&ev, sizeof(ev));
6508 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
6509 if (err == 0) {
6510 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
6511 &peersa);
6512 if (err == 0) {
6513 memcpy(&ev.ev_data.kev_sockname, socksa,
6514 min(socksa->sa_len,
6515 sizeof (ev.ev_data.kev_sockname)));
6516 memcpy(&ev.ev_data.kev_peername, peersa,
6517 min(peersa->sa_len,
6518 sizeof (ev.ev_data.kev_peername)));
6519 socket_post_kev_msg(KEV_SOCKET_CLOSED,
6520 &ev.ev_data, sizeof (ev));
6521 }
6522 }
6523 if (socksa != NULL)
6524 FREE(socksa, M_SONAME);
6525 if (peersa != NULL)
6526 FREE(peersa, M_SONAME);
6527}