/*
 * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
/*
 * Copyright (c) 1982, 1986, 1988, 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections.  This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
#include <sys/proc_internal.h>
#include <sys/kauth.h>
#include <sys/file_internal.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/kernel.h>
#include <sys/event.h>
#include <sys/poll.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/uio.h>
#include <sys/ev.h>
#include <sys/kdebug.h>
#include <sys/un.h>
#include <sys/user.h>
#include <sys/priv.h>
#include <sys/kern_event.h>
#include <net/route.h>
#include <net/init.h>
#include <net/ntstat.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/flow_divert.h>
#include <kern/zalloc.h>
#include <kern/locks.h>
#include <machine/limits.h>
#include <libkern/OSAtomic.h>
#include <pexpert/pexpert.h>
#include <kern/assert.h>
#include <kern/task.h>
#include <sys/kpi_mbuf.h>
#include <sys/mcache.h>

#if CONFIG_MACF
#include <security/mac.h>
#include <security/mac_framework.h>
#endif /* MAC */

#if MULTIPATH
#include <netinet/mp_pcb.h>
#endif /* MULTIPATH */

/* TODO: this should be in a header file somewhere */
extern char *proc_name_address(void *p);

static u_int32_t	so_cache_hw;		/* High water mark for socache */
static u_int32_t	so_cache_timeouts;	/* number of timeouts */
static u_int32_t	so_cache_max_freed;	/* max freed per timeout */
static u_int32_t	cached_sock_count = 0;
STAILQ_HEAD(, socket)	so_cache_head;
int	max_cached_sock_count = MAX_CACHED_SOCKETS;
static u_int32_t	so_cache_time;
static int		socketinit_done;
static struct zone	*so_cache_zone;

static lck_grp_t	*so_cache_mtx_grp;
static lck_attr_t	*so_cache_mtx_attr;
static lck_grp_attr_t	*so_cache_mtx_grp_attr;
static lck_mtx_t	*so_cache_mtx;

#include <machine/limits.h>

static void	filt_sordetach(struct knote *kn);
static int	filt_soread(struct knote *kn, long hint);
static void	filt_sowdetach(struct knote *kn);
static int	filt_sowrite(struct knote *kn, long hint);
static void	filt_sockdetach(struct knote *kn);
static int	filt_sockev(struct knote *kn, long hint);

static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);

static struct filterops soread_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sordetach,
	.f_event = filt_soread,
};

static struct filterops sowrite_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sowdetach,
	.f_event = filt_sowrite,
};

static struct filterops sock_filtops = {
	.f_isfd = 1,
	.f_detach = filt_sockdetach,
	.f_event = filt_sockev,
};

#define	EVEN_MORE_LOCKING_DEBUG	0
int socket_debug = 0;
static int socket_zone = M_SOCKET;
so_gen_t so_gencnt;	/* generation count for sockets */

MALLOC_DEFINE(M_SONAME, "soname", "socket name");
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");

#define	DBG_LAYER_IN_BEG	NETDBG_CODE(DBG_NETSOCK, 0)
#define	DBG_LAYER_IN_END	NETDBG_CODE(DBG_NETSOCK, 2)
#define	DBG_LAYER_OUT_BEG	NETDBG_CODE(DBG_NETSOCK, 1)
#define	DBG_LAYER_OUT_END	NETDBG_CODE(DBG_NETSOCK, 3)
#define	DBG_FNC_SOSEND		NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
#define	DBG_FNC_SORECEIVE	NETDBG_CODE(DBG_NETSOCK, (8 << 8))
#define	DBG_FNC_SOSHUTDOWN	NETDBG_CODE(DBG_NETSOCK, (9 << 8))

#define	MAX_SOOPTGETM_SIZE	(128 * MCLBYTES)

SYSCTL_DECL(_kern_ipc);

int somaxconn = SOMAXCONN;
SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");

/* Should we get a maximum also ??? */
static int sosendmaxchain = 65536;
static int sosendminchain = 16384;
static int sorecvmincopy = 16384;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");

/*
 * Set to enable jumbo clusters (if available) for large writes when
 * the socket is marked with SOF_MULTIPAGES; see below.
 */
int sosendjcl = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");

/*
 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
 * writes on the socket for all protocols on any network interfaces,
 * depending upon sosendjcl above.  Be extra careful when setting this
 * to 1, because sending packets that cross physical pages down to
 * broken drivers (those that falsely assume that the physical pages
 * are contiguous) might lead to system panics or silent data corruption.
 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
 * capable.  Set this to 1 only for testing/debugging purposes.
 */
int sosendjcl_ignore_capab = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");

int sodefunctlog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sodefunctlog, 0, "");

int sothrottlelog = 0;
SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sothrottlelog, 0, "");

int sorestrictrecv = 1;
SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sorestrictrecv, 0, "Enable inbound interface restrictions");

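/*
 * The tunables above are exported under the "kern.ipc" sysctl node, so a
 * user-space test harness can inspect or flip them with sysctlbyname(3).
 * What follows is a minimal reference sketch, kept out of the build with
 * #if 0; the oid names come directly from the SYSCTL_INT() declarations.
 */
#if 0	/* user-space reference sketch, not compiled into the kernel */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int val = 0;
	size_t len = sizeof (val);

	/* Read the current jumbo-cluster policy. */
	if (sysctlbyname("kern.ipc.sosendjcl", &val, &len, NULL, 0) == 0)
		printf("sosendjcl = %d\n", val);

	/* Enable verbose defunct logging (requires privilege). */
	val = 1;
	(void) sysctlbyname("kern.ipc.sodefunctlog", NULL, NULL,
	    &val, sizeof (val));
	return (0);
}
#endif
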
/*
 * Socket operation routines.
 * These routines are called by the routines in
 * sys_socket.c or from a system process, and
 * implement the semantics of socket operations by
 * switching out to the protocol specific routines.
 */

/* sys_generic.c */
extern void postevent(struct socket *, struct sockbuf *, int);
extern void evsofree(struct socket *);
extern int tcp_notsent_lowat_check(struct socket *so);
extern struct inpcbinfo tcbinfo;

/* TODO: these should be in a header file */
extern int get_inpcb_str_size(void);
extern int get_tcp_str_size(void);

static unsigned int sl_zone_size;	/* size of sockaddr_list */
static struct zone *sl_zone;		/* zone for sockaddr_list */

static unsigned int se_zone_size;	/* size of sockaddr_entry */
static struct zone *se_zone;		/* zone for sockaddr_entry */

vm_size_t	so_cache_zone_element_size;

static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
    user_ssize_t *);
static void cached_sock_alloc(struct socket **, int);
static void cached_sock_free(struct socket *);

/*
 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
 * setting the DSCP code on the packet based on the service class; see
 * <rdar://problem/11277343> for details.
 */
__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
    &sotcdb, 0, "");

void
socketinit(void)
{
	if (socketinit_done) {
		printf("socketinit: already called...\n");
		return;
	}
	socketinit_done = 1;

	PE_parse_boot_argn("socket_debug", &socket_debug,
	    sizeof (socket_debug));

	/*
	 * allocate lock group attribute and group for socket cache mutex
	 */
	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
	    so_cache_mtx_grp_attr);

	/*
	 * allocate the lock attribute for socket cache mutex
	 */
	so_cache_mtx_attr = lck_attr_alloc_init();

	/* cached sockets mutex */
	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
	if (so_cache_mtx == NULL) {
		panic("%s: unable to allocate so_cache_mtx\n", __func__);
		/* NOTREACHED */
	}
	STAILQ_INIT(&so_cache_head);

	so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
	    + get_inpcb_str_size() + 4 + get_tcp_str_size());

	so_cache_zone = zinit(so_cache_zone_element_size,
	    (120000 * so_cache_zone_element_size), 8192, "socache zone");
	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);

	sl_zone_size = sizeof (struct sockaddr_list);
	if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
	    "sockaddr_list")) == NULL) {
		panic("%s: unable to allocate sockaddr_list zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(sl_zone, Z_CALLERACCT, FALSE);
	zone_change(sl_zone, Z_EXPAND, TRUE);

	se_zone_size = sizeof (struct sockaddr_entry);
	if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
	    "sockaddr_entry")) == NULL) {
		panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(se_zone, Z_CALLERACCT, FALSE);
	zone_change(se_zone, Z_EXPAND, TRUE);

	in_pcbinit();
	sflt_init();
	socket_tclass_init();
#if MULTIPATH
	mp_pcbinit();
#endif /* MULTIPATH */
}

static void
cached_sock_alloc(struct socket **so, int waitok)
{
	caddr_t	temp;
	uintptr_t offset;

	lck_mtx_lock(so_cache_mtx);

	if (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);

		*so = STAILQ_FIRST(&so_cache_head);
		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		STAILQ_NEXT((*so), so_cache_ent) = NULL;

		cached_sock_count--;
		lck_mtx_unlock(so_cache_mtx);

		temp = (*so)->so_saved_pcb;
		bzero((caddr_t)*so, sizeof (struct socket));

		(*so)->so_saved_pcb = temp;
	} else {
		lck_mtx_unlock(so_cache_mtx);

		if (waitok)
			*so = (struct socket *)zalloc(so_cache_zone);
		else
			*so = (struct socket *)zalloc_noblock(so_cache_zone);

		if (*so == NULL)
			return;

		bzero((caddr_t)*so, sizeof (struct socket));

		/*
		 * Define offsets for extra structures into our
		 * single block of memory.  Align extra structures
		 * on longword boundaries.
		 */
		offset = (uintptr_t)*so;
		offset += sizeof (struct socket);

		offset = ALIGN(offset);

		(*so)->so_saved_pcb = (caddr_t)offset;
		offset += get_inpcb_str_size();

		offset = ALIGN(offset);

		((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
		    (caddr_t)offset;
	}

	(*so)->cached_in_sock_layer = true;
}

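/*
 * Sketch of how a single cached-socket zone element is carved up above.
 * The two "+ 4" slop bytes folded into so_cache_zone_element_size by
 * socketinit() leave room for the ALIGN() rounding of each offset:
 *
 *	+-----------------------------+ <- (uintptr_t)*so
 *	| struct socket               |
 *	+-----------------------------+ <- ALIGN()ed offset
 *	| inpcb storage               |    = (*so)->so_saved_pcb
 *	+-----------------------------+ <- ALIGN()ed offset
 *	| tcpcb storage               |    = inp_saved_ppcb
 *	+-----------------------------+
 */
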
static void
cached_sock_free(struct socket *so)
{
	lck_mtx_lock(so_cache_mtx);

	so_cache_time = net_uptime();
	if (++cached_sock_count > max_cached_sock_count) {
		--cached_sock_count;
		lck_mtx_unlock(so_cache_mtx);
		zfree(so_cache_zone, so);
	} else {
		if (so_cache_hw < cached_sock_count)
			so_cache_hw = cached_sock_count;

		STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);

		so->cache_timestamp = so_cache_time;
		lck_mtx_unlock(so_cache_mtx);
	}
}

void
so_update_last_owner_locked(struct socket *so, proc_t self)
{
	if (so->last_pid != 0) {
		/*
		 * last_pid and last_upid should remain zero for sockets
		 * created using sock_socket.  The check above achieves that.
		 */
		if (self == PROC_NULL)
			self = current_proc();

		if (so->last_upid != proc_uniqueid(self) ||
		    so->last_pid != proc_pid(self)) {
			so->last_upid = proc_uniqueid(self);
			so->last_pid = proc_pid(self);
			proc_getexecutableuuid(self, so->last_uuid,
			    sizeof (so->last_uuid));
		}
	}
}

void
so_update_policy(struct socket *so)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
		(void) inp_update_policy(sotoinpcb(so));
}

boolean_t
so_cache_timer(void)
{
	struct socket	*p;
	int		n_freed = 0;
	boolean_t	rc = FALSE;

	lck_mtx_lock(so_cache_mtx);
	so_cache_timeouts++;
	so_cache_time = net_uptime();

	while (!STAILQ_EMPTY(&so_cache_head)) {
		VERIFY(cached_sock_count > 0);
		p = STAILQ_FIRST(&so_cache_head);
		if ((so_cache_time - p->cache_timestamp) <
		    SO_CACHE_TIME_LIMIT)
			break;

		STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
		--cached_sock_count;

		zfree(so_cache_zone, p);

		if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
			so_cache_max_freed++;
			break;
		}
	}

	/* Schedule again if there is more to clean up */
	if (!STAILQ_EMPTY(&so_cache_head))
		rc = TRUE;

	lck_mtx_unlock(so_cache_mtx);
	return (rc);
}

/*
 * Get a socket structure from our zone, and initialize it.
 * We don't implement `waitok' yet (see comments in uipc_domain.c).
 * Note that it would probably be better to allocate socket
 * and PCB at the same time, but I'm not convinced that all
 * the protocols can be easily modified to do this.
 */
struct socket *
soalloc(int waitok, int dom, int type)
{
	struct socket *so;

	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
		cached_sock_alloc(&so, waitok);
	} else {
		MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
		    M_WAITOK);
		if (so != NULL)
			bzero(so, sizeof (*so));
	}
	if (so != NULL) {
		so->so_gencnt = ++so_gencnt;
		so->so_zone = socket_zone;
#if CONFIG_MACF_SOCKET
		/* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
		if (mac_socket_label_init(so, !waitok) != 0) {
			sodealloc(so);
			return (NULL);
		}
#endif /* MAC_SOCKET */
	}

	return (so);
}

int
socreate_internal(int dom, struct socket **aso, int type, int proto,
    struct proc *p, uint32_t flags, struct proc *ep)
{
	struct protosw *prp;
	struct socket *so;
	int error = 0;

#if TCPDEBUG
	extern int tcpconsdebug;
#endif

	VERIFY(aso != NULL);
	*aso = NULL;

	if (proto != 0)
		prp = pffindproto(dom, proto, type);
	else
		prp = pffindtype(dom, type);

	if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
		if (pffinddomain(dom) == NULL)
			return (EAFNOSUPPORT);
		if (proto != 0) {
			if (pffindprotonotype(dom, proto) != NULL)
				return (EPROTOTYPE);
		}
		return (EPROTONOSUPPORT);
	}
	if (prp->pr_type != type)
		return (EPROTOTYPE);
	so = soalloc(1, dom, type);
	if (so == NULL)
		return (ENOBUFS);

	if (flags & SOCF_ASYNC)
		so->so_state |= SS_NBIO;
#if MULTIPATH
	if (flags & SOCF_MP_SUBFLOW) {
		/*
		 * A multipath subflow socket is used internally in the kernel,
		 * therefore it does not have a file descriptor associated by
		 * default.
		 */
		so->so_state |= SS_NOFDREF;
		so->so_flags |= SOF_MP_SUBFLOW;
	}
#endif /* MULTIPATH */

	TAILQ_INIT(&so->so_incomp);
	TAILQ_INIT(&so->so_comp);
	so->so_type = type;
	so->last_upid = proc_uniqueid(p);
	so->last_pid = proc_pid(p);
	proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));

	if (ep != PROC_NULL && ep != p) {
		so->e_upid = proc_uniqueid(ep);
		so->e_pid = proc_pid(ep);
		proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
		so->so_flags |= SOF_DELEGATED;
	}

	so->so_cred = kauth_cred_proc_ref(p);
	if (!suser(kauth_cred_get(), NULL))
		so->so_state |= SS_PRIV;

	so->so_proto = prp;
	so->so_rcv.sb_flags |= SB_RECV;
	so->so_rcv.sb_so = so->so_snd.sb_so = so;
	so->next_lock_lr = 0;
	so->next_unlock_lr = 0;

#if CONFIG_MACF_SOCKET
	mac_socket_label_associate(kauth_cred_get(), so);
#endif /* MAC_SOCKET */

	/*
	 * Attachment will create the per pcb lock if necessary and
	 * increase refcount for creation, make sure it's done before
	 * socket is inserted in lists.
	 */
	so->so_usecount++;

	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
	if (error != 0) {
		/*
		 * Warning:
		 * If so_pcb is not zero, the socket will be leaked,
		 * so protocol attachment handler must be coded carefully.
		 */
		so->so_state |= SS_NOFDREF;
		so->so_usecount--;
		sofreelastref(so, 1);	/* will deallocate the socket */
		return (error);
	}

	atomic_add_32(&prp->pr_domain->dom_refs, 1);
	TAILQ_INIT(&so->so_evlist);

	/* Attach socket filters for this protocol */
	sflt_initsock(so);
#if TCPDEBUG
	if (tcpconsdebug == 2)
		so->so_options |= SO_DEBUG;
#endif
	so_set_default_traffic_class(so);

	/*
	 * If this thread or task is marked to create backgrounded sockets,
	 * mark the socket as background.
	 */
	if (proc_get_effective_thread_policy(current_thread(),
	    TASK_POLICY_NEW_SOCKETS_BG)) {
		socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
		so->so_background_thread = current_thread();
	}

	switch (dom) {
	/*
	 * Don't mark Unix domain, system or multipath sockets as
	 * eligible for defunct by default.
	 */
	case PF_LOCAL:
	case PF_SYSTEM:
	case PF_MULTIPATH:
		so->so_flags |= SOF_NODEFUNCT;
		break;
	default:
		break;
	}

	*aso = so;

	return (0);
}

/*
 * Returns:	0			Success
 *		EAFNOSUPPORT
 *		EPROTOTYPE
 *		EPROTONOSUPPORT
 *		ENOBUFS
 *	<pru_attach>:ENOBUFS[AF_UNIX]
 *	<pru_attach>:ENOBUFS[TCP]
 *	<pru_attach>:ENOMEM[TCP]
 *	<pru_attach>:???		[other protocol families, IPSEC]
 */
int
socreate(int dom, struct socket **aso, int type, int proto)
{
	return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
	    PROC_NULL));
}

int
socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
{
	int error = 0;
	struct proc *ep = PROC_NULL;

	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
		error = ESRCH;
		goto done;
	}

	error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);

	/*
	 * It might not be wise to hold the proc reference when calling
	 * socreate_internal since it calls soalloc with M_WAITOK
	 */
done:
	if (ep != PROC_NULL)
		proc_rele(ep);

	return (error);
}

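/*
 * In-kernel consumers normally do not call socreate() directly; they go
 * through the sock_socket() KPI from <sys/kpi_socket.h>, which lands in
 * the code above (and for which so_update_last_owner_locked() keeps
 * last_pid/last_upid at zero).  A minimal KEXT-side sketch, assuming
 * only the public sock_socket() interface:
 */
#if 0	/* KEXT-side reference sketch, not compiled into the kernel */
#include <sys/kpi_socket.h>

static errno_t
make_kernel_tcp_socket(socket_t *sop)
{
	/* No upcall callback or cookie; the caller drives I/O itself. */
	return (sock_socket(PF_INET, SOCK_STREAM, IPPROTO_TCP,
	    NULL, NULL, sop));
}
#endif
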
/*
 * Returns:	0			Success
 *	<pru_bind>:EINVAL		Invalid argument [COMMON_START]
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported
 *	<pru_bind>:EADDRNOTAVAIL	Address not available.
 *	<pru_bind>:EINVAL		Invalid argument
 *	<pru_bind>:EAFNOSUPPORT		Address family not supported [notdef]
 *	<pru_bind>:EACCES		Permission denied
 *	<pru_bind>:EADDRINUSE		Address in use
 *	<pru_bind>:EAGAIN		Resource unavailable, try again
 *	<pru_bind>:EPERM		Operation not permitted
 *	<pru_bind>:???
 *	<sf_bind>:???
 *
 * Notes:	It's not possible to fully enumerate the return codes above,
 *		since socket filter authors and protocol family authors may
 *		not choose to limit their error returns to those listed, even
 *		though this may result in some software operating incorrectly.
 *
 *		The error codes which are enumerated above are those known to
 *		be returned by the tcp_usr_bind function supplied.
 */
int
sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	struct proc *p = current_proc();
	int error = 0;

	if (dolock)
		socket_lock(so, 1);
	VERIFY(so->so_usecount > 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a bind request on a socket that has been marked
	 * as inactive, reject it now before we go any further.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
		error = EINVAL;
		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error));
		goto out;
	}

	/* Socket filter */
	error = sflt_bind(so, nam);

	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
out:
	if (dolock)
		socket_unlock(so, 1);

	if (error == EJUSTRETURN)
		error = 0;

	return (error);
}

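/*
 * The EJUSTRETURN handling above is the contract for socket filters
 * registered through sflt_register(): a filter's sf_bind callback may
 * return EJUSTRETURN to swallow the bind, in which case sobindlock()
 * reports success without ever calling <pru_bind>.  A bare-bones sketch
 * of such a callback (hypothetical filter, shown for illustration):
 */
#if 0	/* KEXT-side reference sketch, not compiled into the kernel */
#include <sys/kpi_socketfilter.h>

static errno_t
example_sf_bind(void *cookie, socket_t so, const struct sockaddr *to)
{
#pragma unused(cookie, so, to)
	/* Pretend the bind already happened; <pru_bind> is skipped. */
	return (EJUSTRETURN);
}
#endif
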
void
sodealloc(struct socket *so)
{
	kauth_cred_unref(&so->so_cred);

	/* Remove any filters */
	sflt_termsock(so);

	/* Delete the state allocated for msg queues on a socket */
	if (so->so_flags & SOF_ENABLE_MSGS) {
		FREE(so->so_msg_state, M_TEMP);
		so->so_msg_state = NULL;
	}
	VERIFY(so->so_msg_state == NULL);

	so->so_gencnt = ++so_gencnt;

#if CONFIG_MACF_SOCKET
	mac_socket_label_destroy(so);
#endif /* MAC_SOCKET */

	if (so->cached_in_sock_layer) {
		cached_sock_free(so);
	} else {
		FREE_ZONE(so, sizeof (*so), so->so_zone);
	}
}

/*
 * Returns:	0			Success
 *		EINVAL
 *		EOPNOTSUPP
 *	<pru_listen>:EINVAL[AF_UNIX]
 *	<pru_listen>:EINVAL[TCP]
 *	<pru_listen>:EADDRNOTAVAIL[TCP]	Address not available.
 *	<pru_listen>:EINVAL[TCP]	Invalid argument
 *	<pru_listen>:EAFNOSUPPORT[TCP]	Address family not supported [notdef]
 *	<pru_listen>:EACCES[TCP]	Permission denied
 *	<pru_listen>:EADDRINUSE[TCP]	Address in use
 *	<pru_listen>:EAGAIN[TCP]	Resource unavailable, try again
 *	<pru_listen>:EPERM[TCP]		Operation not permitted
 *	<sf_listen>:???
 *
 * Notes:	Other <pru_listen> returns depend on the protocol family; all
 *		<sf_listen> returns depend on what the filter author causes
 *		their filter to return.
 */
int
solisten(struct socket *so, int backlog)
{
	struct proc *p = current_proc();
	int error = 0;

	socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	if (so->so_proto == NULL) {
		error = EINVAL;
		goto out;
	}
	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
		error = EOPNOTSUPP;
		goto out;
	}

	/*
	 * If the listen request is made on a socket that is not fully
	 * disconnected, or on a socket that has been marked as inactive,
	 * reject the request now.
	 */
	if ((so->so_state &
	    (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
	    (so->so_flags & SOF_DEFUNCT)) {
		error = EINVAL;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error));
		}
		goto out;
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
		error = EPERM;
		goto out;
	}

	error = sflt_listen(so);
	if (error == 0)
		error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);

	if (error) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (TAILQ_EMPTY(&so->so_comp))
		so->so_options |= SO_ACCEPTCONN;
	/*
	 * POSIX: The implementation may have an upper limit on the length of
	 * the listen queue, either global or per accepting socket.  If backlog
	 * exceeds this limit, the length of the listen queue is set to the
	 * limit.
	 *
	 * If listen() is called with a backlog argument value that is less
	 * than 0, the function behaves as if it had been called with a backlog
	 * argument value of 0.
	 *
	 * A backlog argument of 0 may allow the socket to accept connections,
	 * in which case the length of the listen queue may be set to an
	 * implementation-defined minimum value.
	 */
	if (backlog <= 0 || backlog > somaxconn)
		backlog = somaxconn;

	so->so_qlimit = backlog;
out:
	socket_unlock(so, 1);
	return (error);
}

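/*
 * The backlog clamping above is directly observable from user space:
 * passing 0, a negative value, or anything above kern.ipc.somaxconn all
 * end up bounded by somaxconn.  A one-line sketch of a caller relying
 * on that behavior:
 */
#if 0	/* user-space reference sketch, not compiled into the kernel */
#include <sys/socket.h>

static int
start_listener(int s)
{
	/* -1 is clamped to somaxconn by solisten() above. */
	return (listen(s, -1));
}
#endif
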
void
sofreelastref(struct socket *so, int dealloc)
{
	struct socket *head = so->so_head;

	/* Assume socket is locked */

	if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
		selthreadclear(&so->so_snd.sb_sel);
		selthreadclear(&so->so_rcv.sb_sel);
		so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
		so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
		so->so_event = NULL;
		return;
	}
	if (head != NULL) {
		socket_lock(head, 1);
		if (so->so_state & SS_INCOMP) {
			TAILQ_REMOVE(&head->so_incomp, so, so_list);
			head->so_incqlen--;
		} else if (so->so_state & SS_COMP) {
			/*
			 * We must not decommission a socket that's
			 * on the accept(2) queue.  If we do, then
			 * accept(2) may hang after select(2) indicated
			 * that the listening socket was ready.
			 */
			selthreadclear(&so->so_snd.sb_sel);
			selthreadclear(&so->so_rcv.sb_sel);
			so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
			so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
			so->so_event = NULL;
			socket_unlock(head, 1);
			return;
		} else {
			panic("sofree: not queued");
		}
		head->so_qlen--;
		so->so_state &= ~SS_INCOMP;
		so->so_head = NULL;
		socket_unlock(head, 1);
	}
	sowflush(so);
	sorflush(so);

#if FLOW_DIVERT
	if (so->so_flags & SOF_FLOW_DIVERT) {
		flow_divert_detach(so);
	}
#endif	/* FLOW_DIVERT */

	/* 3932268: disable upcall */
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_event = NULL;

	if (dealloc)
		sodealloc(so);
}

void
soclose_wait_locked(struct socket *so)
{
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);

	/*
	 * Double check here and return if there's no outstanding upcall;
	 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
	 */
	if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
		return;
	so->so_rcv.sb_flags &= ~SB_UPCALL;
	so->so_snd.sb_flags &= ~SB_UPCALL;
	so->so_flags |= SOF_CLOSEWAIT;
	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
	    "soclose_wait_locked", NULL);
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
	so->so_flags &= ~SOF_CLOSEWAIT;
}

/*
 * Close a socket on last file table reference removal.
 * Initiate disconnect if connected.
 * Free socket when disconnect complete.
 */
int
soclose_locked(struct socket *so)
{
	int error = 0;
	lck_mtx_t *mutex_held;
	struct timespec ts;

	if (so->so_usecount == 0) {
		panic("soclose: so=%p refcount=0\n", so);
		/* NOTREACHED */
	}

	sflt_notify(so, sock_evt_closing, NULL);

	if (so->so_upcallusecount)
		soclose_wait_locked(so);

	if ((so->so_options & SO_ACCEPTCONN)) {
		struct socket *sp, *sonext;
		int socklock = 0;

		/*
		 * We do not want new connections to be added
		 * to the connection queues
		 */
		so->so_options &= ~SO_ACCEPTCONN;

		for (sp = TAILQ_FIRST(&so->so_incomp);
		    sp != NULL; sp = sonext) {
			sonext = TAILQ_NEXT(sp, so_list);

			/*
			 * Radar 5350314
			 * Skip sockets thrown away by tcp_dropdropablreq();
			 * they will get cleaned up by the garbage collection.
			 * Otherwise, remove the incomp socket from the queue
			 * and let soabort trigger the appropriate cleanup.
			 */
			if (sp->so_flags & SOF_OVERFLOW)
				continue;

			if (so->so_proto->pr_getlock != NULL) {
				/*
				 * For lock ordering consistency with the
				 * rest of the stack, we lock the socket
				 * first and then grab the head.
				 */
				socket_unlock(so, 0);
				socket_lock(sp, 1);
				socket_lock(so, 0);
				socklock = 1;
			}

			TAILQ_REMOVE(&so->so_incomp, sp, so_list);
			so->so_incqlen--;

			if (sp->so_state & SS_INCOMP) {
				sp->so_state &= ~SS_INCOMP;
				sp->so_head = NULL;

				(void) soabort(sp);
			}

			if (socklock)
				socket_unlock(sp, 1);
		}

		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
			/* Dequeue from so_comp since sofree() won't do it */
			TAILQ_REMOVE(&so->so_comp, sp, so_list);
			so->so_qlen--;

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(so, 0);
				socket_lock(sp, 1);
			}

			if (sp->so_state & SS_COMP) {
				sp->so_state &= ~SS_COMP;
				sp->so_head = NULL;

				(void) soabort(sp);
			}

			if (so->so_proto->pr_getlock != NULL) {
				socket_unlock(sp, 1);
				socket_lock(so, 0);
			}
		}
	}
	if (so->so_pcb == NULL) {
		/* 3915887: mark the socket as ready for dealloc */
		so->so_flags |= SOF_PCBCLEARING;
		goto discard;
	}
	if (so->so_state & SS_ISCONNECTED) {
		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
			error = sodisconnectlocked(so);
			if (error)
				goto drop;
		}
		if (so->so_options & SO_LINGER) {
			if ((so->so_state & SS_ISDISCONNECTING) &&
			    (so->so_state & SS_NBIO))
				goto drop;
			if (so->so_proto->pr_getlock != NULL)
				mutex_held = (*so->so_proto->pr_getlock)(so, 0);
			else
				mutex_held = so->so_proto->pr_domain->dom_mtx;
			while (so->so_state & SS_ISCONNECTED) {
				ts.tv_sec = (so->so_linger/100);
				ts.tv_nsec = (so->so_linger % 100) *
				    NSEC_PER_USEC * 1000 * 10;
				error = msleep((caddr_t)&so->so_timeo,
				    mutex_held, PSOCK | PCATCH, "soclose", &ts);
				if (error) {
					/*
					 * It's OK when the time fires,
					 * don't report an error
					 */
					if (error == EWOULDBLOCK)
						error = 0;
					break;
				}
			}
		}
	}
drop:
	if (so->so_usecount == 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
		/*
		 * Let NetworkStatistics know this PCB is going away
		 * before we detach it.
		 */
		if (nstat_collect &&
		    (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6))
			nstat_pcb_detach(so->so_pcb);

		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
		if (error == 0)
			error = error2;
	}
	if (so->so_usecount <= 0) {
		panic("soclose: usecount is zero so=%p\n", so);
		/* NOTREACHED */
	}
discard:
	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
	    (so->so_state & SS_NOFDREF)) {
		panic("soclose: NOFDREF");
		/* NOTREACHED */
	}
	so->so_state |= SS_NOFDREF;

	if (so->so_flags & SOF_MP_SUBFLOW)
		so->so_flags &= ~SOF_MP_SUBFLOW;

	if ((so->so_flags & SOF_KNOTE) != 0)
		KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);

	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
	evsofree(so);

	so->so_usecount--;
	sofree(so);
	return (error);
}

int
soclose(struct socket *so)
{
	int error = 0;
	socket_lock(so, 1);

	if (so->so_retaincnt == 0) {
		error = soclose_locked(so);
	} else {
		/*
		 * If the FD is going away, but the socket is
		 * retained in the kernel, remove its reference
		 */
		so->so_usecount--;
		if (so->so_usecount < 2)
			panic("soclose: retaincnt non null and so=%p "
			    "usecount=%d\n", so, so->so_usecount);
	}
	socket_unlock(so, 1);
	return (error);
}

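/*
 * The linger loop in soclose_locked() above keeps so_linger in ticks of
 * 1/100th of a second, which is what the tv_sec/tv_nsec arithmetic
 * divides back out.  From user space the usual way to arm it in whole
 * seconds is the Darwin-specific SO_LINGER_SEC variant; a minimal
 * sketch, assuming a connected stream socket s:
 */
#if 0	/* user-space reference sketch, not compiled into the kernel */
#include <sys/socket.h>
#include <unistd.h>

static int
close_with_linger(int s)
{
	struct linger l = { .l_onoff = 1, .l_linger = 5 };	/* 5 seconds */

	if (setsockopt(s, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof (l)) == -1)
		return (-1);
	/* close() may now block up to 5s waiting for the disconnect. */
	return (close(s));
}
#endif
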
/*
 * Must be called at splnet...
 */
/* Should already be locked */
int
soabort(struct socket *so)
{
	int error;

#ifdef MORE_LOCKING_DEBUG
	lck_mtx_t *mutex_held;

	if (so->so_proto->pr_getlock != NULL)
		mutex_held = (*so->so_proto->pr_getlock)(so, 0);
	else
		mutex_held = so->so_proto->pr_domain->dom_mtx;
	lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
#endif

	if ((so->so_flags & SOF_ABORTED) == 0) {
		so->so_flags |= SOF_ABORTED;
		error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
		if (error) {
			sofree(so);
			return (error);
		}
	}
	return (0);
}

int
soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
{
	int error;

	if (dolock)
		socket_lock(so, 1);

	so_update_last_owner_locked(so, PROC_NULL);
	so_update_policy(so);

	if ((so->so_state & SS_NOFDREF) == 0)
		panic("soaccept: !NOFDREF");
	so->so_state &= ~SS_NOFDREF;
	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);

	if (dolock)
		socket_unlock(so, 1);
	return (error);
}

int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return (soacceptlock(so, nam, 1));
}

int
soacceptfilter(struct socket *so)
{
	struct sockaddr *local = NULL, *remote = NULL;
	int error = 0;
	struct socket *head = so->so_head;

	/*
	 * Hold the lock even if this socket has not been made visible
	 * to the filter(s).  For sockets with global locks, this protects
	 * against the head or peer going away.
	 */
	socket_lock(so, 1);
	if (sogetaddr_locked(so, &remote, 1) != 0 ||
	    sogetaddr_locked(so, &local, 0) != 0) {
		so->so_state &= ~(SS_NOFDREF | SS_COMP);
		so->so_head = NULL;
		socket_unlock(so, 1);
		soclose(so);
		/* Out of resources; try it again next time */
		error = ECONNABORTED;
		goto done;
	}

	error = sflt_accept(head, so, local, remote);

	/*
	 * If we get EJUSTRETURN from one of the filters, mark this socket
	 * as inactive and return it anyway.  This newly accepted socket
	 * will be disconnected later before we hand it off to the caller.
	 */
	if (error == EJUSTRETURN) {
		error = 0;
		(void) sosetdefunct(current_proc(), so,
		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
	}

	if (error != 0) {
		/*
		 * This may seem like a duplication to the above error
		 * handling part when we return ECONNABORTED, except
		 * the following is done while holding the lock since
		 * the socket has been exposed to the filter(s) earlier.
		 */
		so->so_state &= ~(SS_NOFDREF | SS_COMP);
		so->so_head = NULL;
		socket_unlock(so, 1);
		soclose(so);
		/* Propagate socket filter's error code to the caller */
	} else {
		socket_unlock(so, 1);
	}
done:
	/* Callee checks for NULL pointer */
	sock_freeaddr(remote);
	sock_freeaddr(local);
	return (error);
}

/*
 * Returns:	0			Success
 *		EOPNOTSUPP		Operation not supported on socket
 *		EISCONN			Socket is connected
 *	<pru_connect>:EADDRNOTAVAIL	Address not available.
 *	<pru_connect>:EINVAL		Invalid argument
 *	<pru_connect>:EAFNOSUPPORT	Address family not supported [notdef]
 *	<pru_connect>:EACCES		Permission denied
 *	<pru_connect>:EADDRINUSE	Address in use
 *	<pru_connect>:EAGAIN		Resource unavailable, try again
 *	<pru_connect>:EPERM		Operation not permitted
 *	<sf_connect_out>:???		[anything a filter writer might set]
 */
int
soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
{
	int error;
	struct proc *p = current_proc();

	if (dolock)
		socket_lock(so, 1);

	so_update_last_owner_locked(so, p);
	so_update_policy(so);

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error));
		}
		if (dolock)
			socket_unlock(so, 1);
		return (error);
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
		if (dolock)
			socket_unlock(so, 1);
		return (EPERM);
	}

	/*
	 * If protocol is connection-based, can only connect once.
	 * Otherwise, if connected, try to disconnect first.
	 * This allows user to disconnect by connecting to, e.g.,
	 * a null address.
	 */
	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)))) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectout(so, nam);
		if (error != 0) {
			if (error == EJUSTRETURN)
				error = 0;
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connect)
			    (so, nam, p);
		}
	}
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}

int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return (soconnectlock(so, nam, 1));
}

/*
 * Returns:	0			Success
 *	<pru_connect2>:EINVAL[AF_UNIX]
 *	<pru_connect2>:EPROTOTYPE[AF_UNIX]
 *	<pru_connect2>:???		[other protocol families]
 *
 * Notes:	<pru_connect2> is not supported by [TCP].
 */
int
soconnect2(struct socket *so1, struct socket *so2)
{
	int error;

	socket_lock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_lock(so2, 1);

	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);

	socket_unlock(so1, 1);
	if (so2->so_proto->pr_lock)
		socket_unlock(so2, 1);
	return (error);
}

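/*
 * soconnect2() is the kernel half of what socketpair(2) does for
 * connection-oriented AF_UNIX sockets: two fresh sockets are
 * cross-connected without any address lookup.  Sketch of the user-space
 * entry point that ultimately exercises this path:
 */
#if 0	/* user-space reference sketch, not compiled into the kernel */
#include <sys/socket.h>

static int
make_pipe_like_pair(int sv[2])
{
	/*
	 * Each end is connected to the other; TCP (PF_INET) is not
	 * supported here, matching the <pru_connect2> note above.
	 */
	return (socketpair(PF_LOCAL, SOCK_STREAM, 0, sv));
}
#endif
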
int
soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
    struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
    associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
    uint32_t arglen)
{
	int error;

	/*
	 * If this is a listening socket or if this is a previously-accepted
	 * socket that has been marked as inactive, reject the connect request.
	 */
	if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
		error = EOPNOTSUPP;
		if (so->so_flags & SOF_DEFUNCT) {
			SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
			    "(%d)\n", __func__, proc_pid(p),
			    (uint64_t)VM_KERNEL_ADDRPERM(so),
			    SOCK_DOM(so), SOCK_TYPE(so), error));
		}
		return (error);
	}

	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
		return (EPERM);

	/*
	 * If protocol is connection-based, can only connect once
	 * unless PR_MULTICONN is set.  Otherwise, if connected,
	 * try to disconnect first.  This allows user to disconnect
	 * by connecting to, e.g., a null address.
	 */
	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
	    !(so->so_proto->pr_flags & PR_MULTICONN) &&
	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
	    (error = sodisconnectlocked(so)) != 0)) {
		error = EISCONN;
	} else {
		/*
		 * Run connect filter before calling protocol:
		 * - non-blocking connect returns before completion;
		 */
		error = sflt_connectxout(so, dst_sl);
		if (error != 0) {
			if (error == EJUSTRETURN)
				error = 0;
		} else {
			error = (*so->so_proto->pr_usrreqs->pru_connectx)
			    (so, src_sl, dst_sl, p, ifscope, aid, pcid,
			    flags, arg, arglen);
		}
	}

	return (error);
}

int
sodisconnectlocked(struct socket *so)
{
	int error;

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		error = ENOTCONN;
		goto bad;
	}
	if (so->so_state & SS_ISDISCONNECTING) {
		error = EALREADY;
		goto bad;
	}

	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
	if (error == 0)
		sflt_notify(so, sock_evt_disconnected, NULL);

bad:
	return (error);
}

/* Locking version */
int
sodisconnect(struct socket *so)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectlocked(so);
	socket_unlock(so, 1);
	return (error);
}

int
sodisconnectxlocked(struct socket *so, associd_t aid, connid_t cid)
{
	int error;

	/*
	 * Call the protocol disconnectx handler; let it handle all
	 * matters related to the connection state of this session.
	 */
	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
	if (error == 0) {
		/*
		 * The event applies only for the session, not for
		 * the disconnection of individual subflows.
		 */
		if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
			sflt_notify(so, sock_evt_disconnected, NULL);
	}
	return (error);
}

int
sodisconnectx(struct socket *so, associd_t aid, connid_t cid)
{
	int error;

	socket_lock(so, 1);
	error = sodisconnectxlocked(so, aid, cid);
	socket_unlock(so, 1);
	return (error);
}

int
sopeelofflocked(struct socket *so, associd_t aid, struct socket **psop)
{
	return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
}

#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)

/*
 * sosendcheck will lock the socket buffer if it isn't locked and
 * verify that there is space for the data being inserted.
 *
 * Returns:	0		Success
 *		EPIPE
 *	sblock:EWOULDBLOCK
 *	sblock:EINTR
 *	sbwait:EBADF
 *	sbwait:EINTR
 *	[so_error]:???
 */
int
sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
    int32_t clen, int32_t atomic, int flags, int *sblocked,
    struct mbuf *control)
{
	int	error = 0;
	int32_t	space;
	int	assumelock = 0;

restart:
	if (*sblocked == 0) {
		if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
		    so->so_send_filt_thread != 0 &&
		    so->so_send_filt_thread == current_thread()) {
			/*
			 * We're being called recursively from a filter,
			 * allow this to continue.  Radar 4150520.
			 * Don't set sblocked because we don't want
			 * to perform an unlock later.
			 */
			assumelock = 1;
		} else {
			error = sblock(&so->so_snd, SBLOCKWAIT(flags));
			if (error) {
				if (so->so_flags & SOF_DEFUNCT)
					goto defunct;
				return (error);
			}
			*sblocked = 1;
		}
	}

	/*
	 * If a send attempt is made on a socket that has been marked
	 * as inactive (disconnected), reject the request.
	 */
	if (so->so_flags & SOF_DEFUNCT) {
defunct:
		error = EPIPE;
		SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
		    __func__, proc_selfpid(), (uint64_t)VM_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so), error));
		return (error);
	}

	if (so->so_state & SS_CANTSENDMORE)
		return (EPIPE);

	if (so->so_error) {
		error = so->so_error;
		so->so_error = 0;
		return (error);
	}

	if ((so->so_state & SS_ISCONNECTED) == 0) {
		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
			    !(resid == 0 && clen != 0))
				return (ENOTCONN);
		} else if (addr == 0 && !(flags&MSG_HOLD)) {
			return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
			    ENOTCONN : EDESTADDRREQ);
		}
	}
	if (so->so_flags & SOF_ENABLE_MSGS)
		space = msgq_sbspace(so, control);
	else
		space = sbspace(&so->so_snd);

	if (flags & MSG_OOB)
		space += 1024;
	if ((atomic && resid > so->so_snd.sb_hiwat) ||
	    clen > so->so_snd.sb_hiwat)
		return (EMSGSIZE);

	if ((space < resid + clen &&
	    (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) ||
	    (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
		if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
		    assumelock) {
			return (EWOULDBLOCK);
		}
		sbunlock(&so->so_snd, TRUE);	/* keep socket locked */
		*sblocked = 0;
		error = sbwait(&so->so_snd);
		if (error) {
			if (so->so_flags & SOF_DEFUNCT)
				goto defunct;
			return (error);
		}
		goto restart;
	}
	return (0);
}

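/*
 * The EWOULDBLOCK path in sosendcheck() (taken when SS_NBIO is set and
 * the send buffer has no room) is what a non-blocking writer sees in
 * user space; SS_NBIO corresponds to O_NONBLOCK on the descriptor.  A
 * minimal retry loop, sketched under that assumption:
 */
#if 0	/* user-space reference sketch, not compiled into the kernel */
#include <sys/socket.h>
#include <errno.h>
#include <fcntl.h>
#include <poll.h>

static ssize_t
send_all_nonblocking(int s, const char *buf, size_t len)
{
	size_t off = 0;

	/* O_NONBLOCK sets SS_NBIO on the socket. */
	(void) fcntl(s, F_SETFL, fcntl(s, F_GETFL, 0) | O_NONBLOCK);

	while (off < len) {
		ssize_t n = send(s, buf + off, len - off, 0);
		if (n >= 0) {
			off += (size_t)n;
			continue;
		}
		if (errno == EWOULDBLOCK) {
			/* Roughly the sbwait() step: wait until writable. */
			struct pollfd pfd = { .fd = s, .events = POLLOUT };
			(void) poll(&pfd, 1, -1);
			continue;
		}
		return (-1);
	}
	return ((ssize_t)off);
}
#endif
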
1c79356b
A
1638/*
1639 * Send on a socket.
1640 * If send must go all at once and message is larger than
1641 * send buffering, then hard error.
1642 * Lock against other senders.
1643 * If must go all at once and not enough room now, then
1644 * inform user that this would block and do nothing.
1645 * Otherwise, if nonblocking, send as much as possible.
1646 * The data to be sent is described by "uio" if nonzero,
1647 * otherwise by the mbuf chain "top" (which must be null
1648 * if uio is not). Data provided in mbuf chain must be small
1649 * enough to send all at once.
1650 *
1651 * Returns nonzero on error, timeout or signal; callers
1652 * must check for short counts if EINTR/ERESTART are returned.
1653 * Data and control buffers are freed on return.
1654 * Experiment:
1655 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1656 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1657 * point at the mbuf chain being constructed and go from there.
2d21ac55
A
1658 *
1659 * Returns: 0 Success
1660 * EOPNOTSUPP
1661 * EINVAL
1662 * ENOBUFS
1663 * uiomove:EFAULT
1664 * sosendcheck:EPIPE
1665 * sosendcheck:EWOULDBLOCK
1666 * sosendcheck:EINTR
1667 * sosendcheck:EBADF
1668 * sosendcheck:EINTR
1669 * sosendcheck:??? [value from so_error]
1670 * <pru_send>:ECONNRESET[TCP]
1671 * <pru_send>:EINVAL[TCP]
1672 * <pru_send>:ENOBUFS[TCP]
1673 * <pru_send>:EADDRINUSE[TCP]
1674 * <pru_send>:EADDRNOTAVAIL[TCP]
1675 * <pru_send>:EAFNOSUPPORT[TCP]
1676 * <pru_send>:EACCES[TCP]
1677 * <pru_send>:EAGAIN[TCP]
1678 * <pru_send>:EPERM[TCP]
1679 * <pru_send>:EMSGSIZE[TCP]
1680 * <pru_send>:EHOSTUNREACH[TCP]
1681 * <pru_send>:ENETUNREACH[TCP]
1682 * <pru_send>:ENETDOWN[TCP]
1683 * <pru_send>:ENOMEM[TCP]
1684 * <pru_send>:ENOBUFS[TCP]
1685 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1686 * <pru_send>:EINVAL[AF_UNIX]
1687 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1688 * <pru_send>:EPIPE[AF_UNIX]
1689 * <pru_send>:ENOTCONN[AF_UNIX]
1690 * <pru_send>:EISCONN[AF_UNIX]
1691 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1692 * <sf_data_out>:??? [whatever a filter author chooses]
1693 *
1694 * Notes: Other <pru_send> returns depend on the protocol family; all
1695 * <sf_data_out> returns depend on what the filter author causes
1696 * their filter to return.
1c79356b
A
1697 */
1698int
2d21ac55
A
1699sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1700 struct mbuf *top, struct mbuf *control, int flags)
1701{
1702 struct mbuf **mp;
1703 struct mbuf *m, *freelist = NULL;
1704 user_ssize_t space, len, resid;
91447636 1705 int clen = 0, error, dontroute, mlen, sendflags;
1c79356b 1706 int atomic = sosendallatonce(so) || top;
91447636 1707 int sblocked = 0;
1c79356b 1708 struct proc *p = current_proc();
39236c6e 1709 struct mbuf *control_copy = NULL;
1c79356b 1710
39236c6e 1711 if (uio != NULL)
91447636 1712 resid = uio_resid(uio);
39236c6e 1713 else
1c79356b 1714 resid = top->m_pkthdr.len;
39236c6e 1715
1716 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1717 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1c79356b 1718
91447636 1719 socket_lock(so, 1);
6d2010ae 1720 so_update_last_owner_locked(so, p);
1721 so_update_policy(so);
1722
1723 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1724 error = EOPNOTSUPP;
1725 socket_unlock(so, 1);
1726 goto out;
1727 }
91447636 1728
1729 /*
1730 * In theory resid should be unsigned.
1731 * However, space must be signed, as it might be less than 0
1732 * if we over-committed, and we must use a signed comparison
1733 * of space and resid. On the other hand, a negative resid
1734 * causes us to loop sending 0-length segments to the protocol.
1735 *
1736 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1737 * But it will be used by sockets doing message delivery.
1738 *
1739 * Note: We limit resid to be a positive 32 bits value as we use
1740 * imin() to set bytes_to_copy -- radr://14558484
1c79356b 1741 */
1742 if ((int32_t)resid < 0 || (so->so_type == SOCK_STREAM &&
1743 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1c79356b 1744 error = EINVAL;
91447636 1745 socket_unlock(so, 1);
1746 goto out;
1747 }
1748
1749 dontroute = (flags & MSG_DONTROUTE) &&
1750 (so->so_options & SO_DONTROUTE) == 0 &&
1c79356b 1751 (so->so_proto->pr_flags & PR_ATOMIC);
b0d623f7 1752 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1753
1754 if (control != NULL)
1c79356b 1755 clen = control->m_len;
1c79356b 1756
1c79356b 1757 do {
2d21ac55 1758 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1759 &sblocked, control);
1760 if (error)
3a60a9f5 1761 goto release;
39236c6e 1762
1c79356b 1763 mp = &top;
1764 if (so->so_flags & SOF_ENABLE_MSGS)
1765 space = msgq_sbspace(so, control);
1766 else
1767 space = sbspace(&so->so_snd) - clen;
1768 space += ((flags & MSG_OOB) ? 1024 : 0);
fa4905b1 1769
1c79356b 1770 do {
2d21ac55 1771 if (uio == NULL) {
1772 /*
1773 * Data is prepackaged in "top".
1774 */
1775 resid = 0;
1776 if (flags & MSG_EOR)
1777 top->m_flags |= M_EOR;
91447636 1778 } else {
1779 int chainlength;
1780 int bytes_to_copy;
1781 boolean_t jumbocl;
1782
b0d623f7 1783 bytes_to_copy = imin(resid, space);
2d21ac55 1784
39236c6e 1785 if (sosendminchain > 0)
91447636 1786 chainlength = 0;
39236c6e 1787 else
91447636 1788 chainlength = sosendmaxchain;
1789
1790 /*
1791 * Attempt to use larger than system page-size
1792 * clusters for large writes only if there is
1793 * a jumbo cluster pool and if the socket is
1794 * marked accordingly.
1795 */
1796 jumbocl = sosendjcl && njcl > 0 &&
1797 ((so->so_flags & SOF_MULTIPAGES) ||
1798 sosendjcl_ignore_capab);
1799
91447636 1800 socket_unlock(so, 0);
2d21ac55 1801
1802 do {
1803 int num_needed;
39236c6e 1804 int hdrs_needed = (top == NULL) ? 1 : 0;
2d21ac55 1805
91447636 1806 /*
1807 * Try to maintain a local cache of mbuf
1808 * clusters needed to complete this
1809 * write. The list is further limited to
1810 * the number that are currently needed
1811 * to fill the socket. This mechanism
1812 * allows a large number of mbufs/
1813 * clusters to be grabbed under a single
1814 * mbuf lock... if we can't get any
1815 * clusters, then fall back to trying
1816 * for mbufs. If we fail early (or
1817 * miscalculate the number needed), make
1818 * sure to release any clusters we
1819 * haven't yet consumed.
91447636 1820 */
2d21ac55 1821 if (freelist == NULL &&
1822 bytes_to_copy > MBIGCLBYTES &&
1823 jumbocl) {
1824 num_needed =
1825 bytes_to_copy / M16KCLBYTES;
1826
1827 if ((bytes_to_copy -
1828 (num_needed * M16KCLBYTES))
1829 >= MINCLSIZE)
1830 num_needed++;
91447636 1831
1832 freelist =
1833 m_getpackets_internal(
1834 (unsigned int *)&num_needed,
1835 hdrs_needed, M_WAIT, 0,
1836 M16KCLBYTES);
1837 /*
1838 * Fall back to 4K cluster size
1839 * if allocation failed
1840 */
1841 }
1842
1843 if (freelist == NULL &&
1844 bytes_to_copy > MCLBYTES) {
1845 num_needed =
6d2010ae 1846 bytes_to_copy / MBIGCLBYTES;
1847
1848 if ((bytes_to_copy -
6d2010ae 1849 (num_needed * MBIGCLBYTES)) >=
2d21ac55 1850 MINCLSIZE)
91447636 1851 num_needed++;
1852
1853 freelist =
1854 m_getpackets_internal(
1855 (unsigned int *)&num_needed,
1856 hdrs_needed, M_WAIT, 0,
6d2010ae 1857 MBIGCLBYTES);
1858 /*
1859 * Fall back to cluster size
1860 * if allocation failed
1861 */
91447636 1862 }
1863
1864 if (freelist == NULL &&
1865 bytes_to_copy > MINCLSIZE) {
1866 num_needed =
1867 bytes_to_copy / MCLBYTES;
1868
1869 if ((bytes_to_copy -
1870 (num_needed * MCLBYTES)) >=
1871 MINCLSIZE)
91447636 1872 num_needed++;
1873
1874 freelist =
1875 m_getpackets_internal(
1876 (unsigned int *)&num_needed,
1877 hdrs_needed, M_WAIT, 0,
1878 MCLBYTES);
1879 /*
1880 * Fall back to a single mbuf
1881 * if allocation failed
1882 */
91447636 1883 }
2d21ac55 1884
91447636 1885 if (freelist == NULL) {
39236c6e 1886 if (top == NULL)
1887 MGETHDR(freelist,
1888 M_WAIT, MT_DATA);
91447636 1889 else
1890 MGET(freelist,
1891 M_WAIT, MT_DATA);
1892
1893 if (freelist == NULL) {
1894 error = ENOBUFS;
1895 socket_lock(so, 0);
3a60a9f5 1896 goto release;
1897 }
1898 /*
1899 * For datagram protocols,
1900 * leave room for protocol
1901 * headers in first mbuf.
91447636 1902 */
39236c6e 1903 if (atomic && top == NULL &&
1904 bytes_to_copy < MHLEN) {
1905 MH_ALIGN(freelist,
1906 bytes_to_copy);
1907 }
1908 }
1909 m = freelist;
1910 freelist = m->m_next;
1911 m->m_next = NULL;
2d21ac55 1912
1913 if ((m->m_flags & M_EXT))
1914 mlen = m->m_ext.ext_size;
1915 else if ((m->m_flags & M_PKTHDR))
1916 mlen =
1917 MHLEN - m_leadingspace(m);
1918 else
1919 mlen = MLEN;
b0d623f7 1920 len = imin(mlen, bytes_to_copy);
1921
1922 chainlength += len;
2d21ac55 1923
91447636 1924 space -= len;
fa4905b1 1925
2d21ac55 1926 error = uiomove(mtod(m, caddr_t),
b0d623f7 1927 len, uio);
2d21ac55 1928
91447636 1929 resid = uio_resid(uio);
2d21ac55 1930
1931 m->m_len = len;
1932 *mp = m;
1933 top->m_pkthdr.len += len;
2d21ac55 1934 if (error)
1935 break;
1936 mp = &m->m_next;
1937 if (resid <= 0) {
1938 if (flags & MSG_EOR)
1939 top->m_flags |= M_EOR;
1940 break;
1941 }
1942 bytes_to_copy = min(resid, space);
1943
1944 } while (space > 0 &&
1945 (chainlength < sosendmaxchain || atomic ||
1946 resid < MINCLSIZE));
1947
91447636 1948 socket_lock(so, 0);
2d21ac55 1949
1950 if (error)
1951 goto release;
1952 }
1953
1954 if (flags & (MSG_HOLD|MSG_SEND)) {
3a60a9f5 1955 /* Enqueue for later, go away if HOLD */
39236c6e 1956 struct mbuf *mb1;
2d21ac55 1957 if (so->so_temp && (flags & MSG_FLUSH)) {
1958 m_freem(so->so_temp);
1959 so->so_temp = NULL;
1960 }
1961 if (so->so_temp)
1962 so->so_tail->m_next = top;
1963 else
1964 so->so_temp = top;
1965 mb1 = top;
1966 while (mb1->m_next)
2d21ac55 1967 mb1 = mb1->m_next;
3a60a9f5 1968 so->so_tail = mb1;
2d21ac55 1969 if (flags & MSG_HOLD) {
1970 top = NULL;
1971 goto release;
1972 }
1973 top = so->so_temp;
1974 }
1975 if (dontroute)
1976 so->so_options |= SO_DONTROUTE;
1977
1978 /* Compute flags here, for pru_send and NKEs */
1979 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1980 /*
1981 * If the user set MSG_EOF, the protocol
1982 * understands this flag, and there is nothing
1983 * left to send, then use PRU_SEND_EOF instead of PRU_SEND.
1984 */
1985 ((flags & MSG_EOF) &&
1986 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1987 (resid <= 0)) ? PRUS_EOF :
1988 /* If there is more to send set PRUS_MORETOCOME */
1989 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2d21ac55 1990
1991 /*
1992 * Socket filter processing
1993 */
1994 error = sflt_data_out(so, addr, &top,
1995 &control, (sendflags & MSG_OOB) ?
1996 sock_data_filt_flag_oob : 0);
1997 if (error) {
1998 if (error == EJUSTRETURN) {
1999 error = 0;
2000 clen = 0;
2001 control = NULL;
2002 top = NULL;
91447636 2003 }
2d21ac55 2004
6d2010ae 2005 goto release;
1c79356b 2006 }
2007 /*
2008 * End Socket filter processing
2009 */
2d21ac55 2010
2011 if (so->so_flags & SOF_ENABLE_MSGS) {
2012 /*
2013 * Make a copy of control mbuf,
2014 * so that msg priority can be
2015 * passed to subsequent mbufs.
2016 */
2017 control_copy = m_dup(control, M_NOWAIT);
2018 }
6d2010ae 2019 error = (*so->so_proto->pr_usrreqs->pru_send)
2020 (so, sendflags, top, addr, control, p);
2021
2022 if (flags & MSG_SEND)
2023 so->so_temp = NULL;
39236c6e 2024
2025 if (dontroute)
2026 so->so_options &= ~SO_DONTROUTE;
2027
2028 clen = 0;
2029 control = control_copy;
2030 control_copy = NULL;
2031 top = NULL;
2032 mp = &top;
2033 if (error)
2034 goto release;
2035 } while (resid && space > 0);
2036 } while (resid);
2037
2038release:
3a60a9f5 2039 if (sblocked)
39236c6e 2040 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2041 else
2042 socket_unlock(so, 1);
1c79356b 2043out:
39236c6e 2044 if (top != NULL)
1c79356b 2045 m_freem(top);
39236c6e 2046 if (control != NULL)
1c79356b 2047 m_freem(control);
39236c6e 2048 if (freelist != NULL)
2d21ac55 2049 m_freem_list(freelist);
2050 if (control_copy != NULL)
2051 m_freem(control_copy);
1c79356b 2052
2053 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
2054 space, error);
2055
2056 return (error);
2057}
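/*
 * Editor's sketch (not part of xnu): how the sosend() contract above
 * looks from userspace.  On a datagram socket, sosendallatonce() makes
 * the send atomic, so a message larger than the send buffer high-water
 * mark fails with EMSGSIZE rather than being sent piecemeal; on a
 * nonblocking socket (SS_NBIO), a full buffer returns EWOULDBLOCK
 * instead of sleeping in sbwait().  The descriptor `fd` is assumed to
 * be a connected, nonblocking UDP socket.
 */
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t
send_datagram(int fd, const void *buf, size_t len)
{
	ssize_t n;

	do {
		n = send(fd, buf, len, 0);	/* enters sosend() */
	} while (n == -1 && errno == EINTR);	/* sbwait() interrupted */

	if (n == -1 && errno == EWOULDBLOCK)
		return (0);	/* sosendcheck() found no buffer space */
	return (n);
}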
2058
2059/*
2060 * Implement receive operations on a socket.
2061 * We depend on the way that records are added to the sockbuf
2062 * by sbappend*. In particular, each record (mbufs linked through m_next)
2063 * must begin with an address if the protocol so specifies,
2064 * followed by an optional mbuf or mbufs containing ancillary data,
2065 * and then zero or more mbufs of data.
2066 * In order to avoid blocking network interrupts for the entire time here,
2067 * we splx() while doing the actual copy to user space.
2068 * Although the sockbuf is locked, new data may still be appended,
2069 * and thus we must maintain consistency of the sockbuf during that time.
2070 *
2071 * The caller may receive the data as a single mbuf chain by supplying
2072 * an mbuf **mp0 for use in returning the chain. The uio is then used
2073 * only for the count in uio_resid.
2074 *
2075 * Returns: 0 Success
2076 * ENOBUFS
2077 * ENOTCONN
2078 * EWOULDBLOCK
2079 * uiomove:EFAULT
2080 * sblock:EWOULDBLOCK
2081 * sblock:EINTR
2082 * sbwait:EBADF
2083 * sbwait:EINTR
2084 * sodelayed_copy:EFAULT
2085 * <pru_rcvoob>:EINVAL[TCP]
2086 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2087 * <pru_rcvoob>:???
2088 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2089 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2090 * <pr_domain->dom_externalize>:???
2091 *
2092 * Notes: Additional return values from calls through <pru_rcvoob> and
2093 * <pr_domain->dom_externalize> depend on protocols other than
2094 * TCP or AF_UNIX, which are documented above.
2095 */
2096int
2097soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2098 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1c79356b 2099{
2100 struct mbuf *m, **mp, *ml = NULL;
2101 struct mbuf *nextrecord, *free_list;
2102 int flags, error, offset;
2103 user_ssize_t len;
1c79356b 2104 struct protosw *pr = so->so_proto;
2105 int moff, type = 0;
2106 user_ssize_t orig_resid = uio_resid(uio);
2107 user_ssize_t delayed_copy_len;
2108 int can_delay;
2109 int need_event;
2110 struct proc *p = current_proc();
2111
2112 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
2113 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
1c79356b 2114
91447636 2115 socket_lock(so, 1);
6d2010ae 2116 so_update_last_owner_locked(so, p);
39236c6e 2117 so_update_policy(so);
1c79356b 2118
91447636 2119#ifdef MORE_LOCKING_DEBUG
2120 if (so->so_usecount == 1) {
2121 panic("%s: so=%x no other reference on socket\n", __func__, so);
2122 /* NOTREACHED */
2123 }
91447636 2124#endif
1c79356b 2125 mp = mp0;
2126 if (psa != NULL)
2127 *psa = NULL;
2128 if (controlp != NULL)
2129 *controlp = NULL;
2130 if (flagsp != NULL)
2131 flags = *flagsp &~ MSG_EOR;
2132 else
2133 flags = 0;
2134
2135 /*
2136 * If a recv attempt is made on a previously-accepted socket
2137 * that has been marked as inactive (disconnected), reject
2138 * the request.
2139 */
2140 if (so->so_flags & SOF_DEFUNCT) {
2141 struct sockbuf *sb = &so->so_rcv;
2142
6d2010ae 2143 error = ENOTCONN;
2144 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2145 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
2146 SOCK_DOM(so), SOCK_TYPE(so), error));
2147 /*
2148 * This socket should have been disconnected and flushed
2149 * prior to being returned from sodefunct(); there should
2150 * be no data on its receive list, so panic otherwise.
2d21ac55 2151 */
2152 if (so->so_state & SS_DEFUNCT)
2153 sb_empty_assert(sb, __func__);
2d21ac55 2154 socket_unlock(so, 1);
6d2010ae 2155 return (error);
2156 }
2157
2158 /*
2159 * When SO_WANTOOBFLAG is set we try to get out-of-band data
2160 * regardless of the flags argument. Here is the case where
2161 * out-of-band data is not inline.
2162 */
2163 if ((flags & MSG_OOB) ||
2164 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2165 (so->so_options & SO_OOBINLINE) == 0 &&
2166 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
1c79356b 2167 m = m_get(M_WAIT, MT_DATA);
55e303ae 2168 if (m == NULL) {
91447636 2169 socket_unlock(so, 1);
2170 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
2171 ENOBUFS, 0, 0, 0, 0);
9bccf70c 2172 return (ENOBUFS);
55e303ae 2173 }
2174 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
2175 if (error)
2176 goto bad;
91447636 2177 socket_unlock(so, 0);
2178 do {
2179 error = uiomove(mtod(m, caddr_t),
b0d623f7 2180 imin(uio_resid(uio), m->m_len), uio);
1c79356b 2181 m = m_free(m);
39236c6e 2182 } while (uio_resid(uio) && error == 0 && m != NULL);
91447636 2183 socket_lock(so, 0);
1c79356b 2184bad:
39236c6e 2185 if (m != NULL)
1c79356b 2186 m_freem(m);
39236c6e 2187
2188 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
2189 if (error == EWOULDBLOCK || error == EINVAL) {
2d21ac55 2190 /*
9bccf70c 2191 * Let's try to get normal data:
2192 * EWOULDBLOCK: out-of-band data not
2193 * received yet. EINVAL: out-of-band data
2194 * already read.
2195 */
2196 error = 0;
2197 goto nooob;
39236c6e 2198 } else if (error == 0 && flagsp != NULL) {
9bccf70c 2199 *flagsp |= MSG_OOB;
2200 }
2201 }
91447636 2202 socket_unlock(so, 1);
2203 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2204 0, 0, 0, 0);
39236c6e 2205
2206 return (error);
2207 }
2208nooob:
2209 if (mp != NULL)
2210 *mp = NULL;
91447636 2211 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
2212 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
2213
39236c6e 2214 free_list = NULL;
55e303ae 2215 delayed_copy_len = 0;
1c79356b 2216restart:
2217#ifdef MORE_LOCKING_DEBUG
2218 if (so->so_usecount <= 1)
2219 printf("soreceive: sblock so=%p ref=%d on socket\n",
2220 so, so->so_usecount);
91447636 2221#endif
2222 /*
2223 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2224 * and if so just return to the caller. This could happen when
2225 * soreceive() is called by a socket upcall function during the
2226 * time the socket is freed. The socket buffer would have been
2227 * locked across the upcall, therefore we cannot put this thread
2228 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2229 * we may livelock), because the lock on the socket buffer will
2230 * only be released when the upcall routine returns to its caller.
2231 * Because the socket has been officially closed, there can be
2232 * no further read on it.
2233 *
2234 * A multipath subflow socket would have its SS_NOFDREF set by
2235 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2236 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2237 */
2238 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
39236c6e 2239 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2240 socket_unlock(so, 1);
2241 return (0);
2242 }
2243
2244 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2245 if (error) {
91447636 2246 socket_unlock(so, 1);
2247 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2248 0, 0, 0, 0);
2249 return (error);
2250 }
2251
2252 m = so->so_rcv.sb_mb;
2253 /*
2254 * If we have less data than requested, block awaiting more
2255 * (subject to any timeout) if:
2256 * 1. the current count is less than the low water mark, or
2257 * 2. MSG_WAITALL is set, and it is possible to do the entire
2258 * receive operation at once if we block (resid <= hiwat).
2259 * 3. MSG_DONTWAIT is not set
2260 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2261 * we have to do the receive in sections, and thus risk returning
2262 * a short count if a timeout or signal occurs after we start.
2263 */
39236c6e 2264 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
91447636 2265 so->so_rcv.sb_cc < uio_resid(uio)) &&
2d21ac55 2266 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
91447636 2267 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
39236c6e 2268 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2269 /*
2270 * Panic if we notice inconsistencies in the socket's
2271 * receive list; both sb_mb and sb_cc should correctly
2272 * reflect the contents of the list, otherwise we may
2273 * end up with false positives during select() or poll()
2274 * which could put the application in a bad state.
2275 */
316670eb 2276 SB_MB_CHECK(&so->so_rcv);
55e303ae 2277
1c79356b 2278 if (so->so_error) {
39236c6e 2279 if (m != NULL)
2280 goto dontblock;
2281 error = so->so_error;
2282 if ((flags & MSG_PEEK) == 0)
2283 so->so_error = 0;
2284 goto release;
2285 }
2286 if (so->so_state & SS_CANTRCVMORE) {
39236c6e 2287 if (m != NULL)
2288 goto dontblock;
2289 else
2290 goto release;
2291 }
39236c6e 2292 for (; m != NULL; m = m->m_next)
2d21ac55 2293 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2294 m = so->so_rcv.sb_mb;
2295 goto dontblock;
2296 }
2297 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2298 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2299 error = ENOTCONN;
2300 goto release;
2301 }
91447636 2302 if (uio_resid(uio) == 0)
1c79356b 2303 goto release;
2304 if ((so->so_state & SS_NBIO) ||
2305 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2306 error = EWOULDBLOCK;
2307 goto release;
2308 }
2309 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2310 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
39236c6e 2311 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
2d21ac55 2312#if EVEN_MORE_LOCKING_DEBUG
1c79356b 2313 if (socket_debug)
2d21ac55 2314 printf("Waiting for socket data\n");
91447636 2315#endif
55e303ae 2316
1c79356b 2317 error = sbwait(&so->so_rcv);
2d21ac55 2318#if EVEN_MORE_LOCKING_DEBUG
1c79356b 2319 if (socket_debug)
2d21ac55 2320 printf("SORECEIVE - sbwait returned %d\n", error);
91447636 2321#endif
2322 if (so->so_usecount < 1) {
2323 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
2324 __func__, so, so->so_usecount);
2325 /* NOTREACHED */
2326 }
9bccf70c 2327 if (error) {
91447636 2328 socket_unlock(so, 1);
2329 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2330 0, 0, 0, 0);
2331 return (error);
2332 }
2333 goto restart;
2334 }
2335dontblock:
b0d623f7 2336 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2337 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2338 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2339 nextrecord = m->m_nextpkt;
2340 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2341 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2342#if CONFIG_MACF_SOCKET_SUBSET
2343 /*
2344 * Call the MAC framework for policy checking if we're in
2345 * the user process context and the socket isn't connected.
2346 */
2347 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2348 struct mbuf *m0 = m;
2349 /*
2350 * Dequeue this record (temporarily) from the receive
2351 * list since we're about to drop the socket's lock
2352 * where a new record may arrive and be appended to
2353 * the list. Upon MAC policy failure, the record
2354 * will be freed. Otherwise, we'll add it back to
2355 * the head of the list. We cannot rely on SB_LOCK
2356 * because append operation uses the socket's lock.
2357 */
2358 do {
2359 m->m_nextpkt = NULL;
2360 sbfree(&so->so_rcv, m);
2361 m = m->m_next;
2362 } while (m != NULL);
2363 m = m0;
2364 so->so_rcv.sb_mb = nextrecord;
2365 SB_EMPTY_FIXUP(&so->so_rcv);
2366 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2367 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2368 socket_unlock(so, 0);
2369 if (mac_socket_check_received(proc_ucred(p), so,
2370 mtod(m, struct sockaddr *)) != 0) {
2371 /*
2372 * MAC policy failure; free this record and
2373 * process the next record (or block until
2374 * one is available). We have adjusted sb_cc
2375 * and sb_mbcnt above so there is no need to
2376 * call sbfree() again.
2377 */
2378 do {
2379 m = m_free(m);
2380 } while (m != NULL);
2381 /*
2382 * Clear SB_LOCK but don't unlock the socket.
2383 * Process the next record or wait for one.
2384 */
2385 socket_lock(so, 0);
39236c6e 2386 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2387 goto restart;
2388 }
2389 socket_lock(so, 0);
2390 /*
2391 * If the socket has been defunct'd, drop it.
2392 */
2393 if (so->so_flags & SOF_DEFUNCT) {
2394 m_freem(m);
2395 error = ENOTCONN;
2396 goto release;
2397 }
2398 /*
2399 * Re-adjust the socket receive list and re-enqueue
2400 * the record in front of any packets which may have
2401 * been appended while we dropped the lock.
2402 */
2403 for (m = m0; m->m_next != NULL; m = m->m_next)
2404 sballoc(&so->so_rcv, m);
2405 sballoc(&so->so_rcv, m);
2406 if (so->so_rcv.sb_mb == NULL) {
2407 so->so_rcv.sb_lastrecord = m0;
2408 so->so_rcv.sb_mbtail = m;
2409 }
2410 m = m0;
2411 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2412 so->so_rcv.sb_mb = m;
2413 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2414 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2415 }
2416#endif /* CONFIG_MACF_SOCKET_SUBSET */
1c79356b 2417 orig_resid = 0;
39236c6e 2418 if (psa != NULL) {
1c79356b 2419 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
2420 mp0 == NULL);
2421 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2422 error = EWOULDBLOCK;
2423 goto release;
2424 }
2425 }
2426 if (flags & MSG_PEEK) {
2427 m = m->m_next;
2428 } else {
2429 sbfree(&so->so_rcv, m);
2430 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2431 panic("%s: about to create invalid socketbuf",
2432 __func__);
2433 /* NOTREACHED */
2434 }
2435 MFREE(m, so->so_rcv.sb_mb);
2436 m = so->so_rcv.sb_mb;
2437 if (m != NULL) {
2438 m->m_nextpkt = nextrecord;
2439 } else {
2440 so->so_rcv.sb_mb = nextrecord;
2441 SB_EMPTY_FIXUP(&so->so_rcv);
2442 }
2443 }
2444 }
2445
2446 /*
2447 * Process one or more MT_CONTROL mbufs present before any data mbufs
2448 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2449 * just copy the data; if !MSG_PEEK, we call into the protocol to
2450 * perform externalization.
2451 */
2452 if (m != NULL && m->m_type == MT_CONTROL) {
2453 struct mbuf *cm = NULL, *cmn;
2454 struct mbuf **cme = &cm;
2455 struct sockbuf *sb_rcv = &so->so_rcv;
6d2010ae 2456 struct mbuf **msgpcm = NULL;
2457
2458 /*
2459 * Externalizing the control messages would require us to
2460 * drop the socket's lock below. Once we re-acquire the
2461 * lock, the mbuf chain might change. In order to preserve
2462 * consistency, we unlink all control messages from the
2463 * first mbuf chain in one shot and link them separately
2464 * onto a different chain.
2465 */
2466 do {
2467 if (flags & MSG_PEEK) {
2468 if (controlp != NULL) {
2469 if (*controlp == NULL) {
2470 msgpcm = controlp;
2471 }
2d21ac55 2472 *controlp = m_copy(m, 0, m->m_len);
6d2010ae 2473
2474 /*
2475 * If we failed to allocate an mbuf,
6d2010ae 2476 * release any previously allocated
39236c6e 2477 * mbufs for control data. Return
6d2010ae 2478 * an error. Keep the mbufs in the
39236c6e 2479 * socket as this is using
2480 * MSG_PEEK flag.
2481 */
2482 if (*controlp == NULL) {
2483 m_freem(*msgpcm);
2484 error = ENOBUFS;
2485 goto release;
2486 }
2d21ac55 2487 controlp = &(*controlp)->m_next;
91447636 2488 }
2d21ac55 2489 m = m->m_next;
1c79356b 2490 } else {
2491 m->m_nextpkt = NULL;
2492 sbfree(sb_rcv, m);
2493 sb_rcv->sb_mb = m->m_next;
2494 m->m_next = NULL;
2495 *cme = m;
2496 cme = &(*cme)->m_next;
2497 m = sb_rcv->sb_mb;
2498 }
2499 } while (m != NULL && m->m_type == MT_CONTROL);
2500
2501 if (!(flags & MSG_PEEK)) {
2502 if (sb_rcv->sb_mb != NULL) {
2503 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2504 } else {
2505 sb_rcv->sb_mb = nextrecord;
2506 SB_EMPTY_FIXUP(sb_rcv);
1c79356b 2507 }
2508 if (nextrecord == NULL)
2509 sb_rcv->sb_lastrecord = m;
1c79356b 2510 }
2511
2512 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2513 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2514
2515 while (cm != NULL) {
2516 int cmsg_type;
2517
2518 cmn = cm->m_next;
2519 cm->m_next = NULL;
2520 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2521
2522 /*
2523 * Call the protocol to externalize SCM_RIGHTS message
2524 * and return the modified message to the caller upon
2525 * success. Otherwise, all other control messages are
2526 * returned unmodified to the caller. Note that we
2527 * only get into this loop if MSG_PEEK is not set.
2528 */
2529 if (pr->pr_domain->dom_externalize != NULL &&
2530 cmsg_type == SCM_RIGHTS) {
2531 /*
2532 * Release socket lock: see 3903171. This
2533 * would also allow more records to be appended
2534 * to the socket buffer. We still have SB_LOCK
2535 * set on it, so we can be sure that the head
2536 * of the mbuf chain won't change.
2537 */
2538 socket_unlock(so, 0);
2539 error = (*pr->pr_domain->dom_externalize)(cm);
2540 socket_lock(so, 0);
2541 } else {
2542 error = 0;
2543 }
2544
2545 if (controlp != NULL && error == 0) {
2546 *controlp = cm;
2547 controlp = &(*controlp)->m_next;
2548 orig_resid = 0;
2549 } else {
2550 (void) m_free(cm);
2551 }
2552 cm = cmn;
1c79356b 2553 }
39236c6e 2554 /*
316670eb 2555 * Update the value of nextrecord in case we received new
39236c6e 2556 * records when the socket was unlocked above for
2557 * externalizing SCM_RIGHTS.
2558 */
2559 if (m != NULL)
2560 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2561 else
2562 nextrecord = sb_rcv->sb_mb;
2563 orig_resid = 0;
1c79356b 2564 }
2d21ac55 2565
2566 /*
2567 * If the socket is a TCP socket with message delivery
2568 * enabled, then create a control msg to deliver the
2569 * relative TCP sequence number for this data. Waiting
2570 * until this point will protect against failures to
2571 * allocate an mbuf for control msgs.
2572 */
2573 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
2574 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
2575 struct mbuf *seq_cm;
2576
2577 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
2578 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
2579 if (seq_cm == NULL) {
2580 /* unable to allocate a control mbuf */
2581 error = ENOBUFS;
2582 goto release;
2583 }
2584 *controlp = seq_cm;
2585 controlp = &seq_cm->m_next;
2586 }
2587
2588 if (m != NULL) {
2589 if (!(flags & MSG_PEEK)) {
2590 /*
2591 * We get here because m points to an mbuf following
2592 * any MT_SONAME or MT_CONTROL mbufs which have been
2593 * processed above. In any case, m should be pointing
2594 * to the head of the mbuf chain, and the nextrecord
2595 * should be either NULL or equal to m->m_nextpkt.
2596 * See comments above about SB_LOCK.
2597 */
2598 if (m != so->so_rcv.sb_mb ||
2599 m->m_nextpkt != nextrecord) {
2600 panic("%s: post-control !sync so=%p m=%p "
2601 "nextrecord=%p\n", __func__, so, m,
2602 nextrecord);
2603 /* NOTREACHED */
2604 }
2605 if (nextrecord == NULL)
2606 so->so_rcv.sb_lastrecord = m;
2607 }
2608 type = m->m_type;
2609 if (type == MT_OOBDATA)
2610 flags |= MSG_OOB;
2611 } else {
2612 if (!(flags & MSG_PEEK)) {
2613 SB_EMPTY_FIXUP(&so->so_rcv);
2614 }
1c79356b 2615 }
2616 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
2617 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
2618
2619 moff = 0;
2620 offset = 0;
fa4905b1 2621
91447636 2622 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2d21ac55 2623 can_delay = 1;
55e303ae 2624 else
2d21ac55 2625 can_delay = 0;
2626
2627 need_event = 0;
fa4905b1 2628
2629 while (m != NULL &&
2630 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
2631 if (m->m_type == MT_OOBDATA) {
2632 if (type != MT_OOBDATA)
2633 break;
2d21ac55 2634 } else if (type == MT_OOBDATA) {
1c79356b 2635 break;
2d21ac55 2636 }
9bccf70c 2637 /*
2d21ac55 2638 * Make sure to always set the MSG_OOB event when getting
2639 * out of band data inline.
2640 */
1c79356b 2641 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2642 (so->so_options & SO_OOBINLINE) != 0 &&
2643 (so->so_state & SS_RCVATMARK) != 0) {
2644 flags |= MSG_OOB;
2645 }
1c79356b 2646 so->so_state &= ~SS_RCVATMARK;
91447636 2647 len = uio_resid(uio) - delayed_copy_len;
2648 if (so->so_oobmark && len > so->so_oobmark - offset)
2649 len = so->so_oobmark - offset;
2650 if (len > m->m_len - moff)
2651 len = m->m_len - moff;
2652 /*
2653 * If mp is set, just pass back the mbufs.
2654 * Otherwise copy them out via the uio, then free.
2655 * Sockbuf must be consistent here (points to current mbuf,
2656 * it points to next record) when we drop priority;
2657 * we must note any additions to the sockbuf when we
2658 * block interrupts again.
2659 */
39236c6e 2660 if (mp == NULL) {
2661 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
2662 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
55e303ae 2663 if (can_delay && len == m->m_len) {
2d21ac55 2664 /*
2665 * Only delay the copy if we're consuming the
2666 * mbuf and we're NOT in MSG_PEEK mode
2667 * and we have enough data to make it worthwhile
2668 * to drop and retake the lock... can_delay
2669 * reflects the state of the 2 latter
2670 * constraints; moff should always be zero
2671 * in these cases.
55e303ae 2672 */
2d21ac55 2673 delayed_copy_len += len;
55e303ae 2674 } else {
2675 if (delayed_copy_len) {
2676 error = sodelayed_copy(so, uio,
2677 &free_list, &delayed_copy_len);
2678
2679 if (error) {
2680 goto release;
2681 }
2682 /*
2683 * We can only get here if MSG_PEEK is not
2684 * set; therefore, m should point at the
2685 * head of the rcv queue. If it doesn't,
2686 * it means something drastically
2687 * changed while we were out from behind
2688 * the lock in sodelayed_copy, perhaps
2689 * a RST on the stream. In any event,
2690 * the stream has been interrupted. It's
2691 * probably best just to return whatever
2692 * data we've moved and let the caller
2693 * sort it out...
2694 */
55e303ae 2695 if (m != so->so_rcv.sb_mb) {
2d21ac55 2696 break;
2697 }
2698 }
91447636 2699 socket_unlock(so, 0);
2700 error = uiomove(mtod(m, caddr_t) + moff,
2701 (int)len, uio);
91447636 2702 socket_lock(so, 0);
55e303ae 2703
55e303ae 2704 if (error)
2d21ac55 2705 goto release;
55e303ae 2706 }
2d21ac55 2707 } else {
91447636 2708 uio_setresid(uio, (uio_resid(uio) - len));
2d21ac55 2709 }
2710 if (len == m->m_len - moff) {
2711 if (m->m_flags & M_EOR)
2712 flags |= MSG_EOR;
2713 if (flags & MSG_PEEK) {
2714 m = m->m_next;
2715 moff = 0;
2716 } else {
2717 nextrecord = m->m_nextpkt;
2718 sbfree(&so->so_rcv, m);
91447636 2719 m->m_nextpkt = NULL;
55e303ae 2720
2721 /*
2722 * If this packet is an unordered packet
2723 * (indicated by M_UNORDERED_DATA flag), remove
2724 * the additional bytes added to the
2725 * receive socket buffer size.
2726 */
2727 if ((so->so_flags & SOF_ENABLE_MSGS) &&
2728 m->m_len &&
2729 (m->m_flags & M_UNORDERED_DATA) &&
2730 sbreserve(&so->so_rcv,
2731 so->so_rcv.sb_hiwat - m->m_len)) {
2732 if (so->so_msg_state->msg_uno_bytes >
2733 m->m_len) {
2734 so->so_msg_state->
2735 msg_uno_bytes -= m->m_len;
2736 } else {
2737 so->so_msg_state->
2738 msg_uno_bytes = 0;
2739 }
2740 m->m_flags &= ~M_UNORDERED_DATA;
2741 }
2742
2743 if (mp != NULL) {
2744 *mp = m;
2745 mp = &m->m_next;
2746 so->so_rcv.sb_mb = m = m->m_next;
39236c6e 2747 *mp = NULL;
1c79356b 2748 } else {
55e303ae 2749 if (free_list == NULL)
2750 free_list = m;
2751 else
2752 ml->m_next = m;
2753 ml = m;
14353aa8 2754 so->so_rcv.sb_mb = m = m->m_next;
39236c6e 2755 ml->m_next = NULL;
1c79356b 2756 }
2d21ac55 2757 if (m != NULL) {
1c79356b 2758 m->m_nextpkt = nextrecord;
2759 if (nextrecord == NULL)
2760 so->so_rcv.sb_lastrecord = m;
2761 } else {
2762 so->so_rcv.sb_mb = nextrecord;
2763 SB_EMPTY_FIXUP(&so->so_rcv);
2764 }
2765 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
2766 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
2767 }
2768 } else {
2d21ac55 2769 if (flags & MSG_PEEK) {
1c79356b 2770 moff += len;
2d21ac55 2771 } else {
2772 if (mp != NULL) {
2773 int copy_flag;
2774
2775 if (flags & MSG_DONTWAIT)
2776 copy_flag = M_DONTWAIT;
2777 else
2778 copy_flag = M_WAIT;
2779 *mp = m_copym(m, 0, len, copy_flag);
2780 /*
2781 * Failed to allocate an mbuf?
2782 * Adjust uio_resid back, it was
2783 * adjusted down by len bytes which
2784 * we didn't copy over.
2785 */
6d2010ae 2786 if (*mp == NULL) {
2787 uio_setresid(uio,
2788 (uio_resid(uio) + len));
2789 break;
2790 }
2791 }
2792 m->m_data += len;
2793 m->m_len -= len;
2794 so->so_rcv.sb_cc -= len;
2795 }
2796 }
2797 if (so->so_oobmark) {
2798 if ((flags & MSG_PEEK) == 0) {
2799 so->so_oobmark -= len;
2800 if (so->so_oobmark == 0) {
2801 so->so_state |= SS_RCVATMARK;
2802 /*
2803 * delay posting the actual event until
2804 * after any delayed copy processing
2805 * has finished
2806 */
2807 need_event = 1;
2808 break;
2809 }
2810 } else {
2811 offset += len;
2812 if (offset == so->so_oobmark)
2813 break;
2814 }
2815 }
2d21ac55 2816 if (flags & MSG_EOR)
2817 break;
2818 /*
2819 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2820 * (for non-atomic socket), we must not quit until
2821 * "uio->uio_resid == 0" or an error termination.
2822 * If a signal/timeout occurs, return with a short
2823 * count but without error. Keep sockbuf locked
2824 * against other readers.
1c79356b 2825 */
39236c6e 2826 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
2d21ac55 2827 (uio_resid(uio) - delayed_copy_len) > 0 &&
2828 !sosendallatonce(so) && !nextrecord) {
2829 if (so->so_error || so->so_state & SS_CANTRCVMORE)
2d21ac55 2830 goto release;
fa4905b1 2831
2832 /*
2833 * Depending on the protocol (e.g. TCP), the following
2834 * might cause the socket lock to be dropped and later
2835 * be reacquired, and more data could have arrived and
2836 * have been appended to the receive socket buffer by
2837 * the time it returns. Therefore, we only sleep in
2838 * sbwait() below if and only if the socket buffer is
2839 * empty, in order to avoid a false sleep.
2840 */
2841 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
2842 (((struct inpcb *)so->so_pcb)->inp_state !=
2843 INPCB_STATE_DEAD))
2844 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2845
2846 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
2847 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
2848
2849 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
2850 error = 0;
55e303ae 2851 goto release;
fa4905b1 2852 }
55e303ae 2853 /*
2854 * We have to wait until after we get back from the sbwait
2855 * to do the copy, because we will drop the lock if we
2856 * have enough data that has been delayed... by dropping
2857 * the lock we open up a window allowing the netisr
2858 * thread to process the incoming packets and to change
2859 * the state of this socket... we're issuing the sbwait
2860 * because the socket is empty and we're expecting the
2861 * netisr thread to wake us up when more packets arrive;
2862 * if we allow that processing to happen and then sbwait
2863 * we could stall forever with packets sitting in the
2864 * socket if no further packets arrive from the remote
2865 * side.
2866 *
2867 * We want to copy before we've collected all the data
2868 * to satisfy this request, to allow the copy to overlap
2869 * the incoming packet processing on an MP system.
2870 */
2d21ac55
A
2871 if (delayed_copy_len > sorecvmincopy &&
2872 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
2873 error = sodelayed_copy(so, uio,
2874 &free_list, &delayed_copy_len);
2875
2876 if (error)
2d21ac55 2877 goto release;
2878 }
2879 m = so->so_rcv.sb_mb;
39236c6e 2880 if (m != NULL) {
1c79356b 2881 nextrecord = m->m_nextpkt;
fa4905b1 2882 }
316670eb 2883 SB_MB_CHECK(&so->so_rcv);
2884 }
2885 }
91447636 2886#ifdef MORE_LOCKING_DEBUG
2887 if (so->so_usecount <= 1) {
2888 panic("%s: after big while so=%p ref=%d on socket\n",
2889 __func__, so, so->so_usecount);
2890 /* NOTREACHED */
2891 }
91447636 2892#endif
1c79356b 2893
39236c6e 2894 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2d21ac55 2895 if (so->so_options & SO_DONTTRUNC) {
1c79356b 2896 flags |= MSG_RCVMORE;
2d21ac55 2897 } else {
9bccf70c 2898 flags |= MSG_TRUNC;
2899 if ((flags & MSG_PEEK) == 0)
2900 (void) sbdroprecord(&so->so_rcv);
2901 }
2902 }
2903
2904 /*
2905 * pru_rcvd below (for TCP) may cause more data to be received
2906 * if the socket lock is dropped prior to sending the ACK; some
2907 * legacy OpenTransport applications don't handle this well
2908 * (if they receive less data than requested while MSG_HAVEMORE
2909 * is set), and so we set the flag now based on what we know
2910 * prior to calling pru_rcvd.
2911 */
2912 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
2913 flags |= MSG_HAVEMORE;
2914
1c79356b 2915 if ((flags & MSG_PEEK) == 0) {
39236c6e 2916 if (m == NULL) {
1c79356b 2917 so->so_rcv.sb_mb = nextrecord;
2918 /*
2919 * First part is an inline SB_EMPTY_FIXUP(). Second
2920 * part makes sure sb_lastrecord is up-to-date if
2921 * there is still data in the socket buffer.
2922 */
2923 if (so->so_rcv.sb_mb == NULL) {
2924 so->so_rcv.sb_mbtail = NULL;
2925 so->so_rcv.sb_lastrecord = NULL;
2926 } else if (nextrecord->m_nextpkt == NULL) {
2927 so->so_rcv.sb_lastrecord = nextrecord;
2928 }
316670eb 2929 SB_MB_CHECK(&so->so_rcv);
2930 }
2931 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
2932 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
2933 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
2934 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2935 }
39236c6e 2936
55e303ae 2937 if (delayed_copy_len) {
91447636 2938 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
55e303ae 2939 if (error)
2d21ac55 2940 goto release;
55e303ae 2941 }
2942 if (free_list != NULL) {
2943 m_freem_list(free_list);
2944 free_list = NULL;
2945 }
2946 if (need_event)
2d21ac55 2947 postevent(so, 0, EV_OOB);
39236c6e 2948
91447636 2949 if (orig_resid == uio_resid(uio) && orig_resid &&
1c79356b 2950 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
39236c6e 2951 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
2952 goto restart;
2953 }
2954
39236c6e 2955 if (flagsp != NULL)
2956 *flagsp |= flags;
2957release:
91447636 2958#ifdef MORE_LOCKING_DEBUG
2959 if (so->so_usecount <= 1) {
2960 panic("%s: release so=%p ref=%d on socket\n", __func__,
2d21ac55 2961 so, so->so_usecount);
2962 /* NOTREACHED */
2963 }
91447636 2964#endif
39236c6e 2965 if (delayed_copy_len)
2d21ac55 2966 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
1c79356b 2967
2968 if (free_list != NULL)
2969 m_freem_list(free_list);
2970
2971 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
2972
2973 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
2974 so->so_rcv.sb_cc, 0, error);
2975
2976 return (error);
2977}
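/*
 * Editor's sketch (not part of xnu): the MSG_WAITALL contract
 * documented above, seen from userspace.  soreceive() keeps blocking
 * in sbwait() until the full request can be satisfied, but a signal,
 * timeout or EOF can still yield a short count, so the caller must
 * check the return value.  `fd` is assumed to be a connected TCP
 * socket.
 */
#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t
recv_exact(int fd, void *buf, size_t len)
{
	ssize_t n;

	do {
		n = recv(fd, buf, len, MSG_WAITALL);
	} while (n == -1 && errno == EINTR);

	return (n);	/* may still be < len on EOF */
}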
2978
2979/*
2980 * Returns: 0 Success
2981 * uiomove:EFAULT
2982 */
2983static int
2984sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
39236c6e 2985 user_ssize_t *resid)
55e303ae 2986{
2d21ac55 2987 int error = 0;
2988 struct mbuf *m;
2989
2990 m = *free_list;
2991
91447636 2992 socket_unlock(so, 0);
55e303ae 2993
39236c6e 2994 while (m != NULL && error == 0) {
2d21ac55 2995 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2996 m = m->m_next;
2997 }
2998 m_freem_list(*free_list);
2999
39236c6e 3000 *free_list = NULL;
3001 *resid = 0;
3002
3003 socket_lock(so, 0);
55e303ae 3004
3005 return (error);
3006}
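/*
 * Editor's note: sodelayed_copy() is the second half of the delayed-
 * copy optimization above -- mbufs are unlinked from the receive
 * buffer while the socket lock is held, and the potentially faulting
 * uiomove() to userspace runs with the lock dropped.  A minimal
 * sketch of the same detach-then-copy pattern follows; every name in
 * it is hypothetical, not xnu API.
 */
#include <pthread.h>
#include <stddef.h>

struct node { struct node *next; char data[64]; };
struct nq { pthread_mutex_t mtx; struct node *head; };

static struct node *
detach_batch(struct nq *q)
{
	struct node *batch;

	pthread_mutex_lock(&q->mtx);
	batch = q->head;		/* O(1) while the lock is held */
	q->head = NULL;
	pthread_mutex_unlock(&q->mtx);
	return (batch);	/* caller copies it out with the lock dropped */
}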
3007
3008/*
3009 * Returns: 0 Success
3010 * EINVAL
3011 * ENOTCONN
3012 * <pru_shutdown>:EINVAL
3013 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
3014 * <pru_shutdown>:ENOBUFS[TCP]
3015 * <pru_shutdown>:EMSGSIZE[TCP]
3016 * <pru_shutdown>:EHOSTUNREACH[TCP]
3017 * <pru_shutdown>:ENETUNREACH[TCP]
3018 * <pru_shutdown>:ENETDOWN[TCP]
3019 * <pru_shutdown>:ENOMEM[TCP]
3020 * <pru_shutdown>:EACCES[TCP]
3021 * <pru_shutdown>:EMSGSIZE[TCP]
3022 * <pru_shutdown>:ENOBUFS[TCP]
3023 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
3024 * <pru_shutdown>:??? [other protocol families]
3025 */
3026int
3027soshutdown(struct socket *so, int how)
3028{
3029 int error;
55e303ae 3030
3031 switch (how) {
3032 case SHUT_RD:
3033 case SHUT_WR:
3034 case SHUT_RDWR:
3035 socket_lock(so, 1);
3036 if ((so->so_state &
3037 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
3038 error = ENOTCONN;
3039 } else {
3040 error = soshutdownlock(so, how);
3041 }
3042 socket_unlock(so, 1);
3043 break;
3044 default:
3045 error = EINVAL;
3046 break;
55e303ae 3047 }
3048
3049 return (error);
3050}
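/*
 * Editor's sketch (not part of xnu): soshutdown() backs shutdown(2).
 * SHUT_WR half-closes the connection (a FIN on TCP) while the read
 * side stays usable, and ENOTCONN comes back for an unconnected
 * socket, exactly as in the switch above.  `fd` is assumed to be a
 * connected TCP socket.
 */
#include <stdio.h>
#include <sys/socket.h>

static void
half_close(int fd)
{
	if (shutdown(fd, SHUT_WR) == -1)	/* write side only */
		perror("shutdown");
	/* recv() remains valid until the peer closes its end */
}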
3051
1c79356b 3052int
2d21ac55 3053soshutdownlock(struct socket *so, int how)
1c79356b 3054{
3055 struct protosw *pr = so->so_proto;
3056 int error = 0;
1c79356b 3057
91447636 3058 sflt_notify(so, sock_evt_shutdown, &how);
1c79356b 3059
9bccf70c 3060 if (how != SHUT_WR) {
3061 if ((so->so_state & SS_CANTRCVMORE) != 0) {
3062 /* read already shut down */
3063 error = ENOTCONN;
3064 goto done;
3065 }
3066 sorflush(so);
3067 postevent(so, 0, EV_RCLOSED);
3068 }
9bccf70c 3069 if (how != SHUT_RD) {
3070 if ((so->so_state & SS_CANTSENDMORE) != 0) {
3071 /* write already shut down */
3072 error = ENOTCONN;
3073 goto done;
3074 }
3075 error = (*pr->pr_usrreqs->pru_shutdown)(so);
3076 postevent(so, 0, EV_WCLOSED);
1c79356b 3077 }
3078done:
3079 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
3080 return (error);
3081}
3082
3083void
3084sowflush(struct socket *so)
3085{
3086 struct sockbuf *sb = &so->so_snd;
3087#ifdef notyet
3088 lck_mtx_t *mutex_held;
3089 /*
3090 * XXX: This code is currently commented out, because we may get here
3091 * as part of sofreelastref(), and at that time, pr_getlock() may no
3092 * longer be able to return us the lock; this will be fixed in future.
3093 */
3094 if (so->so_proto->pr_getlock != NULL)
3095 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3096 else
3097 mutex_held = so->so_proto->pr_domain->dom_mtx;
3098
3099 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3100#endif /* notyet */
3101
3102 /*
3103 * Obtain lock on the socket buffer (SB_LOCK). This is required
3104 * to prevent the socket buffer from being unexpectedly altered
3105 * while it is used by another thread in socket send/receive.
3106 *
3107 * sblock() must not fail here, hence the assertion.
3108 */
3109 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
3110 VERIFY(sb->sb_flags & SB_LOCK);
3111
3112 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
3113 sb->sb_flags |= SB_DROP;
3114 sb->sb_upcall = NULL;
3115 sb->sb_upcallarg = NULL;
3116
3117 sbunlock(sb, TRUE); /* keep socket locked */
3118
3119 selthreadclear(&sb->sb_sel);
3120 sbrelease(sb);
3121}
3122
1c79356b 3123void
2d21ac55 3124sorflush(struct socket *so)
1c79356b 3125{
3126 struct sockbuf *sb = &so->so_rcv;
3127 struct protosw *pr = so->so_proto;
1c79356b 3128 struct sockbuf asb;
39236c6e 3129#ifdef notyet
2d21ac55 3130 lck_mtx_t *mutex_held;
3131 /*
3132 * XXX: This code is currently commented out, because we may get here
3133 * as part of sofreelastref(), and at that time, pr_getlock() may no
3134 * longer be able to return us the lock; this will be fixed in future.
3135 */
2d21ac55 3136 if (so->so_proto->pr_getlock != NULL)
91447636 3137 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2d21ac55 3138 else
91447636 3139 mutex_held = so->so_proto->pr_domain->dom_mtx;
39236c6e 3140
91447636 3141 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
39236c6e 3142#endif /* notyet */
3143
3144 sflt_notify(so, sock_evt_flush_read, NULL);
1c79356b 3145
1c79356b 3146 socantrcvmore(so);
3147
3148 /*
3149 * Obtain lock on the socket buffer (SB_LOCK). This is required
3150 * to prevent the socket buffer from being unexpectedly altered
3151 * while it is used by another thread in socket send/receive.
3152 *
3153 * sblock() must not fail here, hence the assertion.
3154 */
3155 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
3156 VERIFY(sb->sb_flags & SB_LOCK);
3157
3158 /*
3159 * Copy only the relevant fields from "sb" to "asb" which we
3160 * need for sbrelease() to function. In particular, skip
3161 * sb_sel as it contains the wait queue linkage, which would
3162 * wreak havoc if we were to issue selthreadclear() on "asb".
3163 * Make sure to not carry over SB_LOCK in "asb", as we need
3164 * to acquire it later as part of sbrelease().
3165 */
3166 bzero(&asb, sizeof (asb));
3167 asb.sb_cc = sb->sb_cc;
3168 asb.sb_hiwat = sb->sb_hiwat;
3169 asb.sb_mbcnt = sb->sb_mbcnt;
3170 asb.sb_mbmax = sb->sb_mbmax;
3171 asb.sb_ctl = sb->sb_ctl;
3172 asb.sb_lowat = sb->sb_lowat;
3173 asb.sb_mb = sb->sb_mb;
3174 asb.sb_mbtail = sb->sb_mbtail;
3175 asb.sb_lastrecord = sb->sb_lastrecord;
3176 asb.sb_so = sb->sb_so;
3177 asb.sb_flags = sb->sb_flags;
3178 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
3179 asb.sb_flags |= SB_DROP;
3180
3181 /*
3182 * Ideally we'd bzero() these and preserve the ones we need;
3183 * but to do that we'd need to shuffle things around in the
3184 * sockbuf, and we can't do it now because there are KEXTS
3185 * that are directly referring to the socket structure.
3186 *
3187 * Setting SB_DROP acts as a barrier to prevent further appends.
3188 * Clearing SB_SEL is done for selthreadclear() below.
3189 */
3190 sb->sb_cc = 0;
3191 sb->sb_hiwat = 0;
3192 sb->sb_mbcnt = 0;
3193 sb->sb_mbmax = 0;
3194 sb->sb_ctl = 0;
3195 sb->sb_lowat = 0;
3196 sb->sb_mb = NULL;
3197 sb->sb_mbtail = NULL;
3198 sb->sb_lastrecord = NULL;
3199 sb->sb_timeo.tv_sec = 0;
3200 sb->sb_timeo.tv_usec = 0;
3201 sb->sb_upcall = NULL;
3202 sb->sb_upcallarg = NULL;
3203 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
3204 sb->sb_flags |= SB_DROP;
3205
3206 sbunlock(sb, TRUE); /* keep socket locked */
3207
3208 /*
3209 * Note that selthreadclear() is called on the original "sb" and
3210 * not the local "asb" because of the way wait queue linkage is
3211 * implemented. Given that selwakeup() may be triggered, SB_SEL
3212 * should no longer be set (cleared above.)
3213 */
0b4e3aa0 3214 selthreadclear(&sb->sb_sel);
3215
3216 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
1c79356b 3217 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
39236c6e 3218
3219 sbrelease(&asb);
3220}
3221
3222/*
3223 * Perhaps this routine, and sooptcopyout(), below, ought to come in
3224 * an additional variant to handle the case where the option value needs
3225 * to be some kind of integer, but not a specific size.
3226 * In addition to their use here, these functions are also called by the
3227 * protocol-level pr_ctloutput() routines.
3228 *
3229 * Returns: 0 Success
3230 * EINVAL
3231 * copyin:EFAULT
3232 */
3233int
2d21ac55 3234sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
3235{
3236 size_t valsize;
3237
3238 /*
3239 * If the user gives us more than we wanted, we ignore it,
3240 * but if we don't get the minimum length the caller
3241 * wants, we return EINVAL. On success, sopt->sopt_valsize
3242 * is set to however much we actually retrieved.
3243 */
3244 if ((valsize = sopt->sopt_valsize) < minlen)
2d21ac55 3245 return (EINVAL);
3246 if (valsize > len)
3247 sopt->sopt_valsize = valsize = len;
3248
b0d623f7 3249 if (sopt->sopt_p != kernproc)
3250 return (copyin(sopt->sopt_val, buf, valsize));
3251
91447636 3252 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
3253 return (0);
3254}
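/*
 * Editor's sketch (hypothetical handler, not in this file): the
 * typical way a protocol's pr_ctloutput routine consumes
 * sooptcopyin() above -- request exactly sizeof (int) so that any
 * extra user-supplied bytes are ignored, while anything shorter
 * earns EINVAL, mirroring the SOL_SOCKET cases later in this file.
 */
static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
#pragma unused(so)
	int optval, error;

	error = sooptcopyin(sopt, &optval, sizeof (optval),
	    sizeof (optval));
	if (error != 0)
		return (error);	/* EINVAL, or EFAULT from copyin */
	/* ... apply optval to the protocol control block ... */
	return (0);
}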
3255
3256/*
3257 * sooptcopyin_timeval
3258 * Copy in a timeval value into tv_p, and take into account whether the
3259 * calling process is 64-bit or 32-bit. Moved the sanity checking
3260 * code here so that we can verify the 64-bit tv_sec value before we lose
3261 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
3262 */
3263static int
39236c6e 3264sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
3265{
3266 int error;
b0d623f7 3267
2d21ac55 3268 if (proc_is64bit(sopt->sopt_p)) {
b0d623f7 3269 struct user64_timeval tv64;
2d21ac55 3270
39236c6e 3271 if (sopt->sopt_valsize < sizeof (tv64))
2d21ac55 3272 return (EINVAL);
3273
3274 sopt->sopt_valsize = sizeof (tv64);
b0d623f7 3275 if (sopt->sopt_p != kernproc) {
39236c6e 3276 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
3277 if (error != 0)
3278 return (error);
3279 } else {
3280 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
39236c6e 3281 sizeof (tv64));
2d21ac55 3282 }
3283 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
3284 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
2d21ac55 3285 return (EDOM);
39236c6e 3286
3287 tv_p->tv_sec = tv64.tv_sec;
3288 tv_p->tv_usec = tv64.tv_usec;
3289 } else {
3290 struct user32_timeval tv32;
3291
39236c6e 3292 if (sopt->sopt_valsize < sizeof (tv32))
2d21ac55 3293 return (EINVAL);
3294
3295 sopt->sopt_valsize = sizeof (tv32);
b0d623f7 3296 if (sopt->sopt_p != kernproc) {
39236c6e 3297 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
3298 if (error != 0) {
3299 return (error);
3300 }
3301 } else {
b0d623f7 3302 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
39236c6e 3303 sizeof (tv32));
2d21ac55 3304 }
3305#ifndef __LP64__
3306 /*
3307 * K64todo "comparison is always false due to
3308 * limited range of data type"
3309 */
3310 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
3311 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
2d21ac55 3312 return (EDOM);
3313#endif
3314 tv_p->tv_sec = tv32.tv_sec;
3315 tv_p->tv_usec = tv32.tv_usec;
3316 }
3317 return (0);
3318}
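/*
 * Editor's sketch (not part of xnu): what sooptcopyin_timeval()
 * validates.  Userspace hands SO_RCVTIMEO a struct timeval sized for
 * its own ABI; a negative tv_sec or a tv_usec outside [0, 1000000)
 * is rejected with EDOM by the checks above.  `fd` is assumed to be
 * an open socket.
 */
#include <sys/socket.h>
#include <sys/time.h>

static int
set_recv_timeout(int fd, long sec)
{
	struct timeval tv = { .tv_sec = sec, .tv_usec = 0 };

	/* the kernel copies this in via sooptcopyin_timeval() */
	return (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO,
	    &tv, sizeof (tv)));
}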
3319
3320/*
3321 * Returns: 0 Success
3322 * EINVAL
3323 * ENOPROTOOPT
3324 * ENOBUFS
3325 * EDOM
3326 * sooptcopyin:EINVAL
3327 * sooptcopyin:EFAULT
3328 * sooptcopyin_timeval:EINVAL
3329 * sooptcopyin_timeval:EFAULT
3330 * sooptcopyin_timeval:EDOM
3331 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3332 * <pr_ctloutput>:???
3333 * sflt_attach_private:??? [whatever a filter author chooses]
3334 * <sf_setoption>:??? [whatever a filter author chooses]
3335 *
3336 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
3337 * <sf_setoption> returns depend on what the filter author causes
3338 * their filter to return.
3339 */
1c79356b 3340int
39236c6e 3341sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
3342{
3343 int error, optval;
3344 struct linger l;
3345 struct timeval tv;
3346#if CONFIG_MACF_SOCKET
3347 struct mac extmac;
3348#endif /* MAC_SOCKET */
91447636 3349
3350 if (sopt->sopt_dir != SOPT_SET)
3351 sopt->sopt_dir = SOPT_SET;
3352
3353 if (dolock)
3354 socket_lock(so, 1);
3355
3356 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
3357 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
b0d623f7 3358 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
3359 /* the socket has been shutdown, no more sockopt's */
3360 error = EINVAL;
39236c6e 3361 goto out;
3362 }
3363
6d2010ae 3364 error = sflt_setsockopt(so, sopt);
39236c6e 3365 if (error != 0) {
3366 if (error == EJUSTRETURN)
3367 error = 0;
39236c6e 3368 goto out;
3369 }
3370
1c79356b 3371 if (sopt->sopt_level != SOL_SOCKET) {
3372 if (so->so_proto != NULL &&
3373 so->so_proto->pr_ctloutput != NULL) {
2d21ac55 3374 error = (*so->so_proto->pr_ctloutput)(so, sopt);
39236c6e 3375 goto out;
91447636 3376 }
3377 error = ENOPROTOOPT;
3378 } else {
3379 /*
3380 * Allow socket-level (SOL_SOCKET) options to be filtered by
3381 * the protocol layer, if needed. A zero value returned from
3382 * the handler means use default socket-level processing as
3383 * done by the rest of this routine. Otherwise, any other
3384 * return value indicates that the option is unsupported.
3385 */
3386 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
3387 pru_socheckopt(so, sopt)) != 0)
3388 goto out;
3389
3390 error = 0;
3391 switch (sopt->sopt_name) {
3392 case SO_LINGER:
91447636 3393 case SO_LINGER_SEC:
2d21ac55 3394 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
3395 if (error != 0)
3396 goto out;
1c79356b 3397
3398 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
3399 l.l_linger : l.l_linger * hz;
39236c6e 3400 if (l.l_onoff != 0)
3401 so->so_options |= SO_LINGER;
3402 else
3403 so->so_options &= ~SO_LINGER;
3404 break;
3405
3406 case SO_DEBUG:
3407 case SO_KEEPALIVE:
3408 case SO_DONTROUTE:
3409 case SO_USELOOPBACK:
3410 case SO_BROADCAST:
3411 case SO_REUSEADDR:
3412 case SO_REUSEPORT:
3413 case SO_OOBINLINE:
3414 case SO_TIMESTAMP:
6d2010ae 3415 case SO_TIMESTAMP_MONOTONIC:
3416 case SO_DONTTRUNC:
3417 case SO_WANTMORE:
9bccf70c 3418 case SO_WANTOOBFLAG:
3419 error = sooptcopyin(sopt, &optval, sizeof (optval),
3420 sizeof (optval));
3421 if (error != 0)
3422 goto out;
3423 if (optval)
3424 so->so_options |= sopt->sopt_name;
3425 else
3426 so->so_options &= ~sopt->sopt_name;
3427 break;
3428
3429 case SO_SNDBUF:
3430 case SO_RCVBUF:
3431 case SO_SNDLOWAT:
3432 case SO_RCVLOWAT:
3433 error = sooptcopyin(sopt, &optval, sizeof (optval),
3434 sizeof (optval));
3435 if (error != 0)
3436 goto out;
3437
3438 /*
3439 * Values < 1 make no sense for any of these
3440 * options, so disallow them.
3441 */
3442 if (optval < 1) {
3443 error = EINVAL;
39236c6e 3444 goto out;
3445 }
3446
3447 switch (sopt->sopt_name) {
3448 case SO_SNDBUF:
3449 case SO_RCVBUF: {
3450 struct sockbuf *sb =
3451 (sopt->sopt_name == SO_SNDBUF) ?
3452 &so->so_snd : &so->so_rcv;
3453 if (sbreserve(sb, (u_int32_t)optval) == 0) {
1c79356b 3454 error = ENOBUFS;
39236c6e 3455 goto out;
1c79356b 3456 }
316670eb
A
3457 sb->sb_flags |= SB_USRSIZE;
3458 sb->sb_flags &= ~SB_AUTOSIZE;
3459 sb->sb_idealsize = (u_int32_t)optval;
1c79356b 3460 break;
316670eb 3461 }
1c79356b
A
3462 /*
3463 * Make sure the low-water is never greater than
3464 * the high-water.
3465 */
3466 case SO_SNDLOWAT:
3467 so->so_snd.sb_lowat =
3468 (optval > so->so_snd.sb_hiwat) ?
3469 so->so_snd.sb_hiwat : optval;
3470 break;
3471 case SO_RCVLOWAT:
3472 so->so_rcv.sb_lowat =
3473 (optval > so->so_rcv.sb_hiwat) ?
3474 so->so_rcv.sb_hiwat : optval;
3475 break;
3476 }
3477 break;
3478
3479 case SO_SNDTIMEO:
3480 case SO_RCVTIMEO:
2d21ac55 3481 error = sooptcopyin_timeval(sopt, &tv);
39236c6e
A
3482 if (error != 0)
3483 goto out;
1c79356b 3484
1c79356b
A
3485 switch (sopt->sopt_name) {
3486 case SO_SNDTIMEO:
91447636 3487 so->so_snd.sb_timeo = tv;
1c79356b
A
3488 break;
3489 case SO_RCVTIMEO:
91447636 3490 so->so_rcv.sb_timeo = tv;
1c79356b
A
3491 break;
3492 }
3493 break;
3494
39236c6e 3495 case SO_NKE: {
9bccf70c 3496 struct so_nke nke;
1c79356b 3497
2d21ac55
A
3498 error = sooptcopyin(sopt, &nke, sizeof (nke),
3499 sizeof (nke));
39236c6e
A
3500 if (error != 0)
3501 goto out;
1c79356b 3502
6d2010ae 3503 error = sflt_attach_internal(so, nke.nke_handle);
1c79356b
A
3504 break;
3505 }
3506
9bccf70c 3507 case SO_NOSIGPIPE:
2d21ac55
A
3508 error = sooptcopyin(sopt, &optval, sizeof (optval),
3509 sizeof (optval));
39236c6e
A
3510 if (error != 0)
3511 goto out;
3512 if (optval != 0)
2d21ac55
A
3513 so->so_flags |= SOF_NOSIGPIPE;
3514 else
3515 so->so_flags &= ~SOF_NOSIGPIPE;
9bccf70c
A
3516 break;
3517
55e303ae 3518 case SO_NOADDRERR:
2d21ac55
A
3519 error = sooptcopyin(sopt, &optval, sizeof (optval),
3520 sizeof (optval));
39236c6e
A
3521 if (error != 0)
3522 goto out;
3523 if (optval != 0)
2d21ac55
A
3524 so->so_flags |= SOF_NOADDRAVAIL;
3525 else
3526 so->so_flags &= ~SOF_NOADDRAVAIL;
2d21ac55
A
3527 break;
3528
3529 case SO_REUSESHAREUID:
3530 error = sooptcopyin(sopt, &optval, sizeof (optval),
3531 sizeof (optval));
39236c6e
A
3532 if (error != 0)
3533 goto out;
3534 if (optval != 0)
2d21ac55
A
3535 so->so_flags |= SOF_REUSESHAREUID;
3536 else
3537 so->so_flags &= ~SOF_REUSESHAREUID;
3538 break;
39236c6e 3539
2d21ac55
A
3540 case SO_NOTIFYCONFLICT:
3541 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3542 error = EPERM;
39236c6e 3543 goto out;
2d21ac55
A
3544 }
3545 error = sooptcopyin(sopt, &optval, sizeof (optval),
3546 sizeof (optval));
39236c6e
A
3547 if (error != 0)
3548 goto out;
3549 if (optval != 0)
2d21ac55
A
3550 so->so_flags |= SOF_NOTIFYCONFLICT;
3551 else
3552 so->so_flags &= ~SOF_NOTIFYCONFLICT;
3553 break;
39236c6e 3554
2d21ac55 3555 case SO_RESTRICTIONS:
2d21ac55
A
3556 error = sooptcopyin(sopt, &optval, sizeof (optval),
3557 sizeof (optval));
39236c6e
A
3558 if (error != 0)
3559 goto out;
3560
3561 error = so_set_restrictions(so, optval);
2d21ac55
A
3562 break;
3563
3564 case SO_LABEL:
3565#if CONFIG_MACF_SOCKET
3566 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3567 sizeof (extmac))) != 0)
39236c6e 3568 goto out;
2d21ac55
A
3569
3570 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
3571 so, &extmac);
3572#else
3573 error = EOPNOTSUPP;
3574#endif /* MAC_SOCKET */
55e303ae
A
3575 break;
3576
4a3eedf9
A
3577 case SO_UPCALLCLOSEWAIT:
3578 error = sooptcopyin(sopt, &optval, sizeof (optval),
3579 sizeof (optval));
39236c6e
A
3580 if (error != 0)
3581 goto out;
3582 if (optval != 0)
4a3eedf9
A
3583 so->so_flags |= SOF_UPCALLCLOSEWAIT;
3584 else
3585 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
3586 break;
4a3eedf9 3587
b0d623f7
A
3588 case SO_RANDOMPORT:
3589 error = sooptcopyin(sopt, &optval, sizeof (optval),
3590 sizeof (optval));
39236c6e
A
3591 if (error != 0)
3592 goto out;
3593 if (optval != 0)
b0d623f7
A
3594 so->so_flags |= SOF_BINDRANDOMPORT;
3595 else
3596 so->so_flags &= ~SOF_BINDRANDOMPORT;
3597 break;
3598
3599 case SO_NP_EXTENSIONS: {
3600 struct so_np_extensions sonpx;
3601
39236c6e
A
3602 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
3603 sizeof (sonpx));
3604 if (error != 0)
3605 goto out;
b0d623f7
A
3606 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
3607 error = EINVAL;
39236c6e 3608 goto out;
b0d623f7
A
3609 }
3610 /*
3611 * Only one bit defined for now
3612 */
3613 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
3614 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
3615 so->so_flags |= SOF_NPX_SETOPTSHUT;
3616 else
3617 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
3618 }
3619 break;
3620 }
3621
d41d1dae
A
3622 case SO_TRAFFIC_CLASS: {
3623 error = sooptcopyin(sopt, &optval, sizeof (optval),
39236c6e
A
3624 sizeof (optval));
3625 if (error != 0)
3626 goto out;
6d2010ae 3627 error = so_set_traffic_class(so, optval);
39236c6e
A
3628 if (error != 0)
3629 goto out;
6d2010ae 3630 break;
d41d1dae 3631 }
6d2010ae
A
3632
3633 case SO_RECV_TRAFFIC_CLASS: {
3634 error = sooptcopyin(sopt, &optval, sizeof (optval),
39236c6e
A
3635 sizeof (optval));
3636 if (error != 0)
3637 goto out;
6d2010ae
A
3638 if (optval == 0)
3639 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
3640 else
3641 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
3642 break;
3643 }
316670eb 3644
6d2010ae
A
3645 case SO_TRAFFIC_CLASS_DBG: {
3646 struct so_tcdbg so_tcdbg;
316670eb
A
3647
3648 error = sooptcopyin(sopt, &so_tcdbg,
3649 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
39236c6e
A
3650 if (error != 0)
3651 goto out;
6d2010ae 3652 error = so_set_tcdbg(so, &so_tcdbg);
39236c6e
A
3653 if (error != 0)
3654 goto out;
6d2010ae
A
3655 break;
3656 }
316670eb
A
3657
3658 case SO_PRIVILEGED_TRAFFIC_CLASS:
3659 error = priv_check_cred(kauth_cred_get(),
3660 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
39236c6e
A
3661 if (error != 0)
3662 goto out;
316670eb 3663 error = sooptcopyin(sopt, &optval, sizeof (optval),
39236c6e
A
3664 sizeof (optval));
3665 if (error != 0)
3666 goto out;
316670eb
A
3667 if (optval == 0)
3668 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
3669 else
3670 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
3671 break;
3672
6d2010ae
A
3673 case SO_DEFUNCTOK:
3674 error = sooptcopyin(sopt, &optval, sizeof (optval),
3675 sizeof (optval));
3676 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
3677 if (error == 0)
3678 error = EBADF;
39236c6e 3679 goto out;
6d2010ae
A
3680 }
3681 /*
3682 * Any process can set SO_DEFUNCTOK (clear
3683 * SOF_NODEFUNCT), but only root can clear
3684 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
3685 */
3686 if (optval == 0 &&
3687 kauth_cred_issuser(kauth_cred_get()) == 0) {
3688 error = EPERM;
39236c6e 3689 goto out;
6d2010ae
A
3690 }
3691 if (optval)
3692 so->so_flags &= ~SOF_NODEFUNCT;
3693 else
3694 so->so_flags |= SOF_NODEFUNCT;
3695
39236c6e
A
3696 if (SOCK_DOM(so) == PF_INET ||
3697 SOCK_DOM(so) == PF_INET6) {
3698 char s[MAX_IPv6_STR_LEN];
3699 char d[MAX_IPv6_STR_LEN];
3700 struct inpcb *inp = sotoinpcb(so);
3701
3702 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
3703 "%s:%d] is now marked as %seligible for "
3704 "defunct\n", __func__, proc_selfpid(),
3705 (uint64_t)VM_KERNEL_ADDRPERM(so),
3706 (SOCK_TYPE(so) == SOCK_STREAM) ?
3707 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
3708 ((SOCK_DOM(so) == PF_INET) ?
3709 (void *)&inp->inp_laddr.s_addr :
3710 (void *)&inp->in6p_laddr), s, sizeof (s)),
3711 ntohs(inp->in6p_lport),
3712 inet_ntop(SOCK_DOM(so),
3713 (SOCK_DOM(so) == PF_INET) ?
3714 (void *)&inp->inp_faddr.s_addr :
3715 (void *)&inp->in6p_faddr, d, sizeof (d)),
3716 ntohs(inp->in6p_fport),
3717 (so->so_flags & SOF_NODEFUNCT) ?
3718 "not " : ""));
3719 } else {
3720 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
3721 "now marked as %seligible for defunct\n",
3722 __func__, proc_selfpid(),
3723 (uint64_t)VM_KERNEL_ADDRPERM(so),
3724 SOCK_DOM(so), SOCK_TYPE(so),
3725 (so->so_flags & SOF_NODEFUNCT) ?
3726 "not " : ""));
3727 }
6d2010ae
A
3728 break;
3729
3730 case SO_ISDEFUNCT:
3731 /* This option is not settable */
3732 error = EINVAL;
3733 break;
d41d1dae 3734
316670eb
A
3735 case SO_OPPORTUNISTIC:
3736 error = sooptcopyin(sopt, &optval, sizeof (optval),
3737 sizeof (optval));
3738 if (error == 0)
3739 error = so_set_opportunistic(so, optval);
3740 break;
3741
3742 case SO_FLUSH:
3743 /* This option is handled by lower layer(s) */
3744 error = 0;
3745 break;
3746
3747 case SO_RECV_ANYIF:
3748 error = sooptcopyin(sopt, &optval, sizeof (optval),
3749 sizeof (optval));
3750 if (error == 0)
3751 error = so_set_recv_anyif(so, optval);
3752 break;
3753
39236c6e
A
3754 case SO_TRAFFIC_MGT_BACKGROUND: {
3755 /* This option is handled by lower layer(s) */
3756 error = 0;
3757 break;
3758 }
3759
3760#if FLOW_DIVERT
3761 case SO_FLOW_DIVERT_TOKEN:
3762 error = flow_divert_token_set(so, sopt);
3763 break;
3764#endif /* FLOW_DIVERT */
3765
3766
3767 case SO_DELEGATED:
3768 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
3769 sizeof (optval))) != 0)
3770 break;
3771
3772 error = so_set_effective_pid(so, optval, sopt->sopt_p);
3773 break;
3774
3775 case SO_DELEGATED_UUID: {
3776 uuid_t euuid;
3777
3778 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
3779 sizeof (euuid))) != 0)
3780 break;
3781
3782 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
3783 break;
3784 }
3785
1c79356b
A
3786 default:
3787 error = ENOPROTOOPT;
3788 break;
3789 }
39236c6e
A
3790 if (error == 0 && so->so_proto != NULL &&
3791 so->so_proto->pr_ctloutput != NULL) {
3792 (void) so->so_proto->pr_ctloutput(so, sopt);
1c79356b
A
3793 }
3794 }
39236c6e
A
3795out:
3796 if (dolock)
3797 socket_unlock(so, 1);
1c79356b
A
3798 return (error);
3799}
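
/*
 * A minimal userland sketch (not part of this file; the helper name
 * set_basic_opts is illustrative): the SOL_SOCKET cases handled by
 * sosetoptlock() above are reached from user space via setsockopt(2).
 * SO_LINGER_SEC takes its interval in seconds, while plain SO_LINGER
 * is scaled by hz as in the SO_LINGER/SO_LINGER_SEC case above;
 * SO_NOSIGPIPE sets SOF_NOSIGPIPE so that writes on a broken
 * connection fail with EPIPE instead of raising SIGPIPE.
 */
#include <sys/socket.h>
#include <stdio.h>

static int
set_basic_opts(int s)
{
	struct linger l = { .l_onoff = 1, .l_linger = 5 };	/* 5 seconds */
	int on = 1;

	if (setsockopt(s, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof (l)) == -1) {
		perror("setsockopt(SO_LINGER_SEC)");
		return (-1);
	}
	if (setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof (on)) == -1) {
		perror("setsockopt(SO_NOSIGPIPE)");
		return (-1);
	}
	return (0);
}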
3800
2d21ac55 3801/* Helper routines for getsockopt */
1c79356b 3802int
2d21ac55 3803sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
1c79356b
A
3804{
3805 int error;
3806 size_t valsize;
3807
3808 error = 0;
3809
3810 /*
3811 * Documented get behavior is that we always return a value,
3812 * possibly truncated to fit in the user's buffer.
3813 * Traditional behavior is that we always tell the user
3814 * precisely how much we copied, rather than something useful
3815 * like the total amount we had available for her.
3816 * Note that this interface is not idempotent; the entire answer must
3817 * be generated ahead of time.
3818 */
3819 valsize = min(len, sopt->sopt_valsize);
3820 sopt->sopt_valsize = valsize;
91447636 3821 if (sopt->sopt_val != USER_ADDR_NULL) {
b0d623f7 3822 if (sopt->sopt_p != kernproc)
1c79356b
A
3823 error = copyout(buf, sopt->sopt_val, valsize);
3824 else
91447636 3825 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
1c79356b 3826 }
2d21ac55
A
3827 return (error);
3828}
3829
3830static int
39236c6e 3831sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
2d21ac55
A
3832{
3833 int error;
3834 size_t len;
b0d623f7
A
3835 struct user64_timeval tv64;
3836 struct user32_timeval tv32;
2d21ac55
A
3837 const void * val;
3838 size_t valsize;
b0d623f7 3839
2d21ac55
A
3840 error = 0;
3841 if (proc_is64bit(sopt->sopt_p)) {
39236c6e 3842 len = sizeof (tv64);
2d21ac55
A
3843 tv64.tv_sec = tv_p->tv_sec;
3844 tv64.tv_usec = tv_p->tv_usec;
3845 val = &tv64;
3846 } else {
39236c6e 3847 len = sizeof (tv32);
b0d623f7
A
3848 tv32.tv_sec = tv_p->tv_sec;
3849 tv32.tv_usec = tv_p->tv_usec;
3850 val = &tv32;
2d21ac55
A
3851 }
3852 valsize = min(len, sopt->sopt_valsize);
3853 sopt->sopt_valsize = valsize;
3854 if (sopt->sopt_val != USER_ADDR_NULL) {
b0d623f7 3855 if (sopt->sopt_p != kernproc)
2d21ac55
A
3856 error = copyout(val, sopt->sopt_val, valsize);
3857 else
3858 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3859 }
3860 return (error);
1c79356b
A
3861}
3862
2d21ac55
A
3863/*
3864 * Return: 0 Success
3865 * ENOPROTOOPT
3866 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3867 * <pr_ctloutput>:???
3868 * <sf_getoption>:???
3869 */
1c79356b 3870int
39236c6e 3871sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
1c79356b
A
3872{
3873 int error, optval;
3874 struct linger l;
3875 struct timeval tv;
2d21ac55
A
3876#if CONFIG_MACF_SOCKET
3877 struct mac extmac;
3878#endif /* MAC_SOCKET */
1c79356b 3879
39236c6e 3880 if (sopt->sopt_dir != SOPT_GET)
2d21ac55 3881 sopt->sopt_dir = SOPT_GET;
9bccf70c 3882
39236c6e
A
3883 if (dolock)
3884 socket_lock(so, 1);
2d21ac55 3885
6d2010ae 3886 error = sflt_getsockopt(so, sopt);
39236c6e 3887 if (error != 0) {
6d2010ae
A
3888 if (error == EJUSTRETURN)
3889 error = 0;
39236c6e 3890 goto out;
1c79356b 3891 }
39236c6e 3892
1c79356b 3893 if (sopt->sopt_level != SOL_SOCKET) {
39236c6e
A
3894 if (so->so_proto != NULL &&
3895 so->so_proto->pr_ctloutput != NULL) {
2d21ac55 3896 error = (*so->so_proto->pr_ctloutput)(so, sopt);
39236c6e 3897 goto out;
91447636 3898 }
39236c6e 3899 error = ENOPROTOOPT;
1c79356b 3900 } else {
39236c6e
A
3901 /*
3902 * Allow socket-level (SOL_SOCKET) options to be filtered by
3903 * the protocol layer, if needed. A zero value returned from
3904 * the handler means use default socket-level processing as
3905 * done by the rest of this routine. Otherwise, any other
3906 * return value indicates that the option is unsupported.
3907 */
3908 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
3909 pru_socheckopt(so, sopt)) != 0)
3910 goto out;
3911
3912 error = 0;
1c79356b
A
3913 switch (sopt->sopt_name) {
3914 case SO_LINGER:
91447636 3915 case SO_LINGER_SEC:
39236c6e 3916 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
2d21ac55
A
3917 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
3918 so->so_linger : so->so_linger / hz;
3919 error = sooptcopyout(sopt, &l, sizeof (l));
1c79356b
A
3920 break;
3921
3922 case SO_USELOOPBACK:
3923 case SO_DONTROUTE:
3924 case SO_DEBUG:
3925 case SO_KEEPALIVE:
3926 case SO_REUSEADDR:
3927 case SO_REUSEPORT:
3928 case SO_BROADCAST:
3929 case SO_OOBINLINE:
3930 case SO_TIMESTAMP:
6d2010ae 3931 case SO_TIMESTAMP_MONOTONIC:
1c79356b
A
3932 case SO_DONTTRUNC:
3933 case SO_WANTMORE:
9bccf70c 3934 case SO_WANTOOBFLAG:
1c79356b
A
3935 optval = so->so_options & sopt->sopt_name;
3936integer:
2d21ac55 3937 error = sooptcopyout(sopt, &optval, sizeof (optval));
1c79356b
A
3938 break;
3939
3940 case SO_TYPE:
3941 optval = so->so_type;
3942 goto integer;
3943
3944 case SO_NREAD:
2d21ac55
A
3945 if (so->so_proto->pr_flags & PR_ATOMIC) {
3946 int pkt_total;
3947 struct mbuf *m1;
1c79356b 3948
2d21ac55
A
3949 pkt_total = 0;
3950 m1 = so->so_rcv.sb_mb;
39236c6e
A
3951 while (m1 != NULL) {
3952 if (m1->m_type == MT_DATA ||
3953 m1->m_type == MT_HEADER ||
3954 m1->m_type == MT_OOBDATA)
1c79356b 3955 pkt_total += m1->m_len;
1c79356b
A
3956 m1 = m1->m_next;
3957 }
3958 optval = pkt_total;
2d21ac55
A
3959 } else {
3960 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3961 }
1c79356b 3962 goto integer;
39236c6e 3963
91447636
A
3964 case SO_NWRITE:
3965 optval = so->so_snd.sb_cc;
2d21ac55 3966 goto integer;
39236c6e 3967
1c79356b
A
3968 case SO_ERROR:
3969 optval = so->so_error;
3970 so->so_error = 0;
3971 goto integer;
3972
3973 case SO_SNDBUF:
3974 optval = so->so_snd.sb_hiwat;
3975 goto integer;
3976
3977 case SO_RCVBUF:
3978 optval = so->so_rcv.sb_hiwat;
3979 goto integer;
3980
3981 case SO_SNDLOWAT:
3982 optval = so->so_snd.sb_lowat;
3983 goto integer;
3984
3985 case SO_RCVLOWAT:
3986 optval = so->so_rcv.sb_lowat;
3987 goto integer;
3988
3989 case SO_SNDTIMEO:
3990 case SO_RCVTIMEO:
91447636 3991 tv = (sopt->sopt_name == SO_SNDTIMEO ?
2d21ac55 3992 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
1c79356b 3993
2d21ac55
A
3994 error = sooptcopyout_timeval(sopt, &tv);
3995 break;
1c79356b 3996
91447636
A
3997 case SO_NOSIGPIPE:
3998 optval = (so->so_flags & SOF_NOSIGPIPE);
3999 goto integer;
9bccf70c 4000
55e303ae 4001 case SO_NOADDRERR:
91447636
A
4002 optval = (so->so_flags & SOF_NOADDRAVAIL);
4003 goto integer;
55e303ae 4004
2d21ac55
A
4005 case SO_REUSESHAREUID:
4006 optval = (so->so_flags & SOF_REUSESHAREUID);
4007 goto integer;
4008
39236c6e 4009
2d21ac55
A
4010 case SO_NOTIFYCONFLICT:
4011 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
4012 goto integer;
39236c6e 4013
2d21ac55 4014 case SO_RESTRICTIONS:
39236c6e 4015 optval = so_get_restrictions(so);
2d21ac55
A
4016 goto integer;
4017
4018 case SO_LABEL:
4019#if CONFIG_MACF_SOCKET
4020 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4021 sizeof (extmac))) != 0 ||
4022 (error = mac_socket_label_get(proc_ucred(
4023 sopt->sopt_p), so, &extmac)) != 0)
4024 break;
4025
4026 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
4027#else
4028 error = EOPNOTSUPP;
4029#endif /* MAC_SOCKET */
4030 break;
4031
4032 case SO_PEERLABEL:
4033#if CONFIG_MACF_SOCKET
4034 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4035 sizeof (extmac))) != 0 ||
4036 (error = mac_socketpeer_label_get(proc_ucred(
4037 sopt->sopt_p), so, &extmac)) != 0)
4038 break;
4039
4040 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
4041#else
4042 error = EOPNOTSUPP;
4043#endif /* MAC_SOCKET */
4044 break;
4045
4a3eedf9
A
4046#ifdef __APPLE_API_PRIVATE
4047 case SO_UPCALLCLOSEWAIT:
4048 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
4049 goto integer;
4050#endif
b0d623f7
A
4051 case SO_RANDOMPORT:
4052 optval = (so->so_flags & SOF_BINDRANDOMPORT);
4053 goto integer;
4054
4055 case SO_NP_EXTENSIONS: {
4056 struct so_np_extensions sonpx;
4057
39236c6e
A
4058 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
4059 SONPX_SETOPTSHUT : 0;
b0d623f7 4060 sonpx.npx_mask = SONPX_MASK_VALID;
4a3eedf9 4061
39236c6e
A
4062 error = sooptcopyout(sopt, &sonpx,
4063 sizeof (struct so_np_extensions));
4064 break;
b0d623f7 4065 }
6d2010ae 4066
d41d1dae
A
4067 case SO_TRAFFIC_CLASS:
4068 optval = so->so_traffic_class;
4069 goto integer;
316670eb 4070
6d2010ae
A
4071 case SO_RECV_TRAFFIC_CLASS:
4072 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
4073 goto integer;
4074
4075 case SO_TRAFFIC_CLASS_STATS:
39236c6e
A
4076 error = sooptcopyout(sopt, &so->so_tc_stats,
4077 sizeof (so->so_tc_stats));
316670eb 4078 break;
6d2010ae 4079
39236c6e 4080 case SO_TRAFFIC_CLASS_DBG:
6d2010ae
A
4081 error = sogetopt_tcdbg(so, sopt);
4082 break;
316670eb
A
4083
4084 case SO_PRIVILEGED_TRAFFIC_CLASS:
4085 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
4086 goto integer;
4087
6d2010ae
A
4088 case SO_DEFUNCTOK:
4089 optval = !(so->so_flags & SOF_NODEFUNCT);
4090 goto integer;
4091
4092 case SO_ISDEFUNCT:
4093 optval = (so->so_flags & SOF_DEFUNCT);
4094 goto integer;
d41d1dae 4095
316670eb
A
4096 case SO_OPPORTUNISTIC:
4097 optval = so_get_opportunistic(so);
4098 goto integer;
4099
4100 case SO_FLUSH:
4101 /* This option is not gettable */
4102 error = EINVAL;
4103 break;
4104
4105 case SO_RECV_ANYIF:
4106 optval = so_get_recv_anyif(so);
4107 goto integer;
4108
39236c6e
A
4109 case SO_TRAFFIC_MGT_BACKGROUND:
4110 /* This option is handled by lower layer(s) */
4111 if (so->so_proto != NULL &&
4112 so->so_proto->pr_ctloutput != NULL) {
4113 (void) so->so_proto->pr_ctloutput(so, sopt);
4114 }
4115 break;
4116
4117#if FLOW_DIVERT
4118 case SO_FLOW_DIVERT_TOKEN:
4119 error = flow_divert_token_get(so, sopt);
4120 break;
4121#endif /* FLOW_DIVERT */
4122
1c79356b
A
4123 default:
4124 error = ENOPROTOOPT;
4125 break;
4126 }
1c79356b 4127 }
39236c6e
A
4128out:
4129 if (dolock)
4130 socket_unlock(so, 1);
4131 return (error);
1c79356b 4132}
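
/*
 * A minimal userland sketch (not part of this file; bytes_readable is
 * an illustrative name): querying the SO_NREAD value computed by
 * sogetoptlock() above. For PR_ATOMIC protocols such as UDP it totals
 * the first record in the receive buffer; for stream sockets it
 * reports sb_cc - sb_ctl.
 */
#include <sys/socket.h>

static int
bytes_readable(int s)
{
	int nread = 0;
	socklen_t len = sizeof (nread);

	if (getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len) == -1)
		return (-1);
	return (nread);
}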
39236c6e
A
4133
4134/*
4135 * The size limits on our soopt_getm are different from those on FreeBSD.
6d2010ae
A
4136 * We limit the size of options to MCLBYTES. This will have to change
4137 * if we need to define options that need more space than MCLBYTES.
4138 */
1c79356b 4139int
9bccf70c 4140soopt_getm(struct sockopt *sopt, struct mbuf **mp)
1c79356b
A
4141{
4142 struct mbuf *m, *m_prev;
4143 int sopt_size = sopt->sopt_valsize;
b0d623f7 4144 int how;
1c79356b 4145
6d2010ae 4146 if (sopt_size <= 0 || sopt_size > MCLBYTES)
2d21ac55 4147 return (EMSGSIZE);
a3d08fcd 4148
b0d623f7
A
4149 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
4150 MGET(m, how, MT_DATA);
39236c6e 4151 if (m == NULL)
2d21ac55 4152 return (ENOBUFS);
1c79356b 4153 if (sopt_size > MLEN) {
b0d623f7 4154 MCLGET(m, how);
1c79356b
A
4155 if ((m->m_flags & M_EXT) == 0) {
4156 m_free(m);
2d21ac55 4157 return (ENOBUFS);
1c79356b
A
4158 }
4159 m->m_len = min(MCLBYTES, sopt_size);
4160 } else {
4161 m->m_len = min(MLEN, sopt_size);
4162 }
4163 sopt_size -= m->m_len;
4164 *mp = m;
4165 m_prev = m;
4166
6d2010ae 4167 while (sopt_size > 0) {
b0d623f7 4168 MGET(m, how, MT_DATA);
39236c6e 4169 if (m == NULL) {
1c79356b 4170 m_freem(*mp);
2d21ac55 4171 return (ENOBUFS);
1c79356b
A
4172 }
4173 if (sopt_size > MLEN) {
b0d623f7 4174 MCLGET(m, how);
1c79356b
A
4175 if ((m->m_flags & M_EXT) == 0) {
4176 m_freem(*mp);
6d2010ae 4177 m_freem(m);
2d21ac55 4178 return (ENOBUFS);
1c79356b
A
4179 }
4180 m->m_len = min(MCLBYTES, sopt_size);
4181 } else {
4182 m->m_len = min(MLEN, sopt_size);
4183 }
4184 sopt_size -= m->m_len;
4185 m_prev->m_next = m;
4186 m_prev = m;
4187 }
2d21ac55 4188 return (0);
1c79356b
A
4189}
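
/*
 * Worked example of the sizing above, assuming the usual MLEN and
 * MCLBYTES (2048) values: for sopt_valsize == 3000, the first request
 * exceeds MLEN so the mbuf gets a cluster with m_len =
 * min(MCLBYTES, 3000) = 2048, leaving 952 bytes; the loop then
 * allocates one more cluster mbuf with m_len = 952, so the whole
 * option fits in a two-mbuf chain.
 */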
4190
6d2010ae 4191/* copyin sopt data into mbuf chain */
1c79356b 4192int
9bccf70c 4193soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
1c79356b
A
4194{
4195 struct mbuf *m0 = m;
4196
91447636 4197 if (sopt->sopt_val == USER_ADDR_NULL)
2d21ac55 4198 return (0);
1c79356b 4199 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
b0d623f7 4200 if (sopt->sopt_p != kernproc) {
1c79356b
A
4201 int error;
4202
2d21ac55
A
4203 error = copyin(sopt->sopt_val, mtod(m, char *),
4204 m->m_len);
1c79356b
A
4205 if (error != 0) {
4206 m_freem(m0);
2d21ac55 4207 return (error);
1c79356b 4208 }
2d21ac55
A
4209 } else {
4210 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
4211 mtod(m, char *), m->m_len);
4212 }
1c79356b 4213 sopt->sopt_valsize -= m->m_len;
2d21ac55 4214 sopt->sopt_val += m->m_len;
1c79356b
A
4215 m = m->m_next;
4216 }
39236c6e
A
4217 /* enough space should have been allocated at ip6_sooptmcopyin() */
4218 if (m != NULL) {
9bccf70c 4219 panic("soopt_mcopyin");
39236c6e
A
4220 /* NOTREACHED */
4221 }
2d21ac55 4222 return (0);
1c79356b
A
4223}
4224
6d2010ae 4225/* copyout mbuf chain data into soopt */
1c79356b 4226int
9bccf70c 4227soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
1c79356b
A
4228{
4229 struct mbuf *m0 = m;
4230 size_t valsize = 0;
4231
91447636 4232 if (sopt->sopt_val == USER_ADDR_NULL)
2d21ac55 4233 return (0);
1c79356b 4234 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
b0d623f7 4235 if (sopt->sopt_p != kernproc) {
1c79356b
A
4236 int error;
4237
2d21ac55
A
4238 error = copyout(mtod(m, char *), sopt->sopt_val,
4239 m->m_len);
1c79356b
A
4240 if (error != 0) {
4241 m_freem(m0);
2d21ac55 4242 return (error);
1c79356b 4243 }
2d21ac55
A
4244 } else {
4245 bcopy(mtod(m, char *),
4246 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
4247 }
4248 sopt->sopt_valsize -= m->m_len;
4249 sopt->sopt_val += m->m_len;
4250 valsize += m->m_len;
4251 m = m->m_next;
1c79356b
A
4252 }
4253 if (m != NULL) {
4254 /* a large enough soopt buffer should be given from user-land */
4255 m_freem(m0);
2d21ac55 4256 return (EINVAL);
1c79356b
A
4257 }
4258 sopt->sopt_valsize = valsize;
2d21ac55 4259 return (0);
1c79356b
A
4260}
4261
9bccf70c 4262void
2d21ac55 4263sohasoutofband(struct socket *so)
9bccf70c 4264{
9bccf70c
A
4265 if (so->so_pgid < 0)
4266 gsignal(-so->so_pgid, SIGURG);
2d21ac55
A
4267 else if (so->so_pgid > 0)
4268 proc_signal(so->so_pgid, SIGURG);
9bccf70c
A
4269 selwakeup(&so->so_rcv.sb_sel);
4270}
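
/*
 * A minimal userland sketch (not part of this file; helper names are
 * illustrative): to receive the SIGURG that sohasoutofband() delivers,
 * a process must first claim ownership of the descriptor with
 * F_SETOWN, which establishes so_pgid above.
 */
#include <fcntl.h>
#include <signal.h>
#include <sys/socket.h>
#include <unistd.h>

static volatile sig_atomic_t got_urg;

static void
on_sigurg(int sig)
{
	(void)sig;
	got_urg = 1;	/* OOB data pending; fetch it with recv(MSG_OOB) */
}

static int
watch_oob(int s)
{
	signal(SIGURG, on_sigurg);
	return (fcntl(s, F_SETOWN, getpid()));	/* returns -1 on failure */
}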
4271
4272int
39236c6e 4273sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
9bccf70c 4274{
39236c6e 4275#pragma unused(cred)
9bccf70c
A
4276 struct proc *p = current_proc();
4277 int revents = 0;
91447636
A
4278
4279 socket_lock(so, 1);
39236c6e
A
4280 so_update_last_owner_locked(so, PROC_NULL);
4281 so_update_policy(so);
9bccf70c
A
4282
4283 if (events & (POLLIN | POLLRDNORM))
4284 if (soreadable(so))
4285 revents |= events & (POLLIN | POLLRDNORM);
4286
4287 if (events & (POLLOUT | POLLWRNORM))
4288 if (sowriteable(so))
4289 revents |= events & (POLLOUT | POLLWRNORM);
4290
4291 if (events & (POLLPRI | POLLRDBAND))
4292 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
4293 revents |= events & (POLLPRI | POLLRDBAND);
4294
4295 if (revents == 0) {
4296 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
2d21ac55
A
4297 /*
4298 * Darwin sets the flag first,
4299 * BSD calls selrecord first
4300 */
9bccf70c
A
4301 so->so_rcv.sb_flags |= SB_SEL;
4302 selrecord(p, &so->so_rcv.sb_sel, wql);
4303 }
4304
4305 if (events & (POLLOUT | POLLWRNORM)) {
2d21ac55
A
4306 /*
4307 * Darwin sets the flag first,
4308 * BSD calls selrecord first
4309 */
9bccf70c
A
4310 so->so_snd.sb_flags |= SB_SEL;
4311 selrecord(p, &so->so_snd.sb_sel, wql);
4312 }
4313 }
4314
91447636 4315 socket_unlock(so, 1);
9bccf70c
A
4316 return (revents);
4317}
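
/*
 * A minimal userland sketch (not part of this file): the POLLIN,
 * POLLOUT and POLLPRI tests in sopoll() above map directly onto
 * poll(2); POLLPRI reports out-of-band state (so_oobmark or
 * SS_RCVATMARK).
 */
#include <poll.h>

static int
wait_readable(int s, int timeout_ms)
{
	struct pollfd pfd = { .fd = s, .events = POLLIN | POLLPRI };

	return (poll(&pfd, 1, timeout_ms));	/* >0 ready, 0 timeout, -1 error */
}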
55e303ae 4318
55e303ae 4319int
39236c6e 4320soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
55e303ae 4321{
39236c6e
A
4322#pragma unused(fp)
4323#if !CONFIG_MACF_SOCKET
4324#pragma unused(ctx)
4325#endif /* MAC_SOCKET */
91447636 4326 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
316670eb 4327 struct klist *skl;
2d21ac55 4328
91447636 4329 socket_lock(so, 1);
39236c6e
A
4330 so_update_last_owner_locked(so, PROC_NULL);
4331 so_update_policy(so);
55e303ae 4332
2d21ac55 4333#if CONFIG_MACF_SOCKET
39236c6e
A
4334 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
4335 kn, so) != 0) {
2d21ac55
A
4336 socket_unlock(so, 1);
4337 return (1);
4338 }
4339#endif /* MAC_SOCKET */
4340
55e303ae
A
4341 switch (kn->kn_filter) {
4342 case EVFILT_READ:
b0d623f7 4343 kn->kn_fop = &soread_filtops;
316670eb 4344 skl = &so->so_rcv.sb_sel.si_note;
55e303ae
A
4345 break;
4346 case EVFILT_WRITE:
4347 kn->kn_fop = &sowrite_filtops;
316670eb
A
4348 skl = &so->so_snd.sb_sel.si_note;
4349 break;
4350 case EVFILT_SOCK:
4351 kn->kn_fop = &sock_filtops;
4352 skl = &so->so_klist;
55e303ae
A
4353 break;
4354 default:
91447636 4355 socket_unlock(so, 1);
55e303ae
A
4356 return (1);
4357 }
4358
316670eb 4359 if (KNOTE_ATTACH(skl, kn)) {
39236c6e 4360 switch (kn->kn_filter) {
316670eb
A
4361 case EVFILT_READ:
4362 so->so_rcv.sb_flags |= SB_KNOTE;
4363 break;
4364 case EVFILT_WRITE:
4365 so->so_snd.sb_flags |= SB_KNOTE;
4366 break;
4367 case EVFILT_SOCK:
4368 so->so_flags |= SOF_KNOTE;
4369 break;
4370 default:
4371 socket_unlock(so, 1);
4372 return (1);
4373 }
4374 }
91447636 4375 socket_unlock(so, 1);
55e303ae
A
4376 return (0);
4377}
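
/*
 * A minimal userland sketch (not part of this file): registering the
 * EVFILT_READ filter attached by soo_kqfilter() above. Per the Radar
 * 6615193 note in filt_soread() below, the filter may be registered
 * before listen(), in which case kn_data reports the accept queue
 * length instead of buffered bytes.
 */
#include <sys/event.h>
#include <sys/time.h>

static int
kq_watch_read(int kq, int s)
{
	struct kevent kev;

	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));	/* -1 on failure */
}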
4378
4379static void
4380filt_sordetach(struct knote *kn)
4381{
91447636 4382 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
55e303ae 4383
91447636
A
4384 socket_lock(so, 1);
4385 if (so->so_rcv.sb_flags & SB_KNOTE)
55e303ae
A
4386 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
4387 so->so_rcv.sb_flags &= ~SB_KNOTE;
91447636 4388 socket_unlock(so, 1);
55e303ae
A
4389}
4390
4391/*ARGSUSED*/
4392static int
4393filt_soread(struct knote *kn, long hint)
4394{
91447636 4395 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
55e303ae 4396
91447636
A
4397 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4398 socket_lock(so, 1);
4399
b0d623f7
A
4400 if (so->so_options & SO_ACCEPTCONN) {
4401 int isempty;
4402
39236c6e
A
4403 /*
4404 * Radar 6615193: handle the listen case dynamically
4405 * for the kqueue read filter. This allows listen() to be
4406 * called after registering for kqueue EVFILT_READ.
b0d623f7
A
4407 */
4408
4409 kn->kn_data = so->so_qlen;
4410 isempty = ! TAILQ_EMPTY(&so->so_comp);
4411
4412 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4413 socket_unlock(so, 1);
4414
4415 return (isempty);
4416 }
4417
4418 /* socket isn't a listener */
4419
2d21ac55
A
4420 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
4421
91447636
A
4422 if (so->so_oobmark) {
4423 if (kn->kn_flags & EV_OOBAND) {
2d21ac55 4424 kn->kn_data -= so->so_oobmark;
91447636
A
4425 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4426 socket_unlock(so, 1);
4427 return (1);
4428 }
4429 kn->kn_data = so->so_oobmark;
4430 kn->kn_flags |= EV_OOBAND;
4431 } else {
91447636
A
4432 if (so->so_state & SS_CANTRCVMORE) {
4433 kn->kn_flags |= EV_EOF;
4434 kn->kn_fflags = so->so_error;
4435 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4436 socket_unlock(so, 1);
4437 return (1);
4438 }
55e303ae 4439 }
91447636
A
4440
4441 if (so->so_state & SS_RCVATMARK) {
4442 if (kn->kn_flags & EV_OOBAND) {
4443 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4444 socket_unlock(so, 1);
4445 return (1);
4446 }
4447 kn->kn_flags |= EV_OOBAND;
4448 } else if (kn->kn_flags & EV_OOBAND) {
4449 kn->kn_data = 0;
4450 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4451 socket_unlock(so, 1);
4452 return (0);
4453 }
4454
4455 if (so->so_error) { /* temporary udp error */
4456 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4457 socket_unlock(so, 1);
55e303ae 4458 return (1);
91447636
A
4459 }
4460
6d2010ae 4461 int64_t lowwat = so->so_rcv.sb_lowat;
39236c6e 4462 if (kn->kn_sfflags & NOTE_LOWAT) {
6d2010ae
A
4463 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
4464 lowwat = so->so_rcv.sb_hiwat;
4465 else if (kn->kn_sdata > lowwat)
4466 lowwat = kn->kn_sdata;
4467 }
39236c6e 4468
91447636
A
4469 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4470 socket_unlock(so, 1);
39236c6e 4471
6d2010ae 4472 return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
55e303ae
A
4473}
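
/*
 * A minimal userland sketch (not part of this file): NOTE_LOWAT, as
 * clamped between sb_lowat and sb_hiwat in filt_soread() above, raises
 * the number of bytes that must be buffered before EVFILT_READ fires.
 */
#include <sys/event.h>

static int
kq_watch_read_lowat(int kq, int s, int lowat)
{
	struct kevent kev;

	/* fire only once at least lowat bytes are readable */
	EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	return (kevent(kq, &kev, 1, NULL, 0, NULL));
}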
4474
4475static void
4476filt_sowdetach(struct knote *kn)
4477{
91447636
A
4478 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4479 socket_lock(so, 1);
55e303ae 4480
2d21ac55 4481 if (so->so_snd.sb_flags & SB_KNOTE)
55e303ae
A
4482 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
4483 so->so_snd.sb_flags &= ~SB_KNOTE;
91447636 4484 socket_unlock(so, 1);
55e303ae
A
4485}
4486
316670eb
A
4487int
4488so_wait_for_if_feedback(struct socket *so)
4489{
39236c6e 4490 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
316670eb
A
4491 (so->so_state & SS_ISCONNECTED)) {
4492 struct inpcb *inp = sotoinpcb(so);
4493 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
4494 return (1);
4495 }
4496 return (0);
4497}
4498
55e303ae
A
4499/*ARGSUSED*/
4500static int
4501filt_sowrite(struct knote *kn, long hint)
4502{
91447636 4503 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
316670eb 4504 int ret = 0;
91447636
A
4505
4506 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4507 socket_lock(so, 1);
55e303ae
A
4508
4509 kn->kn_data = sbspace(&so->so_snd);
4510 if (so->so_state & SS_CANTSENDMORE) {
2d21ac55 4511 kn->kn_flags |= EV_EOF;
55e303ae 4512 kn->kn_fflags = so->so_error;
316670eb
A
4513 ret = 1;
4514 goto out;
55e303ae 4515 }
91447636 4516 if (so->so_error) { /* temporary udp error */
316670eb
A
4517 ret = 1;
4518 goto out;
91447636 4519 }
55e303ae 4520 if (((so->so_state & SS_ISCONNECTED) == 0) &&
91447636 4521 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
316670eb
A
4522 ret = 0;
4523 goto out;
91447636 4524 }
6d2010ae 4525 int64_t lowwat = so->so_snd.sb_lowat;
39236c6e 4526 if (kn->kn_sfflags & NOTE_LOWAT) {
6d2010ae
A
4527 if (kn->kn_sdata > so->so_snd.sb_hiwat)
4528 lowwat = so->so_snd.sb_hiwat;
4529 else if (kn->kn_sdata > lowwat)
4530 lowwat = kn->kn_sdata;
4531 }
316670eb
A
4532 if (kn->kn_data >= lowwat) {
4533 if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
4534 ret = tcp_notsent_lowat_check(so);
4535 } else {
4536 ret = 1;
4537 }
4538 }
4539 if (so_wait_for_if_feedback(so))
4540 ret = 0;
4541out:
91447636
A
4542 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4543 socket_unlock(so, 1);
39236c6e 4544 return (ret);
316670eb
A
4545}
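
/*
 * A minimal userland sketch (not part of this file): when the
 * TCP_NOTSENT_LOWAT option sets SOF_NOTSENT_LOWAT, filt_sowrite()
 * above defers to tcp_notsent_lowat_check() instead of the plain
 * sbspace() comparison, so EVFILT_WRITE fires based on unsent data
 * rather than raw buffer space.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int
set_notsent_lowat(int s, int bytes)
{
	return (setsockopt(s, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
	    &bytes, sizeof (bytes)));	/* returns -1 on failure */
}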
4546
4547static void
4548filt_sockdetach(struct knote *kn)
4549{
4550 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4551 socket_lock(so, 1);
39236c6e 4552
316670eb
A
4553 if ((so->so_flags & SOF_KNOTE) != 0)
4554 if (KNOTE_DETACH(&so->so_klist, kn))
4555 so->so_flags &= ~SOF_KNOTE;
4556 socket_unlock(so, 1);
4557}
4558
4559static int
4560filt_sockev(struct knote *kn, long hint)
4561{
4562 int ret = 0, locked = 0;
4563 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
39236c6e 4564 long ev_hint = (hint & SO_FILT_HINT_EV);
316670eb
A
4565
4566 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
4567 socket_lock(so, 1);
4568 locked = 1;
4569 }
4570
39236c6e 4571 if (ev_hint & SO_FILT_HINT_CONNRESET) {
316670eb
A
4572 if (kn->kn_sfflags & NOTE_CONNRESET)
4573 kn->kn_fflags |= NOTE_CONNRESET;
39236c6e
A
4574 }
4575 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
316670eb
A
4576 if (kn->kn_sfflags & NOTE_TIMEOUT)
4577 kn->kn_fflags |= NOTE_TIMEOUT;
39236c6e
A
4578 }
4579 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
316670eb
A
4580 if (kn->kn_sfflags & NOTE_NOSRCADDR)
4581 kn->kn_fflags |= NOTE_NOSRCADDR;
39236c6e
A
4582 }
4583 if (ev_hint & SO_FILT_HINT_IFDENIED) {
316670eb
A
4584 if ((kn->kn_sfflags & NOTE_IFDENIED))
4585 kn->kn_fflags |= NOTE_IFDENIED;
39236c6e
A
4586 }
4587 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
316670eb
A
4588 if (kn->kn_sfflags & NOTE_KEEPALIVE)
4589 kn->kn_fflags |= NOTE_KEEPALIVE;
4590 }
39236c6e
A
4591 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
4592 if (kn->kn_sfflags & NOTE_ADAPTIVE_WTIMO)
4593 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
4594 }
4595 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
4596 if (kn->kn_sfflags & NOTE_ADAPTIVE_RTIMO)
4597 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
4598 }
4599 if (ev_hint & SO_FILT_HINT_CONNECTED) {
4600 if (kn->kn_sfflags & NOTE_CONNECTED)
4601 kn->kn_fflags |= NOTE_CONNECTED;
4602 }
4603 if (ev_hint & SO_FILT_HINT_DISCONNECTED) {
4604 if (kn->kn_sfflags & NOTE_DISCONNECTED)
4605 kn->kn_fflags |= NOTE_DISCONNECTED;
4606 }
4607 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
4608 if (so->so_proto != NULL &&
4609 (so->so_proto->pr_flags & PR_EVCONNINFO) &&
4610 (kn->kn_sfflags & NOTE_CONNINFO_UPDATED))
4611 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
4612 }
316670eb
A
4613
4614 if ((kn->kn_sfflags & NOTE_READCLOSED) &&
39236c6e 4615 (so->so_state & SS_CANTRCVMORE))
316670eb
A
4616 kn->kn_fflags |= NOTE_READCLOSED;
4617
4618 if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
39236c6e 4619 (so->so_state & SS_CANTSENDMORE))
316670eb
A
4620 kn->kn_fflags |= NOTE_WRITECLOSED;
4621
4622 if ((kn->kn_sfflags & NOTE_SUSPEND) &&
39236c6e 4623 ((ev_hint & SO_FILT_HINT_SUSPEND) ||
316670eb 4624 (so->so_flags & SOF_SUSPENDED))) {
39236c6e 4625 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
316670eb
A
4626 kn->kn_fflags |= NOTE_SUSPEND;
4627 }
4628
4629 if ((kn->kn_sfflags & NOTE_RESUME) &&
39236c6e 4630 ((ev_hint & SO_FILT_HINT_RESUME) ||
316670eb 4631 (so->so_flags & SOF_SUSPENDED) == 0)) {
39236c6e 4632 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
316670eb
A
4633 kn->kn_fflags |= NOTE_RESUME;
4634 }
4635
4636 if (so->so_error != 0) {
4637 ret = 1;
4638 kn->kn_data = so->so_error;
4639 kn->kn_flags |= EV_EOF;
4640 } else {
4641 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
4642 }
4643
4644 if (kn->kn_fflags != 0)
4645 ret = 1;
4646
4647 if (locked)
4648 socket_unlock(so, 1);
4649
39236c6e 4650 return (ret);
316670eb
A
4651}
4652
4653void
39236c6e
A
4654get_sockev_state(struct socket *so, u_int32_t *statep)
4655{
316670eb
A
4656 u_int32_t state = *(statep);
4657
39236c6e 4658 if (so->so_state & SS_ISCONNECTED)
316670eb 4659 state |= SOCKEV_CONNECTED;
39236c6e 4660 else
316670eb 4661 state &= ~(SOCKEV_CONNECTED);
39236c6e 4662 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
316670eb 4663 *(statep) = state;
55e303ae
A
4664}
4665
39236c6e
A
4666#define SO_LOCK_HISTORY_STR_LEN \
4667 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
b0d623f7 4668
39236c6e
A
4669__private_extern__ const char *
4670solockhistory_nr(struct socket *so)
55e303ae 4671{
39236c6e
A
4672 size_t n = 0;
4673 int i;
4674 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
4675
4676 bzero(lock_history_str, sizeof (lock_history_str));
4677 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
4678 n += snprintf(lock_history_str + n,
4679 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
4680 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
4681 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
b0d623f7 4682 }
39236c6e 4683 return (lock_history_str);
55e303ae
A
4684}
4685
91447636 4686int
2d21ac55 4687socket_lock(struct socket *so, int refcount)
91447636 4688{
b0d623f7
A
4689 int error = 0;
4690 void *lr_saved;
0c530ab8 4691
b0d623f7 4692 lr_saved = __builtin_return_address(0);
91447636
A
4693
4694 if (so->so_proto->pr_lock) {
4695 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
2d21ac55 4696 } else {
91447636 4697#ifdef MORE_LOCKING_DEBUG
2d21ac55
A
4698 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
4699 LCK_MTX_ASSERT_NOTOWNED);
91447636
A
4700#endif
4701 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
4702 if (refcount)
4703 so->so_usecount++;
b0d623f7 4704 so->lock_lr[so->next_lock_lr] = lr_saved;
0c530ab8 4705 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
91447636
A
4706 }
4707
2d21ac55 4708 return (error);
91447636
A
4709}
4710
4711int
2d21ac55 4712socket_unlock(struct socket *so, int refcount)
91447636 4713{
b0d623f7
A
4714 int error = 0;
4715 void *lr_saved;
2d21ac55 4716 lck_mtx_t *mutex_held;
91447636 4717
b0d623f7 4718 lr_saved = __builtin_return_address(0);
91447636 4719
39236c6e
A
4720 if (so->so_proto == NULL) {
4721 panic("%s: null so_proto so=%p\n", __func__, so);
4722 /* NOTREACHED */
4723 }
91447636 4724
2d21ac55 4725 if (so && so->so_proto->pr_unlock) {
91447636 4726 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
2d21ac55 4727 } else {
91447636
A
4728 mutex_held = so->so_proto->pr_domain->dom_mtx;
4729#ifdef MORE_LOCKING_DEBUG
4730 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4731#endif
b0d623f7 4732 so->unlock_lr[so->next_unlock_lr] = lr_saved;
0c530ab8
A
4733 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
4734
91447636 4735 if (refcount) {
39236c6e
A
4736 if (so->so_usecount <= 0) {
4737 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
4738 "lrh=%s", __func__, so->so_usecount, so,
4739 SOCK_DOM(so), so->so_type,
4740 SOCK_PROTO(so), solockhistory_nr(so));
4741 /* NOTREACHED */
4742 }
4743
91447636 4744 so->so_usecount--;
39236c6e 4745 if (so->so_usecount == 0)
91447636 4746 sofreelastref(so, 1);
91447636
A
4747 }
4748 lck_mtx_unlock(mutex_held);
4749 }
4750
2d21ac55 4751 return (error);
91447636 4752}
2d21ac55
A
4753
4754/* Called with socket locked, will unlock socket */
91447636 4755void
2d21ac55 4756sofree(struct socket *so)
91447636 4757{
2d21ac55 4758 lck_mtx_t *mutex_held;
39236c6e 4759
2d21ac55 4760 if (so->so_proto->pr_getlock != NULL)
91447636 4761 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
2d21ac55 4762 else
91447636
A
4763 mutex_held = so->so_proto->pr_domain->dom_mtx;
4764 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
2d21ac55 4765
91447636
A
4766 sofreelastref(so, 0);
4767}
4768
4769void
2d21ac55 4770soreference(struct socket *so)
91447636
A
4771{
4772 socket_lock(so, 1); /* lock socket and take one reference */
4773 socket_unlock(so, 0); /* unlock only */
4774}
4775
4776void
2d21ac55 4777sodereference(struct socket *so)
91447636
A
4778{
4779 socket_lock(so, 0);
4780 socket_unlock(so, 1);
4781}
2d21ac55
A
4782
4783/*
4784 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
4785 * possibility of using jumbo clusters. Caller must ensure to hold
4786 * the socket lock.
4787 */
4788void
4789somultipages(struct socket *so, boolean_t set)
4790{
4791 if (set)
4792 so->so_flags |= SOF_MULTIPAGES;
4793 else
4794 so->so_flags &= ~SOF_MULTIPAGES;
4795}
b0d623f7
A
4796
4797int
4798 so_isdstlocal(struct socket *so)
4799 {
4800 struct inpcb *inp = (struct inpcb *)so->so_pcb;
4801
39236c6e
A
4802 if (SOCK_DOM(so) == PF_INET)
4803 return (inaddr_local(inp->inp_faddr));
4804 else if (SOCK_DOM(so) == PF_INET6)
4805 return (in6addr_local(&inp->in6p_faddr));
4806
4807 return (0);
b0d623f7 4808}
6d2010ae
A
4809
4810int
4811sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
4812{
39236c6e 4813 struct sockbuf *rcv, *snd;
6d2010ae
A
4814 int err = 0, defunct;
4815
39236c6e
A
4816 rcv = &so->so_rcv;
4817 snd = &so->so_snd;
4818
6d2010ae
A
4819 defunct = (so->so_flags & SOF_DEFUNCT);
4820 if (defunct) {
39236c6e 4821 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6d2010ae 4822 panic("%s: SB_DROP not set", __func__);
39236c6e
A
4823 /* NOTREACHED */
4824 }
6d2010ae
A
4825 goto done;
4826 }
4827
4828 if (so->so_flags & SOF_NODEFUNCT) {
4829 if (noforce) {
4830 err = EOPNOTSUPP;
39236c6e
A
4831 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
4832 "so 0x%llx [%d,%d] is not eligible for defunct "
4833 "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
4834 level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4835 SOCK_DOM(so), SOCK_TYPE(so), err));
6d2010ae
A
4836 return (err);
4837 }
4838 so->so_flags &= ~SOF_NODEFUNCT;
39236c6e
A
4839 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
4840 "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
4841 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4842 SOCK_DOM(so), SOCK_TYPE(so)));
6d2010ae
A
4843 }
4844
4845 so->so_flags |= SOF_DEFUNCT;
39236c6e 4846
6d2010ae 4847 /* Prevent further data from being appended to the socket buffers */
39236c6e
A
4848 snd->sb_flags |= SB_DROP;
4849 rcv->sb_flags |= SB_DROP;
4850
4851 /* Flush any existing data in the socket buffers */
4852 if (rcv->sb_cc != 0) {
4853 rcv->sb_flags &= ~SB_SEL;
4854 selthreadclear(&rcv->sb_sel);
4855 sbrelease(rcv);
4856 }
4857 if (snd->sb_cc != 0) {
4858 snd->sb_flags &= ~SB_SEL;
4859 selthreadclear(&snd->sb_sel);
4860 sbrelease(snd);
4861 }
6d2010ae
A
4862
4863done:
39236c6e
A
4864 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
4865 "defunct\n", __func__, proc_selfpid(), proc_pid(p), level,
4866 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
6d2010ae
A
4867 defunct ? "is already" : "marked as"));
4868
4869 return (err);
4870}
4871
4872int
4873sodefunct(struct proc *p, struct socket *so, int level)
4874{
4875 struct sockbuf *rcv, *snd;
4876
39236c6e 4877 if (!(so->so_flags & SOF_DEFUNCT)) {
6d2010ae 4878 panic("%s improperly called", __func__);
39236c6e
A
4879 /* NOTREACHED */
4880 }
6d2010ae
A
4881 if (so->so_state & SS_DEFUNCT)
4882 goto done;
4883
4884 rcv = &so->so_rcv;
4885 snd = &so->so_snd;
4886
39236c6e
A
4887 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
4888 char s[MAX_IPv6_STR_LEN];
4889 char d[MAX_IPv6_STR_LEN];
4890 struct inpcb *inp = sotoinpcb(so);
4891
4892 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
4893 "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
4894 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
4895 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4896 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
4897 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
4898 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
4899 s, sizeof (s)), ntohs(inp->in6p_lport),
4900 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
4901 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
4902 d, sizeof (d)), ntohs(inp->in6p_fport),
4903 (uint32_t)rcv->sb_sel.si_flags,
4904 (uint32_t)snd->sb_sel.si_flags,
4905 rcv->sb_flags, snd->sb_flags));
4906 } else {
4907 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
4908 "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
4909 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
4910 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4911 SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
4912 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
4913 snd->sb_flags));
4914 }
6d2010ae
A
4915
4916 /*
4917 * Unwedge threads blocked on sbwait() and sb_lock().
4918 */
4919 sbwakeup(rcv);
4920 sbwakeup(snd);
4921
4922 if (rcv->sb_flags & SB_LOCK)
39236c6e 4923 sbunlock(rcv, TRUE); /* keep socket locked */
6d2010ae 4924 if (snd->sb_flags & SB_LOCK)
39236c6e 4925 sbunlock(snd, TRUE); /* keep socket locked */
6d2010ae
A
4926
4927 /*
4928 * Flush the buffers and disconnect. We explicitly call shutdown
4929 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
4930 * states are set for the socket. This would also flush out data
4931 * hanging off the receive list of this socket.
4932 */
4933 (void) soshutdownlock(so, SHUT_RD);
4934 (void) soshutdownlock(so, SHUT_WR);
4935 (void) sodisconnectlocked(so);
4936
4937 /*
4938 * Explicitly handle connectionless-protocol disconnection
4939 * and release any remaining data in the socket buffers.
4940 */
4941 if (!(so->so_state & SS_ISDISCONNECTED))
4942 (void) soisdisconnected(so);
4943
4944 if (so->so_error == 0)
4945 so->so_error = EBADF;
4946
39236c6e
A
4947 if (rcv->sb_cc != 0) {
4948 rcv->sb_flags &= ~SB_SEL;
4949 selthreadclear(&rcv->sb_sel);
6d2010ae 4950 sbrelease(rcv);
39236c6e
A
4951 }
4952 if (snd->sb_cc != 0) {
4953 snd->sb_flags &= ~SB_SEL;
4954 selthreadclear(&snd->sb_sel);
6d2010ae 4955 sbrelease(snd);
39236c6e 4956 }
6d2010ae
A
4957 so->so_state |= SS_DEFUNCT;
4958
4959done:
4960 return (0);
4961}
316670eb
A
4962
4963__private_extern__ int
4964so_set_recv_anyif(struct socket *so, int optval)
4965{
4966 int ret = 0;
4967
4968#if INET6
39236c6e 4969 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
316670eb 4970#else
39236c6e 4971 if (SOCK_DOM(so) == PF_INET) {
316670eb
A
4972#endif /* !INET6 */
4973 if (optval)
4974 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
4975 else
4976 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
316670eb
A
4977 }
4978
4979 return (ret);
4980}
4981
4982__private_extern__ int
4983so_get_recv_anyif(struct socket *so)
4984{
4985 int ret = 0;
4986
4987#if INET6
39236c6e 4988 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
316670eb 4989#else
39236c6e 4990 if (SOCK_DOM(so) == PF_INET) {
316670eb
A
4991#endif /* !INET6 */
4992 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
4993 }
4994
4995 return (ret);
4996}
39236c6e
A
4997
4998int
4999so_set_restrictions(struct socket *so, uint32_t vals)
5000{
5001 int nocell_old, nocell_new;
5002 int ret = 0;
5003
5004 /*
5005 * Deny-type restrictions are trapdoors; once set they cannot be
5006 * unset for the lifetime of the socket. This allows them to be
5007 * issued by a framework on behalf of the application without
5008 * having to worry that they can be undone.
5009 *
5010 * Note here that socket-level restrictions override any protocol-
5011 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
5012 * restriction issued on the socket has a higher precedence
5013 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
5014 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
5015 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
5016 */
5017 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
5018 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
5019 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
5020 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
5021
5022 /* other than deny cellular, there's nothing more to do */
5023 if ((nocell_new - nocell_old) == 0)
5024 return (ret);
5025
5026 /* we can only set, not clear restrictions */
5027 VERIFY((nocell_new - nocell_old) > 0);
5028
5029#if INET6
5030 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5031#else
5032 if (SOCK_DOM(so) == PF_INET) {
5033#endif /* !INET6 */
5034 /* if deny cellular is now set, do what's needed for INPCB */
5035 inp_set_nocellular(sotoinpcb(so));
5036 }
5037
5038 return (ret);
5039}
5040
5041uint32_t
5042so_get_restrictions(struct socket *so)
5043{
5044 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
5045 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
5046}
5047
5048struct sockaddr_entry *
5049sockaddrentry_alloc(int how)
5050{
5051 struct sockaddr_entry *se;
5052
5053 se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
5054 if (se != NULL)
5055 bzero(se, se_zone_size);
5056
5057 return (se);
5058}
5059
5060void
5061sockaddrentry_free(struct sockaddr_entry *se)
5062{
5063 if (se->se_addr != NULL) {
5064 FREE(se->se_addr, M_SONAME);
5065 se->se_addr = NULL;
5066 }
5067 zfree(se_zone, se);
5068}
5069
5070struct sockaddr_entry *
5071sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
5072{
5073 struct sockaddr_entry *dst_se;
5074
5075 dst_se = sockaddrentry_alloc(how);
5076 if (dst_se != NULL) {
5077 int len = src_se->se_addr->sa_len;
5078
5079 MALLOC(dst_se->se_addr, struct sockaddr *,
5080 len, M_SONAME, how | M_ZERO);
5081 if (dst_se->se_addr != NULL) {
5082 bcopy(src_se->se_addr, dst_se->se_addr, len);
5083 } else {
5084 sockaddrentry_free(dst_se);
5085 dst_se = NULL;
5086 }
5087 }
5088
5089 return (dst_se);
5090}
5091
5092struct sockaddr_list *
5093sockaddrlist_alloc(int how)
5094{
5095 struct sockaddr_list *sl;
5096
5097 sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
5098 if (sl != NULL) {
5099 bzero(sl, sl_zone_size);
5100 TAILQ_INIT(&sl->sl_head);
5101 }
5102 return (sl);
5103}
5104
5105void
5106sockaddrlist_free(struct sockaddr_list *sl)
5107{
5108 struct sockaddr_entry *se, *tse;
5109
5110 TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
5111 sockaddrlist_remove(sl, se);
5112 sockaddrentry_free(se);
5113 }
5114 VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
5115 zfree(sl_zone, sl);
5116}
5117
5118void
5119sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
5120{
5121 VERIFY(!(se->se_flags & SEF_ATTACHED));
5122 se->se_flags |= SEF_ATTACHED;
5123 TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
5124 sl->sl_cnt++;
5125 VERIFY(sl->sl_cnt != 0);
5126}
5127
5128void
5129sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
5130{
5131 VERIFY(se->se_flags & SEF_ATTACHED);
5132 se->se_flags &= ~SEF_ATTACHED;
5133 VERIFY(sl->sl_cnt != 0);
5134 sl->sl_cnt--;
5135 TAILQ_REMOVE(&sl->sl_head, se, se_link);
5136}
5137
5138struct sockaddr_list *
5139sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
5140{
5141 struct sockaddr_entry *src_se, *tse;
5142 struct sockaddr_list *dst_sl;
5143
5144 dst_sl = sockaddrlist_alloc(how);
5145 if (dst_sl == NULL)
5146 return (NULL);
5147
5148 TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
5149 struct sockaddr_entry *dst_se;
5150
5151 if (src_se->se_addr == NULL)
5152 continue;
5153
5154 dst_se = sockaddrentry_dup(src_se, how);
5155 if (dst_se == NULL) {
5156 sockaddrlist_free(dst_sl);
5157 return (NULL);
5158 }
5159
5160 sockaddrlist_insert(dst_sl, dst_se);
5161 }
5162 VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
5163
5164 return (dst_sl);
5165}
5166
5167int
5168so_set_effective_pid(struct socket *so, int epid, struct proc *p)
5169{
5170 struct proc *ep = PROC_NULL;
5171 int error = 0;
5172
5173 /* pid 0 is reserved for kernel */
5174 if (epid == 0) {
5175 error = EINVAL;
5176 goto done;
5177 }
5178
5179 /*
5180 * If this is an in-kernel socket, prevent its delegate
5181 * association from changing unless the socket option is
5182 * coming from within the kernel itself.
5183 */
5184 if (so->last_pid == 0 && p != kernproc) {
5185 error = EACCES;
5186 goto done;
5187 }
5188
5189 /*
5190 * If this is issued by a process that's recorded as the
5191 * real owner of the socket, or if the pid is the same as
5192 * the process's own pid, then proceed. Otherwise ensure
5193 * that the issuing process has the necessary privileges.
5194 */
5195 if (epid != so->last_pid || epid != proc_pid(p)) {
5196 if ((error = priv_check_cred(kauth_cred_get(),
5197 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
5198 error = EACCES;
5199 goto done;
5200 }
5201 }
5202
5203 /* Find the process that corresponds to the effective pid */
5204 if ((ep = proc_find(epid)) == PROC_NULL) {
5205 error = ESRCH;
5206 goto done;
5207 }
5208
5209 /*
5210 * If a process tries to delegate the socket to itself, then
5211 * there's really nothing to do; treat it as a way for the
5212 * delegate association to be cleared. Note that we check
5213 * the passed-in proc rather than calling proc_selfpid(),
5214 * as we need to check the process issuing the socket option
5215 * which could be kernproc. Given that we don't allow 0 for
5216 * effective pid, it means that a delegated in-kernel socket
5217 * stays delegated during its lifetime (which is probably OK.)
5218 */
5219 if (epid == proc_pid(p)) {
5220 so->so_flags &= ~SOF_DELEGATED;
5221 so->e_upid = 0;
5222 so->e_pid = 0;
5223 uuid_clear(so->e_uuid);
5224 } else {
5225 so->so_flags |= SOF_DELEGATED;
5226 so->e_upid = proc_uniqueid(ep);
5227 so->e_pid = proc_pid(ep);
5228 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
5229 }
5230
5231done:
5232 if (error == 0 && net_io_policy_log) {
5233 uuid_string_t buf;
5234
5235 uuid_unparse(so->e_uuid, buf);
5236 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
5237 "euuid %s%s\n", __func__, proc_name_address(p),
5238 proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5239 SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf,
5240 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
5241 } else if (error != 0 && net_io_policy_log) {
5242 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
5243 "ERROR (%d)\n", __func__, proc_name_address(p),
5244 proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5245 SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" :
5246 proc_name_address(ep), error);
5247 }
5248
5249 if (ep != PROC_NULL)
5250 proc_rele(ep);
5251
5252 return (error);
5253}
5254
5255int
5256so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
5257{
5258 uuid_string_t buf;
5259 uuid_t uuid;
5260 int error = 0;
5261
5262 /* UUID must not be all-zeroes (reserved for kernel) */
5263 if (uuid_is_null(euuid)) {
5264 error = EINVAL;
5265 goto done;
5266 }
5267
5268 /*
5269 * If this is an in-kernel socket, prevent its delegate
5270 * association from changing unless the socket option is
5271 * coming from within the kernel itself.
5272 */
5273 if (so->last_pid == 0 && p != kernproc) {
5274 error = EACCES;
5275 goto done;
5276 }
5277
5278 /* Get the UUID of the issuing process */
5279 proc_getexecutableuuid(p, uuid, sizeof (uuid));
5280
5281 /*
5282 * If this is issued by a process that's recorded as the
5283 * real owner of the socket, or if the uuid is the same as
5284 * the process's own uuid, then proceed. Otherwise ensure
5285 * that the issuing process has the necessary privileges.
5286 */
5287 if (uuid_compare(euuid, so->last_uuid) != 0 ||
5288 uuid_compare(euuid, uuid) != 0) {
5289 if ((error = priv_check_cred(kauth_cred_get(),
5290 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
5291 error = EACCES;
5292 goto done;
5293 }
5294 }
5295
5296 /*
5297 * If a process tries to delegate the socket to itself, then
5298 * there's really nothing to do; treat it as a way for the
5299 * delegate association to be cleared. Note that we check
5300 * the uuid of the passed-in proc rather than that of the
5301 * current process, as we need to check the process issuing
5302 * the socket option which could be kernproc itself. Given
5303 * that we don't allow 0 for effective uuid, it means that
5304 * a delegated in-kernel socket stays delegated during its
5305 * lifetime (which is okay.)
5306 */
5307 if (uuid_compare(euuid, uuid) == 0) {
5308 so->so_flags &= ~SOF_DELEGATED;
5309 so->e_upid = 0;
5310 so->e_pid = 0;
5311 uuid_clear(so->e_uuid);
5312 } else {
5313 so->so_flags |= SOF_DELEGATED;
5314 /*
5315 * Unlike so_set_effective_pid(), we only have the UUID
5316 * here and the process ID is not known. Inherit the
5317 * real {pid,upid} of the socket.
5318 */
5319 so->e_upid = so->last_upid;
5320 so->e_pid = so->last_pid;
5321 uuid_copy(so->e_uuid, euuid);
5322 }
5323
5324done:
5325 if (error == 0 && net_io_policy_log) {
5326 uuid_unparse(so->e_uuid, buf);
5327 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
5328 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
5329 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5330 SOCK_TYPE(so), so->e_pid, buf,
5331 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
5332 } else if (error != 0 && net_io_policy_log) {
5333 uuid_unparse(euuid, buf);
5334 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
5335 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
5336 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5337 SOCK_TYPE(so), buf, error);
5338 }
5339
5340 return (error);
5341}
5342
5343void
5344netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
5345 uint32_t ev_datalen)
5346{
5347 struct kev_msg ev_msg;
5348
5349 /*
5350 * A netpolicy event always starts with a netpolicy_event_data
5351 * structure, but the caller can provide for a longer event
5352 * structure to post, depending on the event code.
5353 */
5354 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
5355
5356 bzero(&ev_msg, sizeof (ev_msg));
5357 ev_msg.vendor_code = KEV_VENDOR_APPLE;
5358 ev_msg.kev_class = KEV_NETWORK_CLASS;
5359 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
5360 ev_msg.event_code = ev_code;
5361
5362 ev_msg.dv[0].data_ptr = ev_data;
5363 ev_msg.dv[0].data_length = ev_datalen;
5364
5365 kev_post_msg(&ev_msg);
5366}