1/*
2 * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/filedesc.h>
73#include <sys/proc.h>
74#include <sys/proc_internal.h>
75#include <sys/kauth.h>
76#include <sys/file_internal.h>
77#include <sys/fcntl.h>
78#include <sys/malloc.h>
79#include <sys/mbuf.h>
80#include <sys/domain.h>
81#include <sys/kernel.h>
82#include <sys/event.h>
83#include <sys/poll.h>
84#include <sys/protosw.h>
85#include <sys/socket.h>
86#include <sys/socketvar.h>
87#include <sys/resourcevar.h>
88#include <sys/signalvar.h>
89#include <sys/sysctl.h>
90#include <sys/syslog.h>
91#include <sys/uio.h>
92#include <sys/ev.h>
93#include <sys/kdebug.h>
94#include <sys/un.h>
95#include <sys/user.h>
96#include <sys/priv.h>
97#include <sys/kern_event.h>
98#include <net/route.h>
99#include <net/init.h>
100#include <net/ntstat.h>
101#include <netinet/in.h>
102#include <netinet/in_pcb.h>
103#include <netinet/ip6.h>
104#include <netinet6/ip6_var.h>
105#include <netinet/flow_divert.h>
106#include <kern/zalloc.h>
107#include <kern/locks.h>
108#include <machine/limits.h>
109#include <libkern/OSAtomic.h>
110#include <pexpert/pexpert.h>
111#include <kern/assert.h>
112#include <kern/task.h>
113#include <sys/kpi_mbuf.h>
114#include <sys/mcache.h>
115
116#if CONFIG_MACF
117#include <security/mac.h>
118#include <security/mac_framework.h>
119#endif /* MAC */
120
121#if MULTIPATH
122#include <netinet/mp_pcb.h>
123#endif /* MULTIPATH */
124
125/* TODO: this should be in a header file somewhere */
126extern char *proc_name_address(void *p);
127
128static u_int32_t so_cache_hw; /* High water mark for socache */
129static u_int32_t so_cache_timeouts; /* number of timeouts */
130static u_int32_t so_cache_max_freed; /* max freed per timeout */
131static u_int32_t cached_sock_count = 0;
132STAILQ_HEAD(, socket) so_cache_head;
133int max_cached_sock_count = MAX_CACHED_SOCKETS;
134static u_int32_t so_cache_time;
135static int socketinit_done;
136static struct zone *so_cache_zone;
137
138static lck_grp_t *so_cache_mtx_grp;
139static lck_attr_t *so_cache_mtx_attr;
140static lck_grp_attr_t *so_cache_mtx_grp_attr;
141static lck_mtx_t *so_cache_mtx;
142
143#include <machine/limits.h>
144
145static void filt_sordetach(struct knote *kn);
146static int filt_soread(struct knote *kn, long hint);
147static void filt_sowdetach(struct knote *kn);
148static int filt_sowrite(struct knote *kn, long hint);
149static void filt_sockdetach(struct knote *kn);
150static int filt_sockev(struct knote *kn, long hint);
151
152static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
153static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
154
155static struct filterops soread_filtops = {
156 .f_isfd = 1,
157 .f_detach = filt_sordetach,
158 .f_event = filt_soread,
159};
160
161static struct filterops sowrite_filtops = {
162 .f_isfd = 1,
163 .f_detach = filt_sowdetach,
164 .f_event = filt_sowrite,
165};
166
167static struct filterops sock_filtops = {
168 .f_isfd = 1,
169 .f_detach = filt_sockdetach,
170 .f_event = filt_sockev,
171};
172
173#define EVEN_MORE_LOCKING_DEBUG 0
174int socket_debug = 0;
175static int socket_zone = M_SOCKET;
176so_gen_t so_gencnt; /* generation count for sockets */
177
178MALLOC_DEFINE(M_SONAME, "soname", "socket name");
179MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
180
181#define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
182#define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
183#define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
184#define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
185#define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
186#define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
187#define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
188
189#define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
190
191SYSCTL_DECL(_kern_ipc);
192
193int somaxconn = SOMAXCONN;
194SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
195 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
196
197/* Should we get a maximum also ??? */
198static int sosendmaxchain = 65536;
199static int sosendminchain = 16384;
200static int sorecvmincopy = 16384;
201SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
202 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
203SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
204 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
205
206/*
207 * Set to enable jumbo clusters (if available) for large writes when
208 * the socket is marked with SOF_MULTIPAGES; see below.
209 */
210int sosendjcl = 1;
211SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
212 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
213
214/*
215 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
216 * writes on the socket for all protocols on any network interfaces,
217 * depending upon sosendjcl above. Be extra careful when setting this
218 * to 1, because sending down packets that cross physical pages down to
219 * broken drivers (those that falsely assume that the physical pages
220 * are contiguous) might lead to system panics or silent data corruption.
221 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
222 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
223 * capable. Set this to 1 only for testing/debugging purposes.
224 */
225int sosendjcl_ignore_capab = 0;
226SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
227 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
228
229int sodefunctlog = 0;
230SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
231 &sodefunctlog, 0, "");
232
233int sothrottlelog = 0;
234SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
235 &sothrottlelog, 0, "");
236
237int sorestrictrecv = 1;
238SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
239 &sorestrictrecv, 0, "Enable inbound interface restrictions");
240
241/*
242 * Socket operation routines.
243 * These routines are called by the routines in
244 * sys_socket.c or from a system process, and
245 * implement the semantics of socket operations by
246 * switching out to the protocol specific routines.
247 */
248
249/* sys_generic.c */
250extern void postevent(struct socket *, struct sockbuf *, int);
251extern void evsofree(struct socket *);
252extern int tcp_notsent_lowat_check(struct socket *so);
253extern struct inpcbinfo tcbinfo;
254
255/* TODO: these should be in header file */
256extern int get_inpcb_str_size(void);
257extern int get_tcp_str_size(void);
258
259static unsigned int sl_zone_size; /* size of sockaddr_list */
260static struct zone *sl_zone; /* zone for sockaddr_list */
261
262static unsigned int se_zone_size; /* size of sockaddr_entry */
263static struct zone *se_zone; /* zone for sockaddr_entry */
264
265vm_size_t so_cache_zone_element_size;
266
267static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, user_ssize_t *);
268static void cached_sock_alloc(struct socket **, int);
269static void cached_sock_free(struct socket *);
270
271/*
272 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
273 * setting the DSCP code on the packet based on the service class; see
274 * <rdar://problem/11277343> for details.
275 */
276__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
277SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
278 &sotcdb, 0, "");
279
280void
281socketinit(void)
282{
283 if (socketinit_done) {
284 printf("socketinit: already called...\n");
285 return;
286 }
287 socketinit_done = 1;
288
289 PE_parse_boot_argn("socket_debug", &socket_debug,
290 sizeof (socket_debug));
291
292 /*
293 * allocate lock group attribute and group for socket cache mutex
294 */
295 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
296 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
297 so_cache_mtx_grp_attr);
298
299 /*
300 * allocate the lock attribute for socket cache mutex
301 */
302 so_cache_mtx_attr = lck_attr_alloc_init();
303
304 /* cached sockets mutex */
305 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
306 if (so_cache_mtx == NULL) {
307 panic("%s: unable to allocate so_cache_mtx\n", __func__);
308 /* NOTREACHED */
309 }
310 STAILQ_INIT(&so_cache_head);
311
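/*
 * Each cached element is a single block large enough for the socket
 * plus shadow inpcb and tcpcb areas; the extra 4 bytes per structure
 * leave slack for the longword alignment done in cached_sock_alloc().
 */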
312 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
313 + get_inpcb_str_size() + 4 + get_tcp_str_size());
314
315 so_cache_zone = zinit(so_cache_zone_element_size,
316 (120000 * so_cache_zone_element_size), 8192, "socache zone");
317 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
318 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
319
320 sl_zone_size = sizeof (struct sockaddr_list);
321 if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
322 "sockaddr_list")) == NULL) {
323 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
324 /* NOTREACHED */
325 }
326 zone_change(sl_zone, Z_CALLERACCT, FALSE);
327 zone_change(sl_zone, Z_EXPAND, TRUE);
328
329 se_zone_size = sizeof (struct sockaddr_entry);
330 if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
331 "sockaddr_entry")) == NULL) {
332 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
333 /* NOTREACHED */
334 }
335 zone_change(se_zone, Z_CALLERACCT, FALSE);
336 zone_change(se_zone, Z_EXPAND, TRUE);
337
338
339 in_pcbinit();
340 sflt_init();
341 socket_tclass_init();
342#if MULTIPATH
343 mp_pcbinit();
344#endif /* MULTIPATH */
345}
346
347static void
348cached_sock_alloc(struct socket **so, int waitok)
349{
350 caddr_t temp;
351 uintptr_t offset;
352
353 lck_mtx_lock(so_cache_mtx);
354
355 if (!STAILQ_EMPTY(&so_cache_head)) {
356 VERIFY(cached_sock_count > 0);
357
358 *so = STAILQ_FIRST(&so_cache_head);
359 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
360 STAILQ_NEXT((*so), so_cache_ent) = NULL;
361
362 cached_sock_count--;
363 lck_mtx_unlock(so_cache_mtx);
364
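/*
 * Reusing a cached socket: zero the socket structure but preserve
 * its saved pcb pointer, which still refers to the pcb storage
 * embedded in the same cache block.
 */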
365 temp = (*so)->so_saved_pcb;
366 bzero((caddr_t)*so, sizeof (struct socket));
367
368 (*so)->so_saved_pcb = temp;
369 } else {
370
371 lck_mtx_unlock(so_cache_mtx);
372
373 if (waitok)
374 *so = (struct socket *)zalloc(so_cache_zone);
375 else
376 *so = (struct socket *)zalloc_noblock(so_cache_zone);
377
378 if (*so == NULL)
379 return;
380
381 bzero((caddr_t)*so, sizeof (struct socket));
382
383 /*
384 * Define offsets for extra structures into our
385 * single block of memory. Align extra structures
386 * on longword boundaries.
387 */
388
389 offset = (uintptr_t)*so;
390 offset += sizeof (struct socket);
391
392 offset = ALIGN(offset);
393
394 (*so)->so_saved_pcb = (caddr_t)offset;
395 offset += get_inpcb_str_size();
396
397 offset = ALIGN(offset);
398
399 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
400 (caddr_t)offset;
401 }
402
403 (*so)->cached_in_sock_layer = true;
404}
405
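/*
 * Return a socket to the cache if there is room; otherwise free it
 * back to the zone. Cached entries are timestamped so that
 * so_cache_timer() can reap the stale ones.
 */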
406static void
407cached_sock_free(struct socket *so)
408{
409
410 lck_mtx_lock(so_cache_mtx);
411
412 so_cache_time = net_uptime();
413 if (++cached_sock_count > max_cached_sock_count) {
414 --cached_sock_count;
415 lck_mtx_unlock(so_cache_mtx);
416 zfree(so_cache_zone, so);
417 } else {
418 if (so_cache_hw < cached_sock_count)
419 so_cache_hw = cached_sock_count;
420
421 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
422
423 so->cache_timestamp = so_cache_time;
424 lck_mtx_unlock(so_cache_mtx);
425 }
426}
427
428void
429so_update_last_owner_locked(struct socket *so, proc_t self)
430{
431 if (so->last_pid != 0) {
432 /*
433 * last_pid and last_upid should remain zero for sockets
434 * created using sock_socket. The check above achieves that
435 */
436 if (self == PROC_NULL)
437 self = current_proc();
438
439 if (so->last_upid != proc_uniqueid(self) ||
440 so->last_pid != proc_pid(self)) {
441 so->last_upid = proc_uniqueid(self);
442 so->last_pid = proc_pid(self);
443 proc_getexecutableuuid(self, so->last_uuid,
444 sizeof (so->last_uuid));
445 }
446 }
447}
448
449void
450so_update_policy(struct socket *so)
451{
452 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
453 (void) inp_update_policy(sotoinpcb(so));
454}
455
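/*
 * Periodic reaper for the socket cache: frees cached sockets that
 * have been idle for at least SO_CACHE_TIME_LIMIT, in batches of at
 * most SO_CACHE_MAX_FREE_BATCH, and returns TRUE when entries remain
 * so that the caller reschedules the timer.
 */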
456boolean_t
457so_cache_timer(void)
458{
459 struct socket *p;
460 int n_freed = 0;
461 boolean_t rc = FALSE;
462
463 lck_mtx_lock(so_cache_mtx);
464 so_cache_timeouts++;
465 so_cache_time = net_uptime();
466
467 while (!STAILQ_EMPTY(&so_cache_head)) {
468 VERIFY(cached_sock_count > 0);
469 p = STAILQ_FIRST(&so_cache_head);
470 if ((so_cache_time - p->cache_timestamp) <
471 SO_CACHE_TIME_LIMIT)
472 break;
473
474 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
475 --cached_sock_count;
476
477 zfree(so_cache_zone, p);
478
479 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
480 so_cache_max_freed++;
481 break;
482 }
483 }
484
485 /* Schedule again if there is more to cleanup */
486 if (!STAILQ_EMPTY(&so_cache_head))
487 rc = TRUE;
488
489 lck_mtx_unlock(so_cache_mtx);
490 return (rc);
491}
492
493/*
494 * Get a socket structure from our zone, and initialize it.
495 * We don't implement `waitok' yet (see comments in uipc_domain.c).
496 * Note that it would probably be better to allocate socket
497 * and PCB at the same time, but I'm not convinced that all
498 * the protocols can be easily modified to do this.
499 */
500struct socket *
501soalloc(int waitok, int dom, int type)
502{
503 struct socket *so;
504
505 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
506 cached_sock_alloc(&so, waitok);
507 } else {
508 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
509 M_WAITOK);
510 if (so != NULL)
511 bzero(so, sizeof (*so));
512 }
513 if (so != NULL) {
514 so->so_gencnt = ++so_gencnt;
515 so->so_zone = socket_zone;
516#if CONFIG_MACF_SOCKET
517 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
518 if (mac_socket_label_init(so, !waitok) != 0) {
519 sodealloc(so);
520 return (NULL);
521 }
522#endif /* MAC_SOCKET */
523 }
524
525 return (so);
526}
527
528int
529socreate_internal(int dom, struct socket **aso, int type, int proto,
530 struct proc *p, uint32_t flags, struct proc *ep)
531{
532 struct protosw *prp;
533 struct socket *so;
534 int error = 0;
535
536#if TCPDEBUG
537 extern int tcpconsdebug;
538#endif
539
540 VERIFY(aso != NULL);
541 *aso = NULL;
542
543 if (proto != 0)
544 prp = pffindproto(dom, proto, type);
545 else
546 prp = pffindtype(dom, type);
547
548 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
549 if (pffinddomain(dom) == NULL)
550 return (EAFNOSUPPORT);
551 if (proto != 0) {
552 if (pffindprotonotype(dom, proto) != NULL)
553 return (EPROTOTYPE);
554 }
555 return (EPROTONOSUPPORT);
556 }
557 if (prp->pr_type != type)
558 return (EPROTOTYPE);
559 so = soalloc(1, dom, type);
560 if (so == NULL)
561 return (ENOBUFS);
562
563 if (flags & SOCF_ASYNC)
564 so->so_state |= SS_NBIO;
565#if MULTIPATH
566 if (flags & SOCF_MP_SUBFLOW) {
567 /*
568 * A multipath subflow socket is used internally in the kernel,
 569 * therefore it does not have a file descriptor associated by
570 * default.
571 */
572 so->so_state |= SS_NOFDREF;
573 so->so_flags |= SOF_MP_SUBFLOW;
574 }
575#endif /* MULTIPATH */
576
577 TAILQ_INIT(&so->so_incomp);
578 TAILQ_INIT(&so->so_comp);
579 so->so_type = type;
580 so->last_upid = proc_uniqueid(p);
581 so->last_pid = proc_pid(p);
582 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
583
584 if (ep != PROC_NULL && ep != p) {
585 so->e_upid = proc_uniqueid(ep);
586 so->e_pid = proc_pid(ep);
587 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
588 so->so_flags |= SOF_DELEGATED;
589 }
590
591 so->so_cred = kauth_cred_proc_ref(p);
592 if (!suser(kauth_cred_get(), NULL))
593 so->so_state |= SS_PRIV;
594
595 so->so_proto = prp;
596 so->so_rcv.sb_flags |= SB_RECV;
597 so->so_rcv.sb_so = so->so_snd.sb_so = so;
598 so->next_lock_lr = 0;
599 so->next_unlock_lr = 0;
600
601#if CONFIG_MACF_SOCKET
602 mac_socket_label_associate(kauth_cred_get(), so);
603#endif /* MAC_SOCKET */
604
605 /*
 606 * Attachment will create the per-pcb lock if necessary and
 607 * increase the refcount for creation; make sure it's done before
 608 * the socket is inserted in the lists.
609 */
610 so->so_usecount++;
611
612 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
613 if (error != 0) {
614 /*
615 * Warning:
616 * If so_pcb is not zero, the socket will be leaked,
 617 * so the protocol attachment handler must be coded carefully
618 */
619 so->so_state |= SS_NOFDREF;
620 so->so_usecount--;
621 sofreelastref(so, 1); /* will deallocate the socket */
622 return (error);
623 }
624
625 atomic_add_32(&prp->pr_domain->dom_refs, 1);
626 TAILQ_INIT(&so->so_evlist);
627
628 /* Attach socket filters for this protocol */
629 sflt_initsock(so);
630#if TCPDEBUG
631 if (tcpconsdebug == 2)
632 so->so_options |= SO_DEBUG;
633#endif
634 so_set_default_traffic_class(so);
635
636 /*
637 * If this thread or task is marked to create backgrounded sockets,
638 * mark the socket as background.
639 */
640 if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
641 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
642 so->so_background_thread = current_thread();
643 }
644
645 switch (dom) {
646 /*
647 * Don't mark Unix domain, system or multipath sockets as
648 * eligible for defunct by default.
649 */
650 case PF_LOCAL:
651 case PF_SYSTEM:
652 case PF_MULTIPATH:
653 so->so_flags |= SOF_NODEFUNCT;
654 break;
655 default:
656 break;
657 }
658
659 *aso = so;
660
661 return (0);
662}
663
664/*
665 * Returns: 0 Success
666 * EAFNOSUPPORT
667 * EPROTOTYPE
668 * EPROTONOSUPPORT
669 * ENOBUFS
670 * <pru_attach>:ENOBUFS[AF_UNIX]
671 * <pru_attach>:ENOBUFS[TCP]
672 * <pru_attach>:ENOMEM[TCP]
673 * <pru_attach>:??? [other protocol families, IPSEC]
674 */
675int
676socreate(int dom, struct socket **aso, int type, int proto)
677{
678 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
679 PROC_NULL));
680}
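
/*
 * Illustrative (non-authoritative) in-kernel usage of the routines in
 * this file. Kernel clients normally go through the sock_socket() KPI
 * rather than calling these directly; "sin" below stands for a locally
 * prepared sockaddr_in and is only an example:
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		error = sobindlock(so, (struct sockaddr *)&sin, 1);
 *		if (error == 0)
 *			error = solisten(so, 5);
 *		soclose(so);
 *	}
 */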
681
682int
683socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
684{
685 int error = 0;
686 struct proc *ep = PROC_NULL;
687
688 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
689 error = ESRCH;
690 goto done;
691 }
692
693 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
694
695 /*
696 * It might not be wise to hold the proc reference when calling
697 * socreate_internal since it calls soalloc with M_WAITOK
698 */
699done:
700 if (ep != PROC_NULL)
701 proc_rele(ep);
702
703 return (error);
704}
705
706/*
707 * Returns: 0 Success
708 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
709 * <pru_bind>:EAFNOSUPPORT Address family not supported
710 * <pru_bind>:EADDRNOTAVAIL Address not available.
711 * <pru_bind>:EINVAL Invalid argument
712 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
713 * <pru_bind>:EACCES Permission denied
714 * <pru_bind>:EADDRINUSE Address in use
715 * <pru_bind>:EAGAIN Resource unavailable, try again
716 * <pru_bind>:EPERM Operation not permitted
717 * <pru_bind>:???
718 * <sf_bind>:???
719 *
720 * Notes: It's not possible to fully enumerate the return codes above,
721 * since socket filter authors and protocol family authors may
722 * not choose to limit their error returns to those listed, even
723 * though this may result in some software operating incorrectly.
724 *
725 * The error codes which are enumerated above are those known to
726 * be returned by the tcp_usr_bind function supplied.
727 */
728int
729sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
730{
731 struct proc *p = current_proc();
732 int error = 0;
733
734 if (dolock)
735 socket_lock(so, 1);
736 VERIFY(so->so_usecount > 1);
737
738 so_update_last_owner_locked(so, p);
739 so_update_policy(so);
740
741 /*
742 * If this is a bind request on a socket that has been marked
743 * as inactive, reject it now before we go any further.
744 */
745 if (so->so_flags & SOF_DEFUNCT) {
746 error = EINVAL;
747 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
748 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
749 SOCK_DOM(so), SOCK_TYPE(so), error));
750 goto out;
751 }
752
753 /* Socket filter */
754 error = sflt_bind(so, nam);
755
756 if (error == 0)
757 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
758out:
759 if (dolock)
760 socket_unlock(so, 1);
761
762 if (error == EJUSTRETURN)
763 error = 0;
764
765 return (error);
766}
767
768void
769sodealloc(struct socket *so)
770{
771 kauth_cred_unref(&so->so_cred);
772
773 /* Remove any filters */
774 sflt_termsock(so);
775
776 /* Delete the state allocated for msg queues on a socket */
777 if (so->so_flags & SOF_ENABLE_MSGS) {
778 FREE(so->so_msg_state, M_TEMP);
779 so->so_msg_state = NULL;
780 }
781 VERIFY(so->so_msg_state == NULL);
782
783 so->so_gencnt = ++so_gencnt;
784
785#if CONFIG_MACF_SOCKET
786 mac_socket_label_destroy(so);
787#endif /* MAC_SOCKET */
788
789 if (so->cached_in_sock_layer) {
790 cached_sock_free(so);
791 } else {
792 FREE_ZONE(so, sizeof (*so), so->so_zone);
793 }
794}
795
796/*
797 * Returns: 0 Success
798 * EINVAL
799 * EOPNOTSUPP
800 * <pru_listen>:EINVAL[AF_UNIX]
801 * <pru_listen>:EINVAL[TCP]
802 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
803 * <pru_listen>:EINVAL[TCP] Invalid argument
804 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
805 * <pru_listen>:EACCES[TCP] Permission denied
806 * <pru_listen>:EADDRINUSE[TCP] Address in use
807 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
808 * <pru_listen>:EPERM[TCP] Operation not permitted
809 * <sf_listen>:???
810 *
811 * Notes: Other <pru_listen> returns depend on the protocol family; all
812 * <sf_listen> returns depend on what the filter author causes
813 * their filter to return.
814 */
815int
816solisten(struct socket *so, int backlog)
817{
818 struct proc *p = current_proc();
819 int error = 0;
820
821 socket_lock(so, 1);
822
823 so_update_last_owner_locked(so, p);
824 so_update_policy(so);
825
826 if (so->so_proto == NULL) {
827 error = EINVAL;
828 goto out;
829 }
830 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
831 error = EOPNOTSUPP;
832 goto out;
833 }
834
835 /*
836 * If the listen request is made on a socket that is not fully
837 * disconnected, or on a socket that has been marked as inactive,
838 * reject the request now.
839 */
840 if ((so->so_state &
841 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
842 (so->so_flags & SOF_DEFUNCT)) {
843 error = EINVAL;
844 if (so->so_flags & SOF_DEFUNCT) {
845 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
846 "(%d)\n", __func__, proc_pid(p),
847 (uint64_t)VM_KERNEL_ADDRPERM(so),
848 SOCK_DOM(so), SOCK_TYPE(so), error));
849 }
850 goto out;
851 }
852
853 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
854 error = EPERM;
855 goto out;
856 }
857
858 error = sflt_listen(so);
859 if (error == 0)
860 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
861
862 if (error) {
863 if (error == EJUSTRETURN)
864 error = 0;
865 goto out;
866 }
867
868 if (TAILQ_EMPTY(&so->so_comp))
869 so->so_options |= SO_ACCEPTCONN;
870 /*
871 * POSIX: The implementation may have an upper limit on the length of
 872 * the listen queue, either global or per accepting socket. If backlog
873 * exceeds this limit, the length of the listen queue is set to the
874 * limit.
875 *
876 * If listen() is called with a backlog argument value that is less
877 * than 0, the function behaves as if it had been called with a backlog
878 * argument value of 0.
879 *
880 * A backlog argument of 0 may allow the socket to accept connections,
881 * in which case the length of the listen queue may be set to an
882 * implementation-defined minimum value.
883 */
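/*
 * Illustrative example: with the default somaxconn of 128, backlog
 * values of -1, 0 and 1024 all cause so_qlimit to be clamped to 128
 * by the check below.
 */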
884 if (backlog <= 0 || backlog > somaxconn)
885 backlog = somaxconn;
886
887 so->so_qlimit = backlog;
888out:
889 socket_unlock(so, 1);
890 return (error);
891}
892
893void
894sofreelastref(struct socket *so, int dealloc)
895{
896 struct socket *head = so->so_head;
897
898 /* Assume socket is locked */
899
900 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
901 selthreadclear(&so->so_snd.sb_sel);
902 selthreadclear(&so->so_rcv.sb_sel);
903 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
904 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
905 so->so_event = NULL;
906 return;
907 }
908 if (head != NULL) {
909 socket_lock(head, 1);
910 if (so->so_state & SS_INCOMP) {
911 TAILQ_REMOVE(&head->so_incomp, so, so_list);
912 head->so_incqlen--;
913 } else if (so->so_state & SS_COMP) {
914 /*
915 * We must not decommission a socket that's
916 * on the accept(2) queue. If we do, then
917 * accept(2) may hang after select(2) indicated
918 * that the listening socket was ready.
919 */
920 selthreadclear(&so->so_snd.sb_sel);
921 selthreadclear(&so->so_rcv.sb_sel);
922 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
923 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
924 so->so_event = NULL;
925 socket_unlock(head, 1);
926 return;
927 } else {
928 panic("sofree: not queued");
929 }
930 head->so_qlen--;
931 so->so_state &= ~SS_INCOMP;
932 so->so_head = NULL;
933 socket_unlock(head, 1);
934 }
935 sowflush(so);
936 sorflush(so);
937
938#if FLOW_DIVERT
939 if (so->so_flags & SOF_FLOW_DIVERT) {
940 flow_divert_detach(so);
941 }
942#endif /* FLOW_DIVERT */
943
944 /* 3932268: disable upcall */
945 so->so_rcv.sb_flags &= ~SB_UPCALL;
946 so->so_snd.sb_flags &= ~SB_UPCALL;
947 so->so_event = NULL;
948
949 if (dealloc)
950 sodealloc(so);
951}
952
953void
954soclose_wait_locked(struct socket *so)
955{
956 lck_mtx_t *mutex_held;
957
958 if (so->so_proto->pr_getlock != NULL)
959 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
960 else
961 mutex_held = so->so_proto->pr_domain->dom_mtx;
962 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
963
964 /*
965 * Double check here and return if there's no outstanding upcall;
966 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
967 */
968 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
969 return;
970 so->so_rcv.sb_flags &= ~SB_UPCALL;
971 so->so_snd.sb_flags &= ~SB_UPCALL;
972 so->so_flags |= SOF_CLOSEWAIT;
973 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
974 "soclose_wait_locked", NULL);
975 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
976 so->so_flags &= ~SOF_CLOSEWAIT;
977}
978
979/*
980 * Close a socket on last file table reference removal.
981 * Initiate disconnect if connected.
982 * Free socket when disconnect complete.
983 */
984int
985soclose_locked(struct socket *so)
986{
987 int error = 0;
988 lck_mtx_t *mutex_held;
989 struct timespec ts;
990
991 if (so->so_usecount == 0) {
992 panic("soclose: so=%p refcount=0\n", so);
993 /* NOTREACHED */
994 }
995
996 sflt_notify(so, sock_evt_closing, NULL);
997
998 if (so->so_upcallusecount)
999 soclose_wait_locked(so);
1000
1001 if ((so->so_options & SO_ACCEPTCONN)) {
1002 struct socket *sp, *sonext;
1003 int socklock = 0;
1004
1005 /*
 1006 * We do not want new connections to be added
 1007 * to the connection queues.
1008 */
1009 so->so_options &= ~SO_ACCEPTCONN;
1010
1011 for (sp = TAILQ_FIRST(&so->so_incomp);
1012 sp != NULL; sp = sonext) {
1013 sonext = TAILQ_NEXT(sp, so_list);
1014
1015 /*
1016 * Radar 5350314
 1017 * Skip sockets thrown away by tcpdropdropblreq;
 1018 * they will get cleaned up by the garbage collection.
 1019 * Otherwise, remove the incomp socket from the queue
 1020 * and let soabort trigger the appropriate cleanup.
1021 */
1022 if (sp->so_flags & SOF_OVERFLOW)
1023 continue;
1024
1025 if (so->so_proto->pr_getlock != NULL) {
1026 /*
 1027 * Lock ordering: for consistency with the
 1028 * rest of the stack, we lock the socket
 1029 * first and then grab the head.
1030 */
1031 socket_unlock(so, 0);
1032 socket_lock(sp, 1);
1033 socket_lock(so, 0);
1034 socklock = 1;
1035 }
1036
1037 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1038 so->so_incqlen--;
1039
1040 if (sp->so_state & SS_INCOMP) {
1041 sp->so_state &= ~SS_INCOMP;
1042 sp->so_head = NULL;
1043
1044 (void) soabort(sp);
1045 }
1046
1047 if (socklock)
1048 socket_unlock(sp, 1);
1049 }
1050
1051 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1052 /* Dequeue from so_comp since sofree() won't do it */
1053 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1054 so->so_qlen--;
1055
1056 if (so->so_proto->pr_getlock != NULL) {
1057 socket_unlock(so, 0);
1058 socket_lock(sp, 1);
1059 }
1060
1061 if (sp->so_state & SS_COMP) {
1062 sp->so_state &= ~SS_COMP;
1063 sp->so_head = NULL;
1064
1065 (void) soabort(sp);
1066 }
1067
1068 if (so->so_proto->pr_getlock != NULL) {
1069 socket_unlock(sp, 1);
1070 socket_lock(so, 0);
1071 }
1072 }
1073 }
1074 if (so->so_pcb == NULL) {
1075 /* 3915887: mark the socket as ready for dealloc */
1076 so->so_flags |= SOF_PCBCLEARING;
1077 goto discard;
1078 }
1079 if (so->so_state & SS_ISCONNECTED) {
1080 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1081 error = sodisconnectlocked(so);
1082 if (error)
1083 goto drop;
1084 }
1085 if (so->so_options & SO_LINGER) {
1086 if ((so->so_state & SS_ISDISCONNECTING) &&
1087 (so->so_state & SS_NBIO))
1088 goto drop;
1089 if (so->so_proto->pr_getlock != NULL)
1090 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1091 else
1092 mutex_held = so->so_proto->pr_domain->dom_mtx;
1093 while (so->so_state & SS_ISCONNECTED) {
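/*
 * so_linger is kept in ticks of 1/100 of a second; convert it to a
 * timespec for msleep(). For example, so_linger == 250 yields
 * ts = { 2 sec, 500000000 nsec }, i.e. 2.5 seconds.
 */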
1094 ts.tv_sec = (so->so_linger/100);
1095 ts.tv_nsec = (so->so_linger % 100) *
1096 NSEC_PER_USEC * 1000 * 10;
1097 error = msleep((caddr_t)&so->so_timeo,
1098 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1099 if (error) {
1100 /*
 1101 * It's OK when the timer fires;
 1102 * don't report an error.
1103 */
1104 if (error == EWOULDBLOCK)
1105 error = 0;
1106 break;
1107 }
1108 }
1109 }
1110 }
1111drop:
1112 if (so->so_usecount == 0) {
1113 panic("soclose: usecount is zero so=%p\n", so);
1114 /* NOTREACHED */
1115 }
1116 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1117 /*
1118 * Let NetworkStatistics know this PCB is going away
1119 * before we detach it.
1120 */
1121 if (nstat_collect &&
1122 (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6))
1123 nstat_pcb_detach(so->so_pcb);
1124
1125 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1126 if (error == 0)
1127 error = error2;
1128 }
1129 if (so->so_usecount <= 0) {
1130 panic("soclose: usecount is zero so=%p\n", so);
1131 /* NOTREACHED */
1132 }
1133discard:
1134 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1135 (so->so_state & SS_NOFDREF)) {
1136 panic("soclose: NOFDREF");
1137 /* NOTREACHED */
1138 }
1139 so->so_state |= SS_NOFDREF;
1140
1141 if (so->so_flags & SOF_MP_SUBFLOW)
1142 so->so_flags &= ~SOF_MP_SUBFLOW;
1143
1144 if ((so->so_flags & SOF_KNOTE) != 0)
1145 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1146
1147 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1148 evsofree(so);
1149
1150 so->so_usecount--;
1151 sofree(so);
1152 return (error);
1153}
1154
1155int
1156soclose(struct socket *so)
1157{
1158 int error = 0;
1159 socket_lock(so, 1);
1160
1161 if (so->so_retaincnt == 0) {
1162 error = soclose_locked(so);
1163 } else {
1164 /*
 1165 * If the FD is going away but the socket is
 1166 * retained in the kernel, just remove its reference.
1167 */
1168 so->so_usecount--;
1169 if (so->so_usecount < 2)
1170 panic("soclose: retaincnt non null and so=%p "
1171 "usecount=%d\n", so, so->so_usecount);
1172 }
1173 socket_unlock(so, 1);
1174 return (error);
1175}
1176
1177/*
1178 * Must be called at splnet...
1179 */
1180/* Should already be locked */
1181int
1182soabort(struct socket *so)
1183{
1184 int error;
1185
1186#ifdef MORE_LOCKING_DEBUG
1187 lck_mtx_t *mutex_held;
1188
1189 if (so->so_proto->pr_getlock != NULL)
1190 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1191 else
1192 mutex_held = so->so_proto->pr_domain->dom_mtx;
1193 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1194#endif
1195
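/*
 * Only call the protocol's abort routine once per socket; SOF_ABORTED
 * guards against a second abort on a socket already being torn down.
 */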
1196 if ((so->so_flags & SOF_ABORTED) == 0) {
1197 so->so_flags |= SOF_ABORTED;
1198 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1199 if (error) {
1200 sofree(so);
1201 return (error);
1202 }
1203 }
1204 return (0);
1205}
1206
1207int
1208soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1209{
1210 int error;
1211
1212 if (dolock)
1213 socket_lock(so, 1);
1214
1215 so_update_last_owner_locked(so, PROC_NULL);
1216 so_update_policy(so);
1217
1218 if ((so->so_state & SS_NOFDREF) == 0)
1219 panic("soaccept: !NOFDREF");
1220 so->so_state &= ~SS_NOFDREF;
1221 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1222
1223 if (dolock)
1224 socket_unlock(so, 1);
1225 return (error);
1226}
1227
1228int
1229soaccept(struct socket *so, struct sockaddr **nam)
1230{
1231 return (soacceptlock(so, nam, 1));
1232}
1233
1234int
1235soacceptfilter(struct socket *so)
1236{
1237 struct sockaddr *local = NULL, *remote = NULL;
1238 int error = 0;
1239 struct socket *head = so->so_head;
1240
1241 /*
1242 * Hold the lock even if this socket has not been made visible
1243 * to the filter(s). For sockets with global locks, this protects
1244 * against the head or peer going away
1245 */
1246 socket_lock(so, 1);
1247 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1248 sogetaddr_locked(so, &local, 0) != 0) {
1249 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1250 so->so_head = NULL;
1251 socket_unlock(so, 1);
1252 soclose(so);
1253 /* Out of resources; try it again next time */
1254 error = ECONNABORTED;
1255 goto done;
1256 }
1257
1258 error = sflt_accept(head, so, local, remote);
1259
1260 /*
1261 * If we get EJUSTRETURN from one of the filters, mark this socket
1262 * as inactive and return it anyway. This newly accepted socket
1263 * will be disconnected later before we hand it off to the caller.
1264 */
1265 if (error == EJUSTRETURN) {
1266 error = 0;
1267 (void) sosetdefunct(current_proc(), so,
1268 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1269 }
1270
1271 if (error != 0) {
1272 /*
1273 * This may seem like a duplication to the above error
1274 * handling part when we return ECONNABORTED, except
1275 * the following is done while holding the lock since
1276 * the socket has been exposed to the filter(s) earlier.
1277 */
1278 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1279 so->so_head = NULL;
1280 socket_unlock(so, 1);
1281 soclose(so);
1282 /* Propagate socket filter's error code to the caller */
1283 } else {
1284 socket_unlock(so, 1);
1285 }
1286done:
1287 /* Callee checks for NULL pointer */
1288 sock_freeaddr(remote);
1289 sock_freeaddr(local);
1290 return (error);
1291}
1292
1293/*
1294 * Returns: 0 Success
1295 * EOPNOTSUPP Operation not supported on socket
1296 * EISCONN Socket is connected
1297 * <pru_connect>:EADDRNOTAVAIL Address not available.
1298 * <pru_connect>:EINVAL Invalid argument
1299 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1300 * <pru_connect>:EACCES Permission denied
1301 * <pru_connect>:EADDRINUSE Address in use
1302 * <pru_connect>:EAGAIN Resource unavailable, try again
1303 * <pru_connect>:EPERM Operation not permitted
1304 * <sf_connect_out>:??? [anything a filter writer might set]
1305 */
1306int
1307soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1308{
1309 int error;
1310 struct proc *p = current_proc();
1311
1312 if (dolock)
1313 socket_lock(so, 1);
1314
1315 so_update_last_owner_locked(so, p);
1316 so_update_policy(so);
1317
1318 /*
1319 * If this is a listening socket or if this is a previously-accepted
1320 * socket that has been marked as inactive, reject the connect request.
1321 */
1322 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1323 error = EOPNOTSUPP;
1324 if (so->so_flags & SOF_DEFUNCT) {
1325 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1326 "(%d)\n", __func__, proc_pid(p),
1327 (uint64_t)VM_KERNEL_ADDRPERM(so),
1328 SOCK_DOM(so), SOCK_TYPE(so), error));
1329 }
1330 if (dolock)
1331 socket_unlock(so, 1);
1332 return (error);
1333 }
1334
1335 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1336 if (dolock)
1337 socket_unlock(so, 1);
1338 return (EPERM);
1339 }
1340
1341 /*
1342 * If protocol is connection-based, can only connect once.
1343 * Otherwise, if connected, try to disconnect first.
1344 * This allows user to disconnect by connecting to, e.g.,
1345 * a null address.
1346 */
1347 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1348 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1349 (error = sodisconnectlocked(so)))) {
1350 error = EISCONN;
1351 } else {
1352 /*
1353 * Run connect filter before calling protocol:
1354 * - non-blocking connect returns before completion;
1355 */
1356 error = sflt_connectout(so, nam);
1357 if (error != 0) {
1358 if (error == EJUSTRETURN)
1359 error = 0;
1360 } else {
1361 error = (*so->so_proto->pr_usrreqs->pru_connect)
1362 (so, nam, p);
1363 }
1364 }
1365 if (dolock)
1366 socket_unlock(so, 1);
1367 return (error);
1368}
1369
1370int
1371soconnect(struct socket *so, struct sockaddr *nam)
1372{
1373 return (soconnectlock(so, nam, 1));
1374}
1375
1376/*
1377 * Returns: 0 Success
1378 * <pru_connect2>:EINVAL[AF_UNIX]
1379 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1380 * <pru_connect2>:??? [other protocol families]
1381 *
1382 * Notes: <pru_connect2> is not supported by [TCP].
1383 */
1384int
1385soconnect2(struct socket *so1, struct socket *so2)
1386{
1387 int error;
1388
1389 socket_lock(so1, 1);
1390 if (so2->so_proto->pr_lock)
1391 socket_lock(so2, 1);
1392
1393 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1394
1395 socket_unlock(so1, 1);
1396 if (so2->so_proto->pr_lock)
1397 socket_unlock(so2, 1);
1398 return (error);
1399}
1400
1401int
1402soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1403 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1404 associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
1405 uint32_t arglen)
1406{
1407 int error;
1408
1409 /*
1410 * If this is a listening socket or if this is a previously-accepted
1411 * socket that has been marked as inactive, reject the connect request.
1412 */
1413 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1414 error = EOPNOTSUPP;
1415 if (so->so_flags & SOF_DEFUNCT) {
1416 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1417 "(%d)\n", __func__, proc_pid(p),
1418 (uint64_t)VM_KERNEL_ADDRPERM(so),
1419 SOCK_DOM(so), SOCK_TYPE(so), error));
1420 }
1421 return (error);
1422 }
1423
1424 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1425 return (EPERM);
1426
1427 /*
1428 * If protocol is connection-based, can only connect once
1429 * unless PR_MULTICONN is set. Otherwise, if connected,
1430 * try to disconnect first. This allows user to disconnect
1431 * by connecting to, e.g., a null address.
1432 */
1433 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1434 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1435 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1436 (error = sodisconnectlocked(so)) != 0)) {
1437 error = EISCONN;
1438 } else {
1439 /*
1440 * Run connect filter before calling protocol:
1441 * - non-blocking connect returns before completion;
1442 */
1443 error = sflt_connectxout(so, dst_sl);
1444 if (error != 0) {
1445 if (error == EJUSTRETURN)
1446 error = 0;
1447 } else {
1448 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1449 (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1450 flags, arg, arglen);
1451 }
1452 }
1453
1454 return (error);
1455}
1456
1457int
1458sodisconnectlocked(struct socket *so)
1459{
1460 int error;
1461
1462 if ((so->so_state & SS_ISCONNECTED) == 0) {
1463 error = ENOTCONN;
1464 goto bad;
1465 }
1466 if (so->so_state & SS_ISDISCONNECTING) {
1467 error = EALREADY;
1468 goto bad;
1469 }
1470
1471 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1472 if (error == 0)
1473 sflt_notify(so, sock_evt_disconnected, NULL);
1474
1475bad:
1476 return (error);
1477}
1478
1479/* Locking version */
1480int
1481sodisconnect(struct socket *so)
1482{
1483 int error;
1484
1485 socket_lock(so, 1);
1486 error = sodisconnectlocked(so);
1487 socket_unlock(so, 1);
1488 return (error);
1489}
1490
1491int
1492sodisconnectxlocked(struct socket *so, associd_t aid, connid_t cid)
1493{
1494 int error;
1495
1496 /*
1497 * Call the protocol disconnectx handler; let it handle all
1498 * matters related to the connection state of this session.
1499 */
1500 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1501 if (error == 0) {
1502 /*
1503 * The event applies only for the session, not for
1504 * the disconnection of individual subflows.
1505 */
1506 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1507 sflt_notify(so, sock_evt_disconnected, NULL);
1508 }
1509 return (error);
1510}
1511
1512int
1513sodisconnectx(struct socket *so, associd_t aid, connid_t cid)
1514{
1515 int error;
1516
1517 socket_lock(so, 1);
1518 error = sodisconnectxlocked(so, aid, cid);
1519 socket_unlock(so, 1);
1520 return (error);
1521}
1522
1523int
1524sopeelofflocked(struct socket *so, associd_t aid, struct socket **psop)
1525{
1526 return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1527}
1528
1529#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1530
1531/*
1532 * sosendcheck will lock the socket buffer if it isn't locked and
1533 * verify that there is space for the data being inserted.
1534 *
1535 * Returns: 0 Success
1536 * EPIPE
1537 * sblock:EWOULDBLOCK
1538 * sblock:EINTR
1539 * sbwait:EBADF
1540 * sbwait:EINTR
1541 * [so_error]:???
1542 */
1543int
1544sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1545 int32_t clen, int32_t atomic, int flags, int *sblocked,
1546 struct mbuf *control)
1547{
1548 int error = 0;
1549 int32_t space;
1550 int assumelock = 0;
1551
1552restart:
1553 if (*sblocked == 0) {
1554 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1555 so->so_send_filt_thread != 0 &&
1556 so->so_send_filt_thread == current_thread()) {
1557 /*
1558 * We're being called recursively from a filter,
1559 * allow this to continue. Radar 4150520.
1560 * Don't set sblocked because we don't want
1561 * to perform an unlock later.
1562 */
1563 assumelock = 1;
1564 } else {
1565 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1566 if (error) {
1567 if (so->so_flags & SOF_DEFUNCT)
1568 goto defunct;
1569 return (error);
1570 }
1571 *sblocked = 1;
1572 }
1573 }
1574
1575 /*
1576 * If a send attempt is made on a socket that has been marked
1577 * as inactive (disconnected), reject the request.
1578 */
1579 if (so->so_flags & SOF_DEFUNCT) {
1580defunct:
1581 error = EPIPE;
1582 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1583 __func__, proc_selfpid(), (uint64_t)VM_KERNEL_ADDRPERM(so),
1584 SOCK_DOM(so), SOCK_TYPE(so), error));
1585 return (error);
1586 }
1587
1588 if (so->so_state & SS_CANTSENDMORE)
1589 return (EPIPE);
1590
1591 if (so->so_error) {
1592 error = so->so_error;
1593 so->so_error = 0;
1594 return (error);
1595 }
1596
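/*
 * Unconnected socket: a connection-oriented protocol must be
 * connected (or confirming, or sending only control data), else
 * ENOTCONN; a connectionless protocol needs an explicit destination
 * address unless MSG_HOLD is in effect, else EDESTADDRREQ.
 */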
1597 if ((so->so_state & SS_ISCONNECTED) == 0) {
1598 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1599 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1600 !(resid == 0 && clen != 0))
1601 return (ENOTCONN);
1602 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1603 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1604 ENOTCONN : EDESTADDRREQ);
1605 }
1606 }
1607 if (so->so_flags & SOF_ENABLE_MSGS)
1608 space = msgq_sbspace(so, control);
1609 else
1610 space = sbspace(&so->so_snd);
1611
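/*
 * Out-of-band sends are granted 1024 bytes of slack so that urgent
 * data can still be queued when the send buffer is otherwise full.
 */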
1612 if (flags & MSG_OOB)
1613 space += 1024;
1614 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1615 clen > so->so_snd.sb_hiwat)
1616 return (EMSGSIZE);
1617
1618 if ((space < resid + clen &&
1619 (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) ||
1620 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1621 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1622 assumelock) {
1623 return (EWOULDBLOCK);
1624 }
1625 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1626 *sblocked = 0;
1627 error = sbwait(&so->so_snd);
1628 if (error) {
1629 if (so->so_flags & SOF_DEFUNCT)
1630 goto defunct;
1631 return (error);
1632 }
1633 goto restart;
1634 }
1635 return (0);
1636}
1637
1638/*
1639 * Send on a socket.
1640 * If send must go all at once and message is larger than
1641 * send buffering, then hard error.
1642 * Lock against other senders.
1643 * If must go all at once and not enough room now, then
1644 * inform user that this would block and do nothing.
1645 * Otherwise, if nonblocking, send as much as possible.
1646 * The data to be sent is described by "uio" if nonzero,
1647 * otherwise by the mbuf chain "top" (which must be null
1648 * if uio is not). Data provided in mbuf chain must be small
1649 * enough to send all at once.
1650 *
1651 * Returns nonzero on error, timeout or signal; callers
1652 * must check for short counts if EINTR/ERESTART are returned.
1653 * Data and control buffers are freed on return.
1654 * Experiment:
1655 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1656 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1657 * point at the mbuf chain being constructed and go from there.
1658 *
1659 * Returns: 0 Success
1660 * EOPNOTSUPP
1661 * EINVAL
1662 * ENOBUFS
1663 * uiomove:EFAULT
1664 * sosendcheck:EPIPE
1665 * sosendcheck:EWOULDBLOCK
1666 * sosendcheck:EINTR
1667 * sosendcheck:EBADF
1668 * sosendcheck:EINTR
1669 * sosendcheck:??? [value from so_error]
1670 * <pru_send>:ECONNRESET[TCP]
1671 * <pru_send>:EINVAL[TCP]
1672 * <pru_send>:ENOBUFS[TCP]
1673 * <pru_send>:EADDRINUSE[TCP]
1674 * <pru_send>:EADDRNOTAVAIL[TCP]
1675 * <pru_send>:EAFNOSUPPORT[TCP]
1676 * <pru_send>:EACCES[TCP]
1677 * <pru_send>:EAGAIN[TCP]
1678 * <pru_send>:EPERM[TCP]
1679 * <pru_send>:EMSGSIZE[TCP]
1680 * <pru_send>:EHOSTUNREACH[TCP]
1681 * <pru_send>:ENETUNREACH[TCP]
1682 * <pru_send>:ENETDOWN[TCP]
1683 * <pru_send>:ENOMEM[TCP]
1684 * <pru_send>:ENOBUFS[TCP]
1685 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1686 * <pru_send>:EINVAL[AF_UNIX]
1687 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1688 * <pru_send>:EPIPE[AF_UNIX]
1689 * <pru_send>:ENOTCONN[AF_UNIX]
1690 * <pru_send>:EISCONN[AF_UNIX]
1691 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1692 * <sf_data_out>:??? [whatever a filter author chooses]
1693 *
1694 * Notes: Other <pru_send> returns depend on the protocol family; all
1695 * <sf_data_out> returns depend on what the filter author causes
1696 * their filter to return.
1697 */
1698int
1699sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1700 struct mbuf *top, struct mbuf *control, int flags)
1701{
1702 struct mbuf **mp;
1703 struct mbuf *m, *freelist = NULL;
1704 user_ssize_t space, len, resid;
1705 int clen = 0, error, dontroute, mlen, sendflags;
1706 int atomic = sosendallatonce(so) || top;
1707 int sblocked = 0;
1708 struct proc *p = current_proc();
1709 struct mbuf *control_copy = NULL;
1710
1711 if (uio != NULL)
1712 resid = uio_resid(uio);
1713 else
1714 resid = top->m_pkthdr.len;
1715
1716 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1717 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1718
1719 socket_lock(so, 1);
1720 so_update_last_owner_locked(so, p);
1721 so_update_policy(so);
1722
1723 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1724 error = EOPNOTSUPP;
1725 socket_unlock(so, 1);
1726 goto out;
1727 }
1728
1729 /*
1730 * In theory resid should be unsigned.
1731 * However, space must be signed, as it might be less than 0
1732 * if we over-committed, and we must use a signed comparison
1733 * of space and resid. On the other hand, a negative resid
1734 * causes us to loop sending 0-length segments to the protocol.
1735 *
1736 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1737 * But it will be used by sockets doing message delivery.
1738 *
1739 * Note: We limit resid to be a positive 32 bits value as we use
 1740 * imin() to set bytes_to_copy -- rdar://14558484
1741 */
1742 if ((int32_t)resid < 0 || (so->so_type == SOCK_STREAM &&
1743 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1744 error = EINVAL;
1745 socket_unlock(so, 1);
1746 goto out;
1747 }
1748
1749 dontroute = (flags & MSG_DONTROUTE) &&
1750 (so->so_options & SO_DONTROUTE) == 0 &&
1751 (so->so_proto->pr_flags & PR_ATOMIC);
1752 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1753
1754 if (control != NULL)
1755 clen = control->m_len;
1756
1757 do {
1758 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1759 &sblocked, control);
1760 if (error)
1761 goto release;
1762
1763 mp = &top;
1764 if (so->so_flags & SOF_ENABLE_MSGS)
1765 space = msgq_sbspace(so, control);
1766 else
1767 space = sbspace(&so->so_snd) - clen;
1768 space += ((flags & MSG_OOB) ? 1024 : 0);
1769
1770 do {
1771 if (uio == NULL) {
1772 /*
1773 * Data is prepackaged in "top".
1774 */
1775 resid = 0;
1776 if (flags & MSG_EOR)
1777 top->m_flags |= M_EOR;
1778 } else {
1779 int chainlength;
1780 int bytes_to_copy;
1781 boolean_t jumbocl;
1782
1783 bytes_to_copy = imin(resid, space);
1784
1785 if (sosendminchain > 0)
1786 chainlength = 0;
1787 else
1788 chainlength = sosendmaxchain;
1789
1790 /*
1791 * Attempt to use larger than system page-size
1792 * clusters for large writes only if there is
1793 * a jumbo cluster pool and if the socket is
1794 * marked accordingly.
1795 */
1796 jumbocl = sosendjcl && njcl > 0 &&
1797 ((so->so_flags & SOF_MULTIPAGES) ||
1798 sosendjcl_ignore_capab);
1799
1800 socket_unlock(so, 0);
1801
1802 do {
1803 int num_needed;
1804 int hdrs_needed = (top == NULL) ? 1 : 0;
1805
1806 /*
 1807 * Try to maintain a local cache of mbuf
 1808 * clusters needed to complete this
 1809 * write. The list is further limited to
 1810 * the number that are currently needed
 1811 * to fill the socket. This mechanism
 1812 * allows a large number of mbufs/
 1813 * clusters to be grabbed under a single
 1814 * mbuf lock. If we can't get any
 1815 * clusters, then fall back to trying
 1816 * for mbufs. If we fail early (or
 1817 * miscalculate the number needed), make
 1818 * sure to release any clusters we
 1819 * haven't yet consumed.
1820 */
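/*
 * Allocation ladder: prefer 16K jumbo clusters (when jumbocl), then
 * 4K big clusters, then 2K clusters, and finally fall back to a
 * single mbuf.
 */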
1821 if (freelist == NULL &&
1822 bytes_to_copy > MBIGCLBYTES &&
1823 jumbocl) {
1824 num_needed =
1825 bytes_to_copy / M16KCLBYTES;
1826
1827 if ((bytes_to_copy -
1828 (num_needed * M16KCLBYTES))
1829 >= MINCLSIZE)
1830 num_needed++;
1831
1832 freelist =
1833 m_getpackets_internal(
1834 (unsigned int *)&num_needed,
1835 hdrs_needed, M_WAIT, 0,
1836 M16KCLBYTES);
1837 /*
1838 * Fall back to 4K cluster size
1839 * if allocation failed
1840 */
1841 }
1842
1843 if (freelist == NULL &&
1844 bytes_to_copy > MCLBYTES) {
1845 num_needed =
1846 bytes_to_copy / MBIGCLBYTES;
1847
1848 if ((bytes_to_copy -
1849 (num_needed * MBIGCLBYTES)) >=
1850 MINCLSIZE)
1851 num_needed++;
1852
1853 freelist =
1854 m_getpackets_internal(
1855 (unsigned int *)&num_needed,
1856 hdrs_needed, M_WAIT, 0,
1857 MBIGCLBYTES);
1858 /*
1859 * Fall back to cluster size
1860 * if allocation failed
1861 */
1862 }
1863
1864 if (freelist == NULL &&
1865 bytes_to_copy > MINCLSIZE) {
1866 num_needed =
1867 bytes_to_copy / MCLBYTES;
1868
1869 if ((bytes_to_copy -
1870 (num_needed * MCLBYTES)) >=
1871 MINCLSIZE)
1872 num_needed++;
1873
1874 freelist =
1875 m_getpackets_internal(
1876 (unsigned int *)&num_needed,
1877 hdrs_needed, M_WAIT, 0,
1878 MCLBYTES);
1879 /*
1880 * Fall back to a single mbuf
1881 * if allocation failed
1882 */
1883 }
1884
1885 if (freelist == NULL) {
1886 if (top == NULL)
1887 MGETHDR(freelist,
1888 M_WAIT, MT_DATA);
1889 else
1890 MGET(freelist,
1891 M_WAIT, MT_DATA);
1892
1893 if (freelist == NULL) {
1894 error = ENOBUFS;
1895 socket_lock(so, 0);
1896 goto release;
1897 }
1898 /*
1899 * For datagram protocols,
1900 * leave room for protocol
1901 * headers in first mbuf.
1902 */
1903 if (atomic && top == NULL &&
1904 bytes_to_copy < MHLEN) {
1905 MH_ALIGN(freelist,
1906 bytes_to_copy);
1907 }
1908 }
1909 m = freelist;
1910 freelist = m->m_next;
1911 m->m_next = NULL;
1912
1913 if ((m->m_flags & M_EXT))
1914 mlen = m->m_ext.ext_size;
1915 else if ((m->m_flags & M_PKTHDR))
1916 mlen =
1917 MHLEN - m_leadingspace(m);
1918 else
1919 mlen = MLEN;
1920 len = imin(mlen, bytes_to_copy);
1921
1922 chainlength += len;
1923
1924 space -= len;
1925
1926 error = uiomove(mtod(m, caddr_t),
1927 len, uio);
1928
1929 resid = uio_resid(uio);
1930
1931 m->m_len = len;
1932 *mp = m;
1933 top->m_pkthdr.len += len;
1934 if (error)
1935 break;
1936 mp = &m->m_next;
1937 if (resid <= 0) {
1938 if (flags & MSG_EOR)
1939 top->m_flags |= M_EOR;
1940 break;
1941 }
1942 bytes_to_copy = min(resid, space);
1943
1944 } while (space > 0 &&
1945 (chainlength < sosendmaxchain || atomic ||
1946 resid < MINCLSIZE));
1947
1948 socket_lock(so, 0);
1949
1950 if (error)
1951 goto release;
1952 }
1953
1954 if (flags & (MSG_HOLD|MSG_SEND)) {
1955 /* Enqueue for later, go away if HOLD */
1956 struct mbuf *mb1;
1957 if (so->so_temp && (flags & MSG_FLUSH)) {
1958 m_freem(so->so_temp);
1959 so->so_temp = NULL;
1960 }
1961 if (so->so_temp)
1962 so->so_tail->m_next = top;
1963 else
1964 so->so_temp = top;
1965 mb1 = top;
1966 while (mb1->m_next)
1967 mb1 = mb1->m_next;
1968 so->so_tail = mb1;
1969 if (flags & MSG_HOLD) {
1970 top = NULL;
1971 goto release;
1972 }
1973 top = so->so_temp;
1974 }
1975 if (dontroute)
1976 so->so_options |= SO_DONTROUTE;
1977
1978 /* Compute flags here, for pru_send and NKEs */
1979 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1980 /*
1981 * If the user set MSG_EOF, the protocol
1982 * understands this flag, and there is nothing left
1983 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
1984 */
1985 ((flags & MSG_EOF) &&
1986 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1987 (resid <= 0)) ? PRUS_EOF :
1988 /* If there is more to send set PRUS_MORETOCOME */
1989 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1990
1991 /*
1992 * Socket filter processing
1993 */
1994 error = sflt_data_out(so, addr, &top,
1995 &control, (sendflags & MSG_OOB) ?
1996 sock_data_filt_flag_oob : 0);
1997 if (error) {
1998 if (error == EJUSTRETURN) {
1999 error = 0;
2000 clen = 0;
2001 control = NULL;
2002 top = NULL;
2003 }
2004
2005 goto release;
2006 }
2007 /*
2008 * End Socket filter processing
2009 */
2010
2011 if (so->so_flags & SOF_ENABLE_MSGS) {
2012 /*
2013 * Make a copy of control mbuf,
2014 * so that msg priority can be
2015 * passed to subsequent mbufs.
2016 */
2017 control_copy = m_dup(control, M_NOWAIT);
2018 }
2019 error = (*so->so_proto->pr_usrreqs->pru_send)
2020 (so, sendflags, top, addr, control, p);
2021
2022 if (flags & MSG_SEND)
2023 so->so_temp = NULL;
2024
2025 if (dontroute)
2026 so->so_options &= ~SO_DONTROUTE;
2027
2028 clen = 0;
2029 control = control_copy;
2030 control_copy = NULL;
2031 top = NULL;
2032 mp = &top;
2033 if (error)
2034 goto release;
2035 } while (resid && space > 0);
2036 } while (resid);
2037
2038release:
2039 if (sblocked)
2040 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2041 else
2042 socket_unlock(so, 1);
2043out:
2044 if (top != NULL)
2045 m_freem(top);
2046 if (control != NULL)
2047 m_freem(control);
2048 if (freelist != NULL)
2049 m_freem_list(freelist);
2050 if (control_copy != NULL)
2051 m_freem(control_copy);
2052
2053 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
2054 space, error);
2055
2056 return (error);
2057}
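/*
 * Illustrative sketch (not from this file, assumptions noted): a
 * user-space caller exercising the MSG_OOB path above.  sosend()
 * maps MSG_OOB to PRUS_OOB before handing the mbuf chain to the
 * protocol's pru_send.  "fd" is assumed to be a connected TCP socket.
 */
#if 0
#include <sys/socket.h>

static int
send_urgent_byte(int fd)
{
	/* One byte of urgent data; TCP delivers it out-of-band. */
	char mark = '!';

	if (send(fd, &mark, sizeof (mark), MSG_OOB) == -1)
		return (-1);
	return (0);
}
#endif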
2058
2059/*
2060 * Implement receive operations on a socket.
2061 * We depend on the way that records are added to the sockbuf
2062 * by sbappend*. In particular, each record (mbufs linked through m_next)
2063 * must begin with an address if the protocol so specifies,
2064 * followed by an optional mbuf or mbufs containing ancillary data,
2065 * and then zero or more mbufs of data.
2066 * In order to avoid blocking network interrupts for the entire time here,
2067 * we splx() while doing the actual copy to user space.
2068 * Although the sockbuf is locked, new data may still be appended,
2069 * and thus we must maintain consistency of the sockbuf during that time.
2070 *
2071 * The caller may receive the data as a single mbuf chain by supplying
2072 * an mbuf **mp0 for use in returning the chain. The uio is then used
2073 * only for the count in uio_resid.
2074 *
2075 * Returns: 0 Success
2076 * ENOBUFS
2077 * ENOTCONN
2078 * EWOULDBLOCK
2079 * uiomove:EFAULT
2080 * sblock:EWOULDBLOCK
2081 * sblock:EINTR
2082 * sbwait:EBADF
2083 * sbwait:EINTR
2084 * sodelayed_copy:EFAULT
2085 * <pru_rcvoob>:EINVAL[TCP]
2086 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2087 * <pru_rcvoob>:???
2088 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2089 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2090 * <pr_domain->dom_externalize>:???
2091 *
2092 * Notes: Additional return values from calls through <pru_rcvoob> and
2093 * <pr_domain->dom_externalize> depend on protocols other than
2094 * TCP or AF_UNIX, which are documented above.
2095 */
2096int
2097soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2098 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2099{
2100 struct mbuf *m, **mp, *ml = NULL;
2101 struct mbuf *nextrecord, *free_list;
2102 int flags, error, offset;
2103 user_ssize_t len;
2104 struct protosw *pr = so->so_proto;
2105 int moff, type = 0;
2106 user_ssize_t orig_resid = uio_resid(uio);
2107 user_ssize_t delayed_copy_len;
2108 int can_delay;
2109 int need_event;
2110 struct proc *p = current_proc();
2111
2112 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
2113 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
2114
2115 socket_lock(so, 1);
2116 so_update_last_owner_locked(so, p);
2117 so_update_policy(so);
2118
2119#ifdef MORE_LOCKING_DEBUG
2120 if (so->so_usecount == 1) {
2121 panic("%s: so=%x no other reference on socket\n", __func__, so);
2122 /* NOTREACHED */
2123 }
2124#endif
2125 mp = mp0;
2126 if (psa != NULL)
2127 *psa = NULL;
2128 if (controlp != NULL)
2129 *controlp = NULL;
2130 if (flagsp != NULL)
2131 flags = *flagsp &~ MSG_EOR;
2132 else
2133 flags = 0;
2134
2135 /*
2136 * If a recv attempt is made on a previously-accepted socket
2137 * that has been marked as inactive (disconnected), reject
2138 * the request.
2139 */
2140 if (so->so_flags & SOF_DEFUNCT) {
2141 struct sockbuf *sb = &so->so_rcv;
2142
2143 error = ENOTCONN;
2144 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2145 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
2146 SOCK_DOM(so), SOCK_TYPE(so), error));
2147 /*
2148 * This socket should have been disconnected and flushed
2149 * prior to being returned from sodefunct(); there should
2150 * be no data on its receive list, so panic otherwise.
2151 */
2152 if (so->so_state & SS_DEFUNCT)
2153 sb_empty_assert(sb, __func__);
2154 socket_unlock(so, 1);
2155 return (error);
2156 }
2157
2158 /*
2159 * When SO_WANTOOBFLAG is set we try to get out-of-band data
2160 * regardless of the flags argument. Here is the case where
2161 * out-of-band data is not inline.
2162 */
2163 if ((flags & MSG_OOB) ||
2164 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2165 (so->so_options & SO_OOBINLINE) == 0 &&
2166 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
2167 m = m_get(M_WAIT, MT_DATA);
2168 if (m == NULL) {
2169 socket_unlock(so, 1);
2170 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
2171 ENOBUFS, 0, 0, 0, 0);
2172 return (ENOBUFS);
2173 }
2174 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
2175 if (error)
2176 goto bad;
2177 socket_unlock(so, 0);
2178 do {
2179 error = uiomove(mtod(m, caddr_t),
2180 imin(uio_resid(uio), m->m_len), uio);
2181 m = m_free(m);
2182 } while (uio_resid(uio) && error == 0 && m != NULL);
2183 socket_lock(so, 0);
2184bad:
2185 if (m != NULL)
2186 m_freem(m);
2187
2188 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
2189 if (error == EWOULDBLOCK || error == EINVAL) {
2190 /*
2191 * Let's try to get normal data:
2192 * EWOULDBLOCK: out-of-band data not
2193 * received yet. EINVAL: out-of-band data
2194 * already read.
2195 */
2196 error = 0;
2197 goto nooob;
2198 } else if (error == 0 && flagsp != NULL) {
2199 *flagsp |= MSG_OOB;
2200 }
2201 }
2202 socket_unlock(so, 1);
2203 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2204 0, 0, 0, 0);
2205
2206 return (error);
2207 }
2208nooob:
2209 if (mp != NULL)
2210 *mp = NULL;
2211 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
2212 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
2213
2214 free_list = NULL;
2215 delayed_copy_len = 0;
2216restart:
2217#ifdef MORE_LOCKING_DEBUG
2218 if (so->so_usecount <= 1)
2219 printf("soreceive: sblock so=%p ref=%d on socket\n",
2220 so, so->so_usecount);
2221#endif
2222 /*
2223 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2224 * and if so just return to the caller. This could happen when
2225 * soreceive() is called by a socket upcall function during the
2226 * time the socket is freed. The socket buffer would have been
2227 * locked across the upcall, therefore we cannot put this thread
2228 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2229 * we may livelock), because the lock on the socket buffer will
2230 * only be released when the upcall routine returns to its caller.
2231 * Because the socket has been officially closed, there can be
2232 * no further read on it.
2233 *
2234 * A multipath subflow socket would have its SS_NOFDREF set by
2235 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2236 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2237 */
2238 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2239 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2240 socket_unlock(so, 1);
2241 return (0);
2242 }
2243
2244 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2245 if (error) {
2246 socket_unlock(so, 1);
2247 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2248 0, 0, 0, 0);
2249 return (error);
2250 }
2251
2252 m = so->so_rcv.sb_mb;
2253 /*
2254 * If we have less data than requested, block awaiting more
2255 * (subject to any timeout) if:
2256 * 1. the current count is less than the low water mark, or
2257 * 2. MSG_WAITALL is set, and it is possible to do the entire
2258 * receive operation at once if we block (resid <= hiwat).
2259 * 3. MSG_DONTWAIT is not set
2260 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2261 * we have to do the receive in sections, and thus risk returning
2262 * a short count if a timeout or signal occurs after we start.
2263 */
2264 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
2265 so->so_rcv.sb_cc < uio_resid(uio)) &&
2266 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2267 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2268 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2269 /*
2270 * Panic if we notice inconsistencies in the socket's
2271 * receive list; both sb_mb and sb_cc should correctly
2272 * reflect the contents of the list, otherwise we may
2273 * end up with false positives during select() or poll()
2274 * which could put the application in a bad state.
2275 */
2276 SB_MB_CHECK(&so->so_rcv);
2277
2278 if (so->so_error) {
2279 if (m != NULL)
2280 goto dontblock;
2281 error = so->so_error;
2282 if ((flags & MSG_PEEK) == 0)
2283 so->so_error = 0;
2284 goto release;
2285 }
2286 if (so->so_state & SS_CANTRCVMORE) {
2287 if (m != NULL)
2288 goto dontblock;
2289 else
2290 goto release;
2291 }
2292 for (; m != NULL; m = m->m_next)
2293 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2294 m = so->so_rcv.sb_mb;
2295 goto dontblock;
2296 }
2297 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2298 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2299 error = ENOTCONN;
2300 goto release;
2301 }
2302 if (uio_resid(uio) == 0)
2303 goto release;
2304 if ((so->so_state & SS_NBIO) ||
2305 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2306 error = EWOULDBLOCK;
2307 goto release;
2308 }
2309 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2310 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2311 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
2312#if EVEN_MORE_LOCKING_DEBUG
2313 if (socket_debug)
2314 printf("Waiting for socket data\n");
2315#endif
2316
2317 error = sbwait(&so->so_rcv);
2318#if EVEN_MORE_LOCKING_DEBUG
2319 if (socket_debug)
2320 printf("SORECEIVE - sbwait returned %d\n", error);
2321#endif
2322 if (so->so_usecount < 1) {
2323 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
2324 __func__, so, so->so_usecount);
2325 /* NOTREACHED */
2326 }
2327 if (error) {
2328 socket_unlock(so, 1);
2329 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2330 0, 0, 0, 0);
2331 return (error);
2332 }
2333 goto restart;
2334 }
2335dontblock:
2336 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2337 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2338 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2339 nextrecord = m->m_nextpkt;
2340 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2341 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2342#if CONFIG_MACF_SOCKET_SUBSET
2343 /*
2344 * Call the MAC framework for policy checking if we're in
2345 * the user process context and the socket isn't connected.
2346 */
2347 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2348 struct mbuf *m0 = m;
2349 /*
2350 * Dequeue this record (temporarily) from the receive
2351 * list since we're about to drop the socket's lock
2352 * where a new record may arrive and be appended to
2353 * the list. Upon MAC policy failure, the record
2354 * will be freed. Otherwise, we'll add it back to
2355 * the head of the list. We cannot rely on SB_LOCK
2356 * because the append operation uses the socket's lock.
2357 */
2358 do {
2359 m->m_nextpkt = NULL;
2360 sbfree(&so->so_rcv, m);
2361 m = m->m_next;
2362 } while (m != NULL);
2363 m = m0;
2364 so->so_rcv.sb_mb = nextrecord;
2365 SB_EMPTY_FIXUP(&so->so_rcv);
2366 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2367 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2368 socket_unlock(so, 0);
2369 if (mac_socket_check_received(proc_ucred(p), so,
2370 mtod(m, struct sockaddr *)) != 0) {
2371 /*
2372 * MAC policy failure; free this record and
2373 * process the next record (or block until
2374 * one is available). We have adjusted sb_cc
2375 * and sb_mbcnt above so there is no need to
2376 * call sbfree() again.
2377 */
2378 do {
2379 m = m_free(m);
2380 } while (m != NULL);
2381 /*
2382 * Clear SB_LOCK but don't unlock the socket.
2383 * Process the next record or wait for one.
2384 */
2385 socket_lock(so, 0);
2386 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2387 goto restart;
2388 }
2389 socket_lock(so, 0);
2390 /*
2391 * If the socket has been defunct'd, drop it.
2392 */
2393 if (so->so_flags & SOF_DEFUNCT) {
2394 m_freem(m);
2395 error = ENOTCONN;
2396 goto release;
2397 }
2398 /*
2399 * Re-adjust the socket receive list and re-enqueue
2400 * the record in front of any packets which may have
2401 * been appended while we dropped the lock.
2402 */
2403 for (m = m0; m->m_next != NULL; m = m->m_next)
2404 sballoc(&so->so_rcv, m);
2405 sballoc(&so->so_rcv, m);
2406 if (so->so_rcv.sb_mb == NULL) {
2407 so->so_rcv.sb_lastrecord = m0;
2408 so->so_rcv.sb_mbtail = m;
2409 }
2410 m = m0;
2411 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2412 so->so_rcv.sb_mb = m;
2413 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2414 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2415 }
2416#endif /* CONFIG_MACF_SOCKET_SUBSET */
2417 orig_resid = 0;
2418 if (psa != NULL) {
2419 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
2420 mp0 == NULL);
2421 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2422 error = EWOULDBLOCK;
2423 goto release;
2424 }
2425 }
2426 if (flags & MSG_PEEK) {
2427 m = m->m_next;
2428 } else {
2429 sbfree(&so->so_rcv, m);
2430 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2431 panic("%s: about to create invalid socketbuf",
2432 __func__);
2433 /* NOTREACHED */
2434 }
2435 MFREE(m, so->so_rcv.sb_mb);
2436 m = so->so_rcv.sb_mb;
2437 if (m != NULL) {
2438 m->m_nextpkt = nextrecord;
2439 } else {
2440 so->so_rcv.sb_mb = nextrecord;
2441 SB_EMPTY_FIXUP(&so->so_rcv);
2442 }
2443 }
2444 }
2445
2446 /*
2447 * Process one or more MT_CONTROL mbufs present before any data mbufs
2448 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2449 * just copy the data; if !MSG_PEEK, we call into the protocol to
2450 * perform externalization.
2451 */
2452 if (m != NULL && m->m_type == MT_CONTROL) {
2453 struct mbuf *cm = NULL, *cmn;
2454 struct mbuf **cme = &cm;
2455 struct sockbuf *sb_rcv = &so->so_rcv;
2456 struct mbuf **msgpcm = NULL;
2457
2458 /*
2459 * Externalizing the control messages would require us to
2460 * drop the socket's lock below. Once we re-acquire the
2461 * lock, the mbuf chain might change. In order to preserve
2462 * consistency, we unlink all control messages from the
2463 * first mbuf chain in one shot and link them separately
2464 * onto a different chain.
2465 */
2466 do {
2467 if (flags & MSG_PEEK) {
2468 if (controlp != NULL) {
2469 if (*controlp == NULL) {
2470 msgpcm = controlp;
2471 }
2472 *controlp = m_copy(m, 0, m->m_len);
2473
2474 /*
2475 * If we failed to allocate an mbuf,
2476 * release any previously allocated
2477 * mbufs for control data. Return
2478 * an error. Keep the mbufs in the
2479 * socket as this is using the
2480 * MSG_PEEK flag.
2481 */
2482 if (*controlp == NULL) {
2483 m_freem(*msgpcm);
2484 error = ENOBUFS;
2485 goto release;
2486 }
2487 controlp = &(*controlp)->m_next;
2488 }
2489 m = m->m_next;
2490 } else {
2491 m->m_nextpkt = NULL;
2492 sbfree(sb_rcv, m);
2493 sb_rcv->sb_mb = m->m_next;
2494 m->m_next = NULL;
2495 *cme = m;
2496 cme = &(*cme)->m_next;
2497 m = sb_rcv->sb_mb;
2498 }
2499 } while (m != NULL && m->m_type == MT_CONTROL);
2500
2501 if (!(flags & MSG_PEEK)) {
2502 if (sb_rcv->sb_mb != NULL) {
2503 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2504 } else {
2505 sb_rcv->sb_mb = nextrecord;
2506 SB_EMPTY_FIXUP(sb_rcv);
2507 }
2508 if (nextrecord == NULL)
2509 sb_rcv->sb_lastrecord = m;
2510 }
2511
2512 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2513 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2514
2515 while (cm != NULL) {
2516 int cmsg_type;
2517
2518 cmn = cm->m_next;
2519 cm->m_next = NULL;
2520 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2521
2522 /*
2523 * Call the protocol to externalize SCM_RIGHTS message
2524 * and return the modified message to the caller upon
2525 * success. Otherwise, all other control messages are
2526 * returned unmodified to the caller. Note that we
2527 * only get into this loop if MSG_PEEK is not set.
2528 */
2529 if (pr->pr_domain->dom_externalize != NULL &&
2530 cmsg_type == SCM_RIGHTS) {
2531 /*
2532 * Release socket lock: see 3903171. This
2533 * would also allow more records to be appended
2534 * to the socket buffer. We still have SB_LOCK
2535 * set on it, so we can be sure that the head
2536 * of the mbuf chain won't change.
2537 */
2538 socket_unlock(so, 0);
2539 error = (*pr->pr_domain->dom_externalize)(cm);
2540 socket_lock(so, 0);
2541 } else {
2542 error = 0;
2543 }
2544
2545 if (controlp != NULL && error == 0) {
2546 *controlp = cm;
2547 controlp = &(*controlp)->m_next;
2548 orig_resid = 0;
2549 } else {
2550 (void) m_free(cm);
2551 }
2552 cm = cmn;
2553 }
2554 /*
2555 * Update the value of nextrecord in case we received new
2556 * records when the socket was unlocked above for
2557 * externalizing SCM_RIGHTS.
2558 */
2559 if (m != NULL)
2560 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2561 else
2562 nextrecord = sb_rcv->sb_mb;
2563 orig_resid = 0;
2564 }
2565
2566 /*
2567 * If the socket is a TCP socket with message delivery
2568 * enabled, then create a control msg to deliver the
2569 * relative TCP sequence number for this data. Waiting
2570 * until this point will protect against failures to
2571 * allocate an mbuf for control msgs.
2572 */
2573 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
2574 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
2575 struct mbuf *seq_cm;
2576
2577 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
2578 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
2579 if (seq_cm == NULL) {
2580 /* unable to allocate a control mbuf */
2581 error = ENOBUFS;
2582 goto release;
2583 }
2584 *controlp = seq_cm;
2585 controlp = &seq_cm->m_next;
2586 }
2587
2588 if (m != NULL) {
2589 if (!(flags & MSG_PEEK)) {
2590 /*
2591 * We get here because m points to an mbuf following
2592 * any MT_SONAME or MT_CONTROL mbufs which have been
2593 * processed above. In any case, m should be pointing
2594 * to the head of the mbuf chain, and the nextrecord
2595 * should be either NULL or equal to m->m_nextpkt.
2596 * See comments above about SB_LOCK.
2597 */
2598 if (m != so->so_rcv.sb_mb ||
2599 m->m_nextpkt != nextrecord) {
2600 panic("%s: post-control !sync so=%p m=%p "
2601 "nextrecord=%p\n", __func__, so, m,
2602 nextrecord);
2603 /* NOTREACHED */
2604 }
2605 if (nextrecord == NULL)
2606 so->so_rcv.sb_lastrecord = m;
2607 }
2608 type = m->m_type;
2609 if (type == MT_OOBDATA)
2610 flags |= MSG_OOB;
2611 } else {
2612 if (!(flags & MSG_PEEK)) {
2613 SB_EMPTY_FIXUP(&so->so_rcv);
2614 }
2615 }
2616 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
2617 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
2618
2619 moff = 0;
2620 offset = 0;
2621
2622 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2623 can_delay = 1;
2624 else
2625 can_delay = 0;
2626
2627 need_event = 0;
2628
2629 while (m != NULL &&
2630 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
2631 if (m->m_type == MT_OOBDATA) {
2632 if (type != MT_OOBDATA)
2633 break;
2634 } else if (type == MT_OOBDATA) {
2635 break;
2636 }
2637 /*
2638 * Make sure to always set MSG_OOB event when getting
2639 * out of band data inline.
2640 */
2641 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2642 (so->so_options & SO_OOBINLINE) != 0 &&
2643 (so->so_state & SS_RCVATMARK) != 0) {
2644 flags |= MSG_OOB;
2645 }
2646 so->so_state &= ~SS_RCVATMARK;
2647 len = uio_resid(uio) - delayed_copy_len;
2648 if (so->so_oobmark && len > so->so_oobmark - offset)
2649 len = so->so_oobmark - offset;
2650 if (len > m->m_len - moff)
2651 len = m->m_len - moff;
2652 /*
2653 * If mp is set, just pass back the mbufs.
2654 * Otherwise copy them out via the uio, then free.
2655 * Sockbuf must be consistent here (sb_mb points to the current
2656 * mbuf, its m_nextpkt to the next record) when we drop priority;
2657 * we must note any additions to the sockbuf when we
2658 * block interrupts again.
2659 */
2660 if (mp == NULL) {
2661 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
2662 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
2663 if (can_delay && len == m->m_len) {
2664 /*
2665 * only delay the copy if we're consuming the
2666 * mbuf and we're NOT in MSG_PEEK mode
2666 * and we have enough data to make it worthwhile
2667 * to drop and retake the lock... can_delay
2668 * reflects the state of the latter two
2669 * constraints; moff should always be zero
2670 * in these cases.
2672 */
2673 delayed_copy_len += len;
2674 } else {
2675 if (delayed_copy_len) {
2676 error = sodelayed_copy(so, uio,
2677 &free_list, &delayed_copy_len);
2678
2679 if (error) {
2680 goto release;
2681 }
2682 /*
2683 * We can only get here if MSG_PEEK is not
2684 * set; therefore, m should point at the
2685 * head of the rcv queue. If it doesn't,
2686 * it means something drastically
2687 * changed while we were out from behind
2688 * the lock in sodelayed_copy, perhaps
2689 * a RST on the stream. In any event,
2690 * the stream has been interrupted. It's
2691 * probably best just to return whatever
2692 * data we've moved and let the caller
2693 * sort it out...
2694 */
2695 if (m != so->so_rcv.sb_mb) {
2696 break;
2697 }
2698 }
2699 socket_unlock(so, 0);
2700 error = uiomove(mtod(m, caddr_t) + moff,
2701 (int)len, uio);
2702 socket_lock(so, 0);
2703
2704 if (error)
2705 goto release;
2706 }
2707 } else {
2708 uio_setresid(uio, (uio_resid(uio) - len));
2709 }
2710 if (len == m->m_len - moff) {
2711 if (m->m_flags & M_EOR)
2712 flags |= MSG_EOR;
2713 if (flags & MSG_PEEK) {
2714 m = m->m_next;
2715 moff = 0;
2716 } else {
2717 nextrecord = m->m_nextpkt;
2718 sbfree(&so->so_rcv, m);
2719 m->m_nextpkt = NULL;
2720
2721 /*
2722 * If this packet is an unordered packet
2723 * (indicated by M_UNORDERED_DATA flag), remove
2724 * the additional bytes added to the
2725 * receive socket buffer size.
2726 */
2727 if ((so->so_flags & SOF_ENABLE_MSGS) &&
2728 m->m_len &&
2729 (m->m_flags & M_UNORDERED_DATA) &&
2730 sbreserve(&so->so_rcv,
2731 so->so_rcv.sb_hiwat - m->m_len)) {
2732 if (so->so_msg_state->msg_uno_bytes >
2733 m->m_len) {
2734 so->so_msg_state->
2735 msg_uno_bytes -= m->m_len;
2736 } else {
2737 so->so_msg_state->
2738 msg_uno_bytes = 0;
2739 }
2740 m->m_flags &= ~M_UNORDERED_DATA;
2741 }
2742
2743 if (mp != NULL) {
2744 *mp = m;
2745 mp = &m->m_next;
2746 so->so_rcv.sb_mb = m = m->m_next;
2747 *mp = NULL;
2748 } else {
2749 if (free_list == NULL)
2750 free_list = m;
2751 else
2752 ml->m_next = m;
2753 ml = m;
2754 so->so_rcv.sb_mb = m = m->m_next;
2755 ml->m_next = NULL;
2756 }
2757 if (m != NULL) {
2758 m->m_nextpkt = nextrecord;
2759 if (nextrecord == NULL)
2760 so->so_rcv.sb_lastrecord = m;
2761 } else {
2762 so->so_rcv.sb_mb = nextrecord;
2763 SB_EMPTY_FIXUP(&so->so_rcv);
2764 }
2765 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
2766 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
2767 }
2768 } else {
2769 if (flags & MSG_PEEK) {
2770 moff += len;
2771 } else {
2772 if (mp != NULL) {
2773 int copy_flag;
2774
2775 if (flags & MSG_DONTWAIT)
2776 copy_flag = M_DONTWAIT;
2777 else
2778 copy_flag = M_WAIT;
2779 *mp = m_copym(m, 0, len, copy_flag);
2780 /*
2781 * Failed to allocate an mbuf?
2782 * Adjust uio_resid back, it was
2783 * adjusted down by len bytes which
2784 * we didn't copy over.
2785 */
2786 if (*mp == NULL) {
2787 uio_setresid(uio,
2788 (uio_resid(uio) + len));
2789 break;
2790 }
2791 }
2792 m->m_data += len;
2793 m->m_len -= len;
2794 so->so_rcv.sb_cc -= len;
2795 }
2796 }
2797 if (so->so_oobmark) {
2798 if ((flags & MSG_PEEK) == 0) {
2799 so->so_oobmark -= len;
2800 if (so->so_oobmark == 0) {
2801 so->so_state |= SS_RCVATMARK;
2802 /*
2803 * delay posting the actual event until
2804 * after any delayed copy processing
2805 * has finished
2806 */
2807 need_event = 1;
2808 break;
2809 }
2810 } else {
2811 offset += len;
2812 if (offset == so->so_oobmark)
2813 break;
2814 }
2815 }
2816 if (flags & MSG_EOR)
2817 break;
2818 /*
2819 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2820 * (for non-atomic socket), we must not quit until
2821 * "uio->uio_resid == 0" or an error termination.
2822 * If a signal/timeout occurs, return with a short
2823 * count but without error. Keep sockbuf locked
2824 * against other readers.
2825 */
2826 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
2827 (uio_resid(uio) - delayed_copy_len) > 0 &&
2828 !sosendallatonce(so) && !nextrecord) {
2829 if (so->so_error || so->so_state & SS_CANTRCVMORE)
2830 goto release;
2831
2832 /*
2833 * Depending on the protocol (e.g. TCP), the following
2834 * might cause the socket lock to be dropped and later
2835 * be reacquired, and more data could have arrived and
2836 * have been appended to the receive socket buffer by
2837 * the time it returns. Therefore, we sleep in
2838 * sbwait() below only if the socket buffer is
2839 * empty, in order to avoid a false sleep.
2840 */
2841 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
2842 (((struct inpcb *)so->so_pcb)->inp_state !=
2843 INPCB_STATE_DEAD))
2844 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2845
2846 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
2847 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
2848
2849 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
2850 error = 0;
2851 goto release;
2852 }
2853 /*
2854 * have to wait until after we get back from the sbwait
2855 * to do the copy because we will drop the lock if we
2856 * have enough data that has been delayed... by dropping
2857 * the lock we open up a window allowing the netisr
2858 * thread to process the incoming packets and to change
2859 * the state of this socket... we're issuing the sbwait
2860 * because the socket is empty and we're expecting the
2861 * netisr thread to wake us up when more packets arrive;
2862 * if we allow that processing to happen and then sbwait
2863 * we could stall forever with packets sitting in the
2864 * socket if no further packets arrive from the remote
2865 * side.
2866 *
2867 * We want to copy before we've collected all the data
2868 * to satisfy this request, to allow the copy to overlap
2869 * the incoming packet processing on an MP system.
2870 */
2871 if (delayed_copy_len > sorecvmincopy &&
2872 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
2873 error = sodelayed_copy(so, uio,
2874 &free_list, &delayed_copy_len);
2875
2876 if (error)
2877 goto release;
2878 }
2879 m = so->so_rcv.sb_mb;
2880 if (m != NULL) {
2881 nextrecord = m->m_nextpkt;
2882 }
2883 SB_MB_CHECK(&so->so_rcv);
2884 }
2885 }
2886#ifdef MORE_LOCKING_DEBUG
2887 if (so->so_usecount <= 1) {
2888 panic("%s: after big while so=%p ref=%d on socket\n",
2889 __func__, so, so->so_usecount);
2890 /* NOTREACHED */
2891 }
2892#endif
2893
2894 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2895 if (so->so_options & SO_DONTTRUNC) {
2896 flags |= MSG_RCVMORE;
2897 } else {
2898 flags |= MSG_TRUNC;
2899 if ((flags & MSG_PEEK) == 0)
2900 (void) sbdroprecord(&so->so_rcv);
2901 }
2902 }
2903
2904 /*
2905 * pru_rcvd below (for TCP) may cause more data to be received
2906 * if the socket lock is dropped prior to sending the ACK; some
2907 * legacy OpenTransport applications don't handle this well
2908 * (if it receives less data than requested while MSG_HAVEMORE
2909 * is set), and so we set the flag now based on what we know
2910 * prior to calling pru_rcvd.
2911 */
2912 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
2913 flags |= MSG_HAVEMORE;
2914
2915 if ((flags & MSG_PEEK) == 0) {
2916 if (m == NULL) {
2917 so->so_rcv.sb_mb = nextrecord;
2918 /*
2919 * First part is an inline SB_EMPTY_FIXUP(). Second
2920 * part makes sure sb_lastrecord is up-to-date if
2921 * there is still data in the socket buffer.
2922 */
2923 if (so->so_rcv.sb_mb == NULL) {
2924 so->so_rcv.sb_mbtail = NULL;
2925 so->so_rcv.sb_lastrecord = NULL;
2926 } else if (nextrecord->m_nextpkt == NULL) {
2927 so->so_rcv.sb_lastrecord = nextrecord;
2928 }
2929 SB_MB_CHECK(&so->so_rcv);
2930 }
2931 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
2932 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
2933 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
2934 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2935 }
2936
2937 if (delayed_copy_len) {
2938 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2939 if (error)
2940 goto release;
2941 }
2942 if (free_list != NULL) {
2943 m_freem_list(free_list);
2944 free_list = NULL;
2945 }
2946 if (need_event)
2947 postevent(so, 0, EV_OOB);
2948
2949 if (orig_resid == uio_resid(uio) && orig_resid &&
2950 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
2951 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
2952 goto restart;
2953 }
2954
2955 if (flagsp != NULL)
2956 *flagsp |= flags;
2957release:
2958#ifdef MORE_LOCKING_DEBUG
2959 if (so->so_usecount <= 1) {
2960 panic("%s: release so=%p ref=%d on socket\n", __func__,
2961 so, so->so_usecount);
2962 /* NOTREACHED */
2963 }
2964#endif
2965 if (delayed_copy_len)
2966 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2967
2968 if (free_list != NULL)
2969 m_freem_list(free_list);
2970
2971 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
2972
2973 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
2974 so->so_rcv.sb_cc, 0, error);
2975
2976 return (error);
2977}
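/*
 * Illustrative sketch (not from this file): the user-space side of the
 * MT_CONTROL handling above.  For AF_UNIX, soreceive() drops the socket
 * lock and calls dom_externalize to turn an SCM_RIGHTS message into a
 * descriptor in the receiver; recvmsg(2) then sees it as ancillary data.
 * "fd" is assumed to be a connected AF_UNIX stream socket.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
recv_one_fd(int fd)
{
	char data;
	char cbuf[CMSG_SPACE(sizeof (int))];
	struct iovec iov = { &data, sizeof (data) };
	struct msghdr msg;
	struct cmsghdr *cmsg;
	int newfd = -1;

	memset(&msg, 0, sizeof (msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof (cbuf);

	if (recvmsg(fd, &msg, 0) == -1)
		return (-1);

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			memcpy(&newfd, CMSG_DATA(cmsg), sizeof (int));
			break;
		}
	}
	return (newfd);
}
#endif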
2978
2979/*
2980 * Returns: 0 Success
2981 * uiomove:EFAULT
2982 */
2983static int
2984sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
2985 user_ssize_t *resid)
2986{
2987 int error = 0;
2988 struct mbuf *m;
2989
2990 m = *free_list;
2991
2992 socket_unlock(so, 0);
2993
2994 while (m != NULL && error == 0) {
2995 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2996 m = m->m_next;
2997 }
2998 m_freem_list(*free_list);
2999
3000 *free_list = NULL;
3001 *resid = 0;
3002
3003 socket_lock(so, 0);
3004
3005 return (error);
3006}
3007
3008/*
3009 * Returns: 0 Success
3010 * EINVAL
3011 * ENOTCONN
3012 * <pru_shutdown>:EINVAL
3013 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
3014 * <pru_shutdown>:ENOBUFS[TCP]
3015 * <pru_shutdown>:EMSGSIZE[TCP]
3016 * <pru_shutdown>:EHOSTUNREACH[TCP]
3017 * <pru_shutdown>:ENETUNREACH[TCP]
3018 * <pru_shutdown>:ENETDOWN[TCP]
3019 * <pru_shutdown>:ENOMEM[TCP]
3020 * <pru_shutdown>:EACCES[TCP]
3023 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
3024 * <pru_shutdown>:??? [other protocol families]
3025 */
3026int
3027soshutdown(struct socket *so, int how)
3028{
3029 int error;
3030
3031 switch (how) {
3032 case SHUT_RD:
3033 case SHUT_WR:
3034 case SHUT_RDWR:
3035 socket_lock(so, 1);
3036 if ((so->so_state &
3037 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
3038 error = ENOTCONN;
3039 } else {
3040 error = soshutdownlock(so, how);
3041 }
3042 socket_unlock(so, 1);
3043 break;
3044 default:
3045 error = EINVAL;
3046 break;
3047 }
3048
3049 return (error);
3050}
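/*
 * Illustrative sketch (not from this file): the user-space half-close
 * that reaches soshutdown() above.  SHUT_WR shuts down only the write
 * side (for TCP this sends a FIN via pru_shutdown), leaving the read
 * side open so any data still in flight from the peer can be drained.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void
half_close_and_drain(int fd)
{
	char buf[512];

	(void) shutdown(fd, SHUT_WR);		/* write side only */
	while (read(fd, buf, sizeof (buf)) > 0)
		;				/* drain until peer closes */
	(void) close(fd);
}
#endif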
3051
3052int
3053soshutdownlock(struct socket *so, int how)
3054{
3055 struct protosw *pr = so->so_proto;
3056 int error = 0;
3057
3058 sflt_notify(so, sock_evt_shutdown, &how);
3059
3060 if (how != SHUT_WR) {
3061 if ((so->so_state & SS_CANTRCVMORE) != 0) {
3062 /* read already shut down */
3063 error = ENOTCONN;
3064 goto done;
3065 }
3066 sorflush(so);
3067 postevent(so, 0, EV_RCLOSED);
3068 }
3069 if (how != SHUT_RD) {
3070 if ((so->so_state & SS_CANTSENDMORE) != 0) {
3071 /* write already shut down */
3072 error = ENOTCONN;
3073 goto done;
3074 }
3075 error = (*pr->pr_usrreqs->pru_shutdown)(so);
3076 postevent(so, 0, EV_WCLOSED);
3077 }
3078done:
3079 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
3080 return (error);
3081}
3082
3083void
3084sowflush(struct socket *so)
3085{
3086 struct sockbuf *sb = &so->so_snd;
3087#ifdef notyet
3088 lck_mtx_t *mutex_held;
3089 /*
3090 * XXX: This code is currently commented out, because we may get here
3091 * as part of sofreelastref(), and at that time, pr_getlock() may no
3092 * longer be able to return us the lock; this will be fixed in future.
3093 */
3094 if (so->so_proto->pr_getlock != NULL)
3095 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3096 else
3097 mutex_held = so->so_proto->pr_domain->dom_mtx;
3098
3099 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3100#endif /* notyet */
3101
3102 /*
3103 * Obtain lock on the socket buffer (SB_LOCK). This is required
3104 * to prevent the socket buffer from being unexpectedly altered
3105 * while it is used by another thread in socket send/receive.
3106 *
3107 * sblock() must not fail here, hence the assertion.
3108 */
3109 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
3110 VERIFY(sb->sb_flags & SB_LOCK);
3111
3112 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
3113 sb->sb_flags |= SB_DROP;
3114 sb->sb_upcall = NULL;
3115 sb->sb_upcallarg = NULL;
3116
3117 sbunlock(sb, TRUE); /* keep socket locked */
3118
3119 selthreadclear(&sb->sb_sel);
3120 sbrelease(sb);
3121}
3122
3123void
3124sorflush(struct socket *so)
3125{
3126 struct sockbuf *sb = &so->so_rcv;
3127 struct protosw *pr = so->so_proto;
3128 struct sockbuf asb;
3129#ifdef notyet
3130 lck_mtx_t *mutex_held;
3131 /*
3132 * XXX: This code is currently commented out, because we may get here
3133 * as part of sofreelastref(), and at that time, pr_getlock() may no
3134 * longer be able to return us the lock; this will be fixed in future.
3135 */
3136 if (so->so_proto->pr_getlock != NULL)
3137 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3138 else
3139 mutex_held = so->so_proto->pr_domain->dom_mtx;
3140
3141 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3142#endif /* notyet */
3143
3144 sflt_notify(so, sock_evt_flush_read, NULL);
3145
3146 socantrcvmore(so);
3147
3148 /*
3149 * Obtain lock on the socket buffer (SB_LOCK). This is required
3150 * to prevent the socket buffer from being unexpectedly altered
3151 * while it is used by another thread in socket send/receive.
3152 *
3153 * sblock() must not fail here, hence the assertion.
3154 */
3155 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
3156 VERIFY(sb->sb_flags & SB_LOCK);
3157
3158 /*
3159 * Copy only the relevant fields from "sb" to "asb" which we
3160 * need for sbrelease() to function. In particular, skip
3161 * sb_sel as it contains the wait queue linkage, which would
3162 * wreak havoc if we were to issue selthreadclear() on "asb".
3163 * Make sure to not carry over SB_LOCK in "asb", as we need
3164 * to acquire it later as part of sbrelease().
3165 */
3166 bzero(&asb, sizeof (asb));
3167 asb.sb_cc = sb->sb_cc;
3168 asb.sb_hiwat = sb->sb_hiwat;
3169 asb.sb_mbcnt = sb->sb_mbcnt;
3170 asb.sb_mbmax = sb->sb_mbmax;
3171 asb.sb_ctl = sb->sb_ctl;
3172 asb.sb_lowat = sb->sb_lowat;
3173 asb.sb_mb = sb->sb_mb;
3174 asb.sb_mbtail = sb->sb_mbtail;
3175 asb.sb_lastrecord = sb->sb_lastrecord;
3176 asb.sb_so = sb->sb_so;
3177 asb.sb_flags = sb->sb_flags;
3178 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
3179 asb.sb_flags |= SB_DROP;
3180
3181 /*
3182 * Ideally we'd bzero() these and preserve the ones we need;
3183 * but to do that we'd need to shuffle things around in the
3184 * sockbuf, and we can't do it now because there are KEXTS
3185 * that are directly referring to the socket structure.
3186 *
3187 * Setting SB_DROP acts as a barrier to prevent further appends.
3188 * Clearing SB_SEL is done for selthreadclear() below.
3189 */
3190 sb->sb_cc = 0;
3191 sb->sb_hiwat = 0;
3192 sb->sb_mbcnt = 0;
3193 sb->sb_mbmax = 0;
3194 sb->sb_ctl = 0;
3195 sb->sb_lowat = 0;
3196 sb->sb_mb = NULL;
3197 sb->sb_mbtail = NULL;
3198 sb->sb_lastrecord = NULL;
3199 sb->sb_timeo.tv_sec = 0;
3200 sb->sb_timeo.tv_usec = 0;
3201 sb->sb_upcall = NULL;
3202 sb->sb_upcallarg = NULL;
3203 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
3204 sb->sb_flags |= SB_DROP;
3205
3206 sbunlock(sb, TRUE); /* keep socket locked */
3207
3208 /*
3209 * Note that selthreadclear() is called on the original "sb" and
3210 * not the local "asb" because of the way wait queue linkage is
3211 * implemented. Given that selwakeup() may be triggered, SB_SEL
3212 * should no longer be set (cleared above.)
3213 */
3214 selthreadclear(&sb->sb_sel);
3215
3216 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
3217 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
3218
3219 sbrelease(&asb);
3220}
3221
3222/*
3223 * Perhaps this routine, and sooptcopyout(), below, ought to come in
3224 * an additional variant to handle the case where the option value needs
3225 * to be some kind of integer, but not a specific size.
3226 * In addition to their use here, these functions are also called by the
3227 * protocol-level pr_ctloutput() routines.
3228 *
3229 * Returns: 0 Success
3230 * EINVAL
3231 * copyin:EFAULT
3232 */
3233int
3234sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
3235{
3236 size_t valsize;
3237
3238 /*
3239 * If the user gives us more than we wanted, we ignore it,
3240 * but if we don't get the minimum length the caller
3241 * wants, we return EINVAL. On success, sopt->sopt_valsize
3242 * is set to however much we actually retrieved.
3243 */
3244 if ((valsize = sopt->sopt_valsize) < minlen)
3245 return (EINVAL);
3246 if (valsize > len)
3247 sopt->sopt_valsize = valsize = len;
3248
3249 if (sopt->sopt_p != kernproc)
3250 return (copyin(sopt->sopt_val, buf, valsize));
3251
3252 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
3253 return (0);
3254}
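/*
 * Illustrative sketch (not from this file): how a protocol-level
 * pr_ctloutput() routine typically consumes an option value with
 * sooptcopyin(), per the comment above.  The option MYPROTO_OPT_FOO
 * and the destination field are hypothetical; only the copy-in and
 * validation pattern is taken from this file.
 */
#if 0
static int
myproto_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
#pragma unused(so)
	int error, optval;

	switch (sopt->sopt_name) {
	case MYPROTO_OPT_FOO:			/* hypothetical option */
		error = sooptcopyin(sopt, &optval, sizeof (optval),
		    sizeof (optval));
		if (error != 0)
			break;
		if (optval < 0) {		/* validate before use */
			error = EINVAL;
			break;
		}
		/* ... store optval in the protocol control block ... */
		break;
	default:
		error = ENOPROTOOPT;
		break;
	}
	return (error);
}
#endif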
3255
3256/*
3257 * sooptcopyin_timeval
3258 * Copy in a timeval value into tv_p, taking into account whether the
3259 * calling process is 64-bit or 32-bit. Moved the sanity checking
3260 * code here so that we can verify the 64-bit tv_sec value before we lose
3261 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
3262 */
3263static int
3264sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
3265{
3266 int error;
3267
3268 if (proc_is64bit(sopt->sopt_p)) {
3269 struct user64_timeval tv64;
3270
3271 if (sopt->sopt_valsize < sizeof (tv64))
3272 return (EINVAL);
3273
3274 sopt->sopt_valsize = sizeof (tv64);
3275 if (sopt->sopt_p != kernproc) {
3276 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
3277 if (error != 0)
3278 return (error);
3279 } else {
3280 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
3281 sizeof (tv64));
3282 }
3283 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
3284 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
3285 return (EDOM);
3286
3287 tv_p->tv_sec = tv64.tv_sec;
3288 tv_p->tv_usec = tv64.tv_usec;
3289 } else {
3290 struct user32_timeval tv32;
3291
3292 if (sopt->sopt_valsize < sizeof (tv32))
3293 return (EINVAL);
3294
3295 sopt->sopt_valsize = sizeof (tv32);
3296 if (sopt->sopt_p != kernproc) {
3297 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
3298 if (error != 0) {
3299 return (error);
3300 }
3301 } else {
3302 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
3303 sizeof (tv32));
3304 }
3305#ifndef __LP64__
3306 /*
3307 * K64todo "comparison is always false due to
3308 * limited range of data type"
3309 */
3310 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
3311 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
3312 return (EDOM);
3313#endif
3314 tv_p->tv_sec = tv32.tv_sec;
3315 tv_p->tv_usec = tv32.tv_usec;
3316 }
3317 return (0);
3318}
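/*
 * Illustrative sketch (not from this file): the user-space side of the
 * timeval copy-in above.  setsockopt(SO_RCVTIMEO) passes a native
 * struct timeval; sooptcopyin_timeval() selects the user32/user64
 * layout from the calling process and rejects out-of-range fields
 * with EDOM.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int
set_recv_timeout(int fd, long seconds)
{
	struct timeval tv;

	tv.tv_sec = seconds;
	tv.tv_usec = 0;		/* must be in [0, 1000000) */

	return (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)));
}
#endif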
3319
3320/*
3321 * Returns: 0 Success
3322 * EINVAL
3323 * ENOPROTOOPT
3324 * ENOBUFS
3325 * EDOM
3326 * sooptcopyin:EINVAL
3327 * sooptcopyin:EFAULT
3328 * sooptcopyin_timeval:EINVAL
3329 * sooptcopyin_timeval:EFAULT
3330 * sooptcopyin_timeval:EDOM
3331 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3332 * <pr_ctloutput>:???
3333 * sflt_attach_private:??? [whatever a filter author chooses]
3334 * <sf_setoption>:??? [whatever a filter author chooses]
3335 *
3336 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
3337 * <sf_setoption> returns depend on what the filter author causes
3338 * their filter to return.
3339 */
3340int
3341sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
3342{
3343 int error, optval;
3344 struct linger l;
3345 struct timeval tv;
3346#if CONFIG_MACF_SOCKET
3347 struct mac extmac;
3348#endif /* MAC_SOCKET */
3349
3350 if (sopt->sopt_dir != SOPT_SET)
3351 sopt->sopt_dir = SOPT_SET;
3352
3353 if (dolock)
3354 socket_lock(so, 1);
3355
3356 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
3357 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
3358 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
3359 /* the socket has been shutdown, no more sockopt's */
3360 error = EINVAL;
3361 goto out;
3362 }
3363
3364 error = sflt_setsockopt(so, sopt);
3365 if (error != 0) {
3366 if (error == EJUSTRETURN)
3367 error = 0;
3368 goto out;
3369 }
3370
3371 if (sopt->sopt_level != SOL_SOCKET) {
3372 if (so->so_proto != NULL &&
3373 so->so_proto->pr_ctloutput != NULL) {
3374 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3375 goto out;
3376 }
3377 error = ENOPROTOOPT;
3378 } else {
3379 /*
3380 * Allow socket-level (SOL_SOCKET) options to be filtered by
3381 * the protocol layer, if needed. A zero value returned from
3382 * the handler means use default socket-level processing as
3383 * done by the rest of this routine. Otherwise, any other
3384 * return value indicates that the option is unsupported.
3385 */
3386 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
3387 pru_socheckopt(so, sopt)) != 0)
3388 goto out;
3389
3390 error = 0;
3391 switch (sopt->sopt_name) {
3392 case SO_LINGER:
3393 case SO_LINGER_SEC:
3394 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
3395 if (error != 0)
3396 goto out;
3397
3398 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
3399 l.l_linger : l.l_linger * hz;
3400 if (l.l_onoff != 0)
3401 so->so_options |= SO_LINGER;
3402 else
3403 so->so_options &= ~SO_LINGER;
3404 break;
3405
3406 case SO_DEBUG:
3407 case SO_KEEPALIVE:
3408 case SO_DONTROUTE:
3409 case SO_USELOOPBACK:
3410 case SO_BROADCAST:
3411 case SO_REUSEADDR:
3412 case SO_REUSEPORT:
3413 case SO_OOBINLINE:
3414 case SO_TIMESTAMP:
3415 case SO_TIMESTAMP_MONOTONIC:
3416 case SO_DONTTRUNC:
3417 case SO_WANTMORE:
3418 case SO_WANTOOBFLAG:
3419 error = sooptcopyin(sopt, &optval, sizeof (optval),
3420 sizeof (optval));
3421 if (error != 0)
3422 goto out;
3423 if (optval)
3424 so->so_options |= sopt->sopt_name;
3425 else
3426 so->so_options &= ~sopt->sopt_name;
3427 break;
3428
3429 case SO_SNDBUF:
3430 case SO_RCVBUF:
3431 case SO_SNDLOWAT:
3432 case SO_RCVLOWAT:
3433 error = sooptcopyin(sopt, &optval, sizeof (optval),
3434 sizeof (optval));
3435 if (error != 0)
3436 goto out;
3437
3438 /*
3439 * Values < 1 make no sense for any of these
3440 * options, so disallow them.
3441 */
3442 if (optval < 1) {
3443 error = EINVAL;
3444 goto out;
3445 }
3446
3447 switch (sopt->sopt_name) {
3448 case SO_SNDBUF:
3449 case SO_RCVBUF: {
3450 struct sockbuf *sb =
3451 (sopt->sopt_name == SO_SNDBUF) ?
3452 &so->so_snd : &so->so_rcv;
3453 if (sbreserve(sb, (u_int32_t)optval) == 0) {
3454 error = ENOBUFS;
3455 goto out;
3456 }
3457 sb->sb_flags |= SB_USRSIZE;
3458 sb->sb_flags &= ~SB_AUTOSIZE;
3459 sb->sb_idealsize = (u_int32_t)optval;
3460 break;
3461 }
3462 /*
3463 * Make sure the low-water is never greater than
3464 * the high-water.
3465 */
3466 case SO_SNDLOWAT:
3467 so->so_snd.sb_lowat =
3468 (optval > so->so_snd.sb_hiwat) ?
3469 so->so_snd.sb_hiwat : optval;
3470 break;
3471 case SO_RCVLOWAT:
3472 so->so_rcv.sb_lowat =
3473 (optval > so->so_rcv.sb_hiwat) ?
3474 so->so_rcv.sb_hiwat : optval;
3475 break;
3476 }
3477 break;
3478
3479 case SO_SNDTIMEO:
3480 case SO_RCVTIMEO:
3481 error = sooptcopyin_timeval(sopt, &tv);
3482 if (error != 0)
3483 goto out;
3484
3485 switch (sopt->sopt_name) {
3486 case SO_SNDTIMEO:
3487 so->so_snd.sb_timeo = tv;
3488 break;
3489 case SO_RCVTIMEO:
3490 so->so_rcv.sb_timeo = tv;
3491 break;
3492 }
3493 break;
3494
3495 case SO_NKE: {
3496 struct so_nke nke;
3497
3498 error = sooptcopyin(sopt, &nke, sizeof (nke),
3499 sizeof (nke));
3500 if (error != 0)
3501 goto out;
3502
3503 error = sflt_attach_internal(so, nke.nke_handle);
3504 break;
3505 }
3506
3507 case SO_NOSIGPIPE:
3508 error = sooptcopyin(sopt, &optval, sizeof (optval),
3509 sizeof (optval));
3510 if (error != 0)
3511 goto out;
3512 if (optval != 0)
3513 so->so_flags |= SOF_NOSIGPIPE;
3514 else
3515 so->so_flags &= ~SOF_NOSIGPIPE;
3516 break;
3517
3518 case SO_NOADDRERR:
3519 error = sooptcopyin(sopt, &optval, sizeof (optval),
3520 sizeof (optval));
3521 if (error != 0)
3522 goto out;
3523 if (optval != 0)
3524 so->so_flags |= SOF_NOADDRAVAIL;
3525 else
3526 so->so_flags &= ~SOF_NOADDRAVAIL;
3527 break;
3528
3529 case SO_REUSESHAREUID:
3530 error = sooptcopyin(sopt, &optval, sizeof (optval),
3531 sizeof (optval));
3532 if (error != 0)
3533 goto out;
3534 if (optval != 0)
3535 so->so_flags |= SOF_REUSESHAREUID;
3536 else
3537 so->so_flags &= ~SOF_REUSESHAREUID;
3538 break;
3539
3540 case SO_NOTIFYCONFLICT:
3541 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3542 error = EPERM;
3543 goto out;
3544 }
3545 error = sooptcopyin(sopt, &optval, sizeof (optval),
3546 sizeof (optval));
3547 if (error != 0)
3548 goto out;
3549 if (optval != 0)
3550 so->so_flags |= SOF_NOTIFYCONFLICT;
3551 else
3552 so->so_flags &= ~SOF_NOTIFYCONFLICT;
3553 break;
3554
3555 case SO_RESTRICTIONS:
3556 error = sooptcopyin(sopt, &optval, sizeof (optval),
3557 sizeof (optval));
3558 if (error != 0)
3559 goto out;
3560
3561 error = so_set_restrictions(so, optval);
3562 break;
3563
3564 case SO_LABEL:
3565#if CONFIG_MACF_SOCKET
3566 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3567 sizeof (extmac))) != 0)
3568 goto out;
3569
3570 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
3571 so, &extmac);
3572#else
3573 error = EOPNOTSUPP;
3574#endif /* MAC_SOCKET */
3575 break;
3576
3577 case SO_UPCALLCLOSEWAIT:
3578 error = sooptcopyin(sopt, &optval, sizeof (optval),
3579 sizeof (optval));
3580 if (error != 0)
3581 goto out;
3582 if (optval != 0)
3583 so->so_flags |= SOF_UPCALLCLOSEWAIT;
3584 else
3585 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
3586 break;
3587
3588 case SO_RANDOMPORT:
3589 error = sooptcopyin(sopt, &optval, sizeof (optval),
3590 sizeof (optval));
3591 if (error != 0)
3592 goto out;
3593 if (optval != 0)
3594 so->so_flags |= SOF_BINDRANDOMPORT;
3595 else
3596 so->so_flags &= ~SOF_BINDRANDOMPORT;
3597 break;
3598
3599 case SO_NP_EXTENSIONS: {
3600 struct so_np_extensions sonpx;
3601
3602 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
3603 sizeof (sonpx));
3604 if (error != 0)
3605 goto out;
3606 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
3607 error = EINVAL;
3608 goto out;
3609 }
3610 /*
3611 * Only one bit defined for now
3612 */
3613 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
3614 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
3615 so->so_flags |= SOF_NPX_SETOPTSHUT;
3616 else
3617 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
3618 }
3619 break;
3620 }
3621
3622 case SO_TRAFFIC_CLASS: {
3623 error = sooptcopyin(sopt, &optval, sizeof (optval),
3624 sizeof (optval));
3625 if (error != 0)
3626 goto out;
3627 error = so_set_traffic_class(so, optval);
3628 if (error != 0)
3629 goto out;
3630 break;
3631 }
3632
3633 case SO_RECV_TRAFFIC_CLASS: {
3634 error = sooptcopyin(sopt, &optval, sizeof (optval),
3635 sizeof (optval));
3636 if (error != 0)
3637 goto out;
3638 if (optval == 0)
3639 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
3640 else
3641 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
3642 break;
3643 }
3644
3645 case SO_TRAFFIC_CLASS_DBG: {
3646 struct so_tcdbg so_tcdbg;
3647
3648 error = sooptcopyin(sopt, &so_tcdbg,
3649 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
3650 if (error != 0)
3651 goto out;
3652 error = so_set_tcdbg(so, &so_tcdbg);
3653 if (error != 0)
3654 goto out;
3655 break;
3656 }
3657
3658 case SO_PRIVILEGED_TRAFFIC_CLASS:
3659 error = priv_check_cred(kauth_cred_get(),
3660 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
3661 if (error != 0)
3662 goto out;
3663 error = sooptcopyin(sopt, &optval, sizeof (optval),
3664 sizeof (optval));
3665 if (error != 0)
3666 goto out;
3667 if (optval == 0)
3668 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
3669 else
3670 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
3671 break;
3672
3673 case SO_DEFUNCTOK:
3674 error = sooptcopyin(sopt, &optval, sizeof (optval),
3675 sizeof (optval));
3676 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
3677 if (error == 0)
3678 error = EBADF;
3679 goto out;
3680 }
3681 /*
3682 * Any process can set SO_DEFUNCTOK (clear
3683 * SOF_NODEFUNCT), but only root can clear
3684 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
3685 */
3686 if (optval == 0 &&
3687 kauth_cred_issuser(kauth_cred_get()) == 0) {
3688 error = EPERM;
3689 goto out;
3690 }
3691 if (optval)
3692 so->so_flags &= ~SOF_NODEFUNCT;
3693 else
3694 so->so_flags |= SOF_NODEFUNCT;
3695
3696 if (SOCK_DOM(so) == PF_INET ||
3697 SOCK_DOM(so) == PF_INET6) {
3698 char s[MAX_IPv6_STR_LEN];
3699 char d[MAX_IPv6_STR_LEN];
3700 struct inpcb *inp = sotoinpcb(so);
3701
3702 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
3703 "%s:%d] is now marked as %seligible for "
3704 "defunct\n", __func__, proc_selfpid(),
3705 (uint64_t)VM_KERNEL_ADDRPERM(so),
3706 (SOCK_TYPE(so) == SOCK_STREAM) ?
3707 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
3708 ((SOCK_DOM(so) == PF_INET) ?
3709 (void *)&inp->inp_laddr.s_addr :
3710 (void *)&inp->in6p_laddr), s, sizeof (s)),
3711 ntohs(inp->in6p_lport),
3712 inet_ntop(SOCK_DOM(so),
3713 (SOCK_DOM(so) == PF_INET) ?
3714 (void *)&inp->inp_faddr.s_addr :
3715 (void *)&inp->in6p_faddr, d, sizeof (d)),
3716 ntohs(inp->in6p_fport),
3717 (so->so_flags & SOF_NODEFUNCT) ?
3718 "not " : ""));
3719 } else {
3720 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
3721 "now marked as %seligible for defunct\n",
3722 __func__, proc_selfpid(),
3723 (uint64_t)VM_KERNEL_ADDRPERM(so),
3724 SOCK_DOM(so), SOCK_TYPE(so),
3725 (so->so_flags & SOF_NODEFUNCT) ?
3726 "not " : ""));
3727 }
3728 break;
3729
3730 case SO_ISDEFUNCT:
3731 /* This option is not settable */
3732 error = EINVAL;
3733 break;
3734
3735 case SO_OPPORTUNISTIC:
3736 error = sooptcopyin(sopt, &optval, sizeof (optval),
3737 sizeof (optval));
3738 if (error == 0)
3739 error = so_set_opportunistic(so, optval);
3740 break;
3741
3742 case SO_FLUSH:
3743 /* This option is handled by lower layer(s) */
3744 error = 0;
3745 break;
3746
3747 case SO_RECV_ANYIF:
3748 error = sooptcopyin(sopt, &optval, sizeof (optval),
3749 sizeof (optval));
3750 if (error == 0)
3751 error = so_set_recv_anyif(so, optval);
3752 break;
3753
3754 case SO_TRAFFIC_MGT_BACKGROUND: {
3755 /* This option is handled by lower layer(s) */
3756 error = 0;
3757 break;
3758 }
3759
3760#if FLOW_DIVERT
3761 case SO_FLOW_DIVERT_TOKEN:
3762 error = flow_divert_token_set(so, sopt);
3763 break;
3764#endif /* FLOW_DIVERT */
3765
3766
3767 case SO_DELEGATED:
3768 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
3769 sizeof (optval))) != 0)
3770 break;
3771
3772 error = so_set_effective_pid(so, optval, sopt->sopt_p);
3773 break;
3774
3775 case SO_DELEGATED_UUID: {
3776 uuid_t euuid;
3777
3778 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
3779 sizeof (euuid))) != 0)
3780 break;
3781
3782 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
3783 break;
3784 }
3785
3786 default:
3787 error = ENOPROTOOPT;
3788 break;
3789 }
3790 if (error == 0 && so->so_proto != NULL &&
3791 so->so_proto->pr_ctloutput != NULL) {
3792 (void) so->so_proto->pr_ctloutput(so, sopt);
3793 }
3794 }
3795out:
3796 if (dolock)
3797 socket_unlock(so, 1);
3798 return (error);
3799}
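/*
 * Illustrative sketch (not from this file): user-space callers of two
 * SOL_SOCKET options handled above.  SO_LINGER_SEC takes its interval
 * in seconds (sosetoptlock() scales it by hz), and SO_NOSIGPIPE sets
 * SOF_NOSIGPIPE so writes on a dead connection fail with EPIPE instead
 * of raising SIGPIPE.
 */
#if 0
#include <sys/socket.h>

static int
configure_socket(int fd)
{
	struct linger l = { 1, 5 };	/* linger on close for 5 seconds */
	int one = 1;

	if (setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof (l)) == -1)
		return (-1);
	if (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof (one)) == -1)
		return (-1);
	return (0);
}
#endif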
3800
3801/* Helper routines for getsockopt */
3802int
3803sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
3804{
3805 int error;
3806 size_t valsize;
3807
3808 error = 0;
3809
3810 /*
3811 * Documented get behavior is that we always return a value,
3812 * possibly truncated to fit in the user's buffer.
3813 * Traditional behavior is that we always tell the user
3814 * precisely how much we copied, rather than something useful
3815 * like the total amount we had available for her.
3816 * Note that this interface is not idempotent; the entire answer must
3817 * be generated ahead of time.
3818 */
3819 valsize = min(len, sopt->sopt_valsize);
3820 sopt->sopt_valsize = valsize;
3821 if (sopt->sopt_val != USER_ADDR_NULL) {
3822 if (sopt->sopt_p != kernproc)
3823 error = copyout(buf, sopt->sopt_val, valsize);
3824 else
3825 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3826 }
3827 return (error);
3828}
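/*
 * Illustrative sketch (not from this file): a user-space getsockopt()
 * whose result comes back through sooptcopyout() above.  SO_NREAD
 * (handled in sogetoptlock() below) reports the bytes available to
 * read; as documented above, the value is truncated to fit the
 * caller's buffer.
 */
#if 0
#include <sys/socket.h>

static int
bytes_readable(int fd)
{
	int nread = 0;
	socklen_t len = sizeof (nread);

	if (getsockopt(fd, SOL_SOCKET, SO_NREAD, &nread, &len) == -1)
		return (-1);
	return (nread);
}
#endif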
3829
3830static int
3831sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
3832{
3833 int error;
3834 size_t len;
3835 struct user64_timeval tv64;
3836 struct user32_timeval tv32;
3837 const void * val;
3838 size_t valsize;
3839
3840 error = 0;
3841 if (proc_is64bit(sopt->sopt_p)) {
3842 len = sizeof (tv64);
3843 tv64.tv_sec = tv_p->tv_sec;
3844 tv64.tv_usec = tv_p->tv_usec;
3845 val = &tv64;
3846 } else {
3847 len = sizeof (tv32);
3848 tv32.tv_sec = tv_p->tv_sec;
3849 tv32.tv_usec = tv_p->tv_usec;
3850 val = &tv32;
3851 }
3852 valsize = min(len, sopt->sopt_valsize);
3853 sopt->sopt_valsize = valsize;
3854 if (sopt->sopt_val != USER_ADDR_NULL) {
3855 if (sopt->sopt_p != kernproc)
3856 error = copyout(val, sopt->sopt_val, valsize);
3857 else
3858 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3859 }
3860 return (error);
3861}
3862
3863/*
3864 * Return: 0 Success
3865 * ENOPROTOOPT
3866 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3867 * <pr_ctloutput>:???
3868 * <sf_getoption>:???
3869 */
3870int
3871sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
3872{
3873 int error, optval;
3874 struct linger l;
3875 struct timeval tv;
3876#if CONFIG_MACF_SOCKET
3877 struct mac extmac;
3878#endif /* MAC_SOCKET */
3879
3880 if (sopt->sopt_dir != SOPT_GET)
3881 sopt->sopt_dir = SOPT_GET;
3882
3883 if (dolock)
3884 socket_lock(so, 1);
3885
3886 error = sflt_getsockopt(so, sopt);
3887 if (error != 0) {
3888 if (error == EJUSTRETURN)
3889 error = 0;
3890 goto out;
3891 }
3892
3893 if (sopt->sopt_level != SOL_SOCKET) {
3894 if (so->so_proto != NULL &&
3895 so->so_proto->pr_ctloutput != NULL) {
3896 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3897 goto out;
3898 }
3899 error = ENOPROTOOPT;
3900 } else {
3901 /*
3902 * Allow socket-level (SOL_SOCKET) options to be filtered by
3903 * the protocol layer, if needed. A zero value returned from
3904 * the handler means use default socket-level processing as
3905 * done by the rest of this routine. Otherwise, any other
3906 * return value indicates that the option is unsupported.
3907 */
3908 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
3909 pru_socheckopt(so, sopt)) != 0)
3910 goto out;
3911
3912 error = 0;
3913 switch (sopt->sopt_name) {
3914 case SO_LINGER:
3915 case SO_LINGER_SEC:
3916 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
3917 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
3918 so->so_linger : so->so_linger / hz;
3919 error = sooptcopyout(sopt, &l, sizeof (l));
3920 break;
3921
3922 case SO_USELOOPBACK:
3923 case SO_DONTROUTE:
3924 case SO_DEBUG:
3925 case SO_KEEPALIVE:
3926 case SO_REUSEADDR:
3927 case SO_REUSEPORT:
3928 case SO_BROADCAST:
3929 case SO_OOBINLINE:
3930 case SO_TIMESTAMP:
3931 case SO_TIMESTAMP_MONOTONIC:
3932 case SO_DONTTRUNC:
3933 case SO_WANTMORE:
3934 case SO_WANTOOBFLAG:
3935 optval = so->so_options & sopt->sopt_name;
3936integer:
3937 error = sooptcopyout(sopt, &optval, sizeof (optval));
3938 break;
3939
3940 case SO_TYPE:
3941 optval = so->so_type;
3942 goto integer;
3943
3944 case SO_NREAD:
3945 if (so->so_proto->pr_flags & PR_ATOMIC) {
3946 int pkt_total;
3947 struct mbuf *m1;
3948
3949 pkt_total = 0;
3950 m1 = so->so_rcv.sb_mb;
3951 while (m1 != NULL) {
3952 if (m1->m_type == MT_DATA ||
3953 m1->m_type == MT_HEADER ||
3954 m1->m_type == MT_OOBDATA)
3955 pkt_total += m1->m_len;
3956 m1 = m1->m_next;
3957 }
3958 optval = pkt_total;
3959 } else {
3960 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3961 }
3962 goto integer;
3963
3964 case SO_NWRITE:
3965 optval = so->so_snd.sb_cc;
3966 goto integer;
3967
3968 case SO_ERROR:
3969 optval = so->so_error;
3970 so->so_error = 0;
3971 goto integer;
3972
3973 case SO_SNDBUF:
3974 optval = so->so_snd.sb_hiwat;
3975 goto integer;
3976
3977 case SO_RCVBUF:
3978 optval = so->so_rcv.sb_hiwat;
3979 goto integer;
3980
3981 case SO_SNDLOWAT:
3982 optval = so->so_snd.sb_lowat;
3983 goto integer;
3984
3985 case SO_RCVLOWAT:
3986 optval = so->so_rcv.sb_lowat;
3987 goto integer;
3988
3989 case SO_SNDTIMEO:
3990 case SO_RCVTIMEO:
3991 tv = (sopt->sopt_name == SO_SNDTIMEO ?
3992 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3993
3994 error = sooptcopyout_timeval(sopt, &tv);
3995 break;
3996
3997 case SO_NOSIGPIPE:
3998 optval = (so->so_flags & SOF_NOSIGPIPE);
3999 goto integer;
4000
4001 case SO_NOADDRERR:
4002 optval = (so->so_flags & SOF_NOADDRAVAIL);
4003 goto integer;
4004
4005 case SO_REUSESHAREUID:
4006 optval = (so->so_flags & SOF_REUSESHAREUID);
4007 goto integer;
4008
4009
4010 case SO_NOTIFYCONFLICT:
4011 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
4012 goto integer;
4013
4014 case SO_RESTRICTIONS:
4015 optval = so_get_restrictions(so);
4016 goto integer;
4017
4018 case SO_LABEL:
4019#if CONFIG_MACF_SOCKET
4020 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4021 sizeof (extmac))) != 0 ||
4022 (error = mac_socket_label_get(proc_ucred(
4023 sopt->sopt_p), so, &extmac)) != 0)
4024 break;
4025
4026 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
4027#else
4028 error = EOPNOTSUPP;
4029#endif /* CONFIG_MACF_SOCKET */
4030 break;
4031
4032 case SO_PEERLABEL:
4033#if CONFIG_MACF_SOCKET
4034 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4035 sizeof (extmac))) != 0 ||
4036 (error = mac_socketpeer_label_get(proc_ucred(
4037 sopt->sopt_p), so, &extmac)) != 0)
4038 break;
4039
4040 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
4041#else
4042 error = EOPNOTSUPP;
4043#endif /* CONFIG_MACF_SOCKET */
4044 break;
4045
4046#ifdef __APPLE_API_PRIVATE
4047 case SO_UPCALLCLOSEWAIT:
4048 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
4049 goto integer;
4050#endif
4051 case SO_RANDOMPORT:
4052 optval = (so->so_flags & SOF_BINDRANDOMPORT);
4053 goto integer;
4054
4055 case SO_NP_EXTENSIONS: {
4056 struct so_np_extensions sonpx;
4057
4058 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
4059 SONPX_SETOPTSHUT : 0;
4060 sonpx.npx_mask = SONPX_MASK_VALID;
4061
4062 error = sooptcopyout(sopt, &sonpx,
4063 sizeof (struct so_np_extensions));
4064 break;
4065 }
4066
4067 case SO_TRAFFIC_CLASS:
4068 optval = so->so_traffic_class;
4069 goto integer;
4070
4071 case SO_RECV_TRAFFIC_CLASS:
4072 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
4073 goto integer;
4074
4075 case SO_TRAFFIC_CLASS_STATS:
4076 error = sooptcopyout(sopt, &so->so_tc_stats,
4077 sizeof (so->so_tc_stats));
4078 break;
4079
4080 case SO_TRAFFIC_CLASS_DBG:
4081 error = sogetopt_tcdbg(so, sopt);
4082 break;
4083
4084 case SO_PRIVILEGED_TRAFFIC_CLASS:
4085 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
4086 goto integer;
4087
4088 case SO_DEFUNCTOK:
4089 optval = !(so->so_flags & SOF_NODEFUNCT);
4090 goto integer;
4091
4092 case SO_ISDEFUNCT:
4093 optval = (so->so_flags & SOF_DEFUNCT);
4094 goto integer;
4095
4096 case SO_OPPORTUNISTIC:
4097 optval = so_get_opportunistic(so);
4098 goto integer;
4099
4100 case SO_FLUSH:
4101 /* This option is not gettable */
4102 error = EINVAL;
4103 break;
4104
4105 case SO_RECV_ANYIF:
4106 optval = so_get_recv_anyif(so);
4107 goto integer;
4108
4109 case SO_TRAFFIC_MGT_BACKGROUND:
4110 /* This option is handled by lower layer(s) */
4111 if (so->so_proto != NULL &&
4112 so->so_proto->pr_ctloutput != NULL) {
4113 (void) so->so_proto->pr_ctloutput(so, sopt);
4114 }
4115 break;
4116
4117#if FLOW_DIVERT
4118 case SO_FLOW_DIVERT_TOKEN:
4119 error = flow_divert_token_get(so, sopt);
4120 break;
4121#endif /* FLOW_DIVERT */
4122
4123 default:
4124 error = ENOPROTOOPT;
4125 break;
4126 }
4127 }
4128out:
4129 if (dolock)
4130 socket_unlock(so, 1);
4131 return (error);
4132}
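
/*
 * Illustrative userland use of one of the Darwin-specific options handled
 * above (documentation only; not part of this file or the kernel build).
 * The descriptor "s" is assumed to be an open socket.
 *
 *	int nread = 0;
 *	socklen_t len = sizeof (nread);
 *	if (getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len) == 0)
 *		printf("%d bytes available to read\n", nread);
 */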
4133
4134/*
4135 * The size limit on our soopt_getm() differs from FreeBSD's: we limit
4136 * the size of options to MCLBYTES. This will have to change if we ever
4137 * need to define options that require more space than MCLBYTES.
4138 */
4139int
4140soopt_getm(struct sockopt *sopt, struct mbuf **mp)
4141{
4142 struct mbuf *m, *m_prev;
4143 int sopt_size = sopt->sopt_valsize;
4144 int how;
4145
4146 if (sopt_size <= 0 || sopt_size > MCLBYTES)
4147 return (EMSGSIZE);
4148
4149 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
4150 MGET(m, how, MT_DATA);
4151 if (m == NULL)
4152 return (ENOBUFS);
4153 if (sopt_size > MLEN) {
4154 MCLGET(m, how);
4155 if ((m->m_flags & M_EXT) == 0) {
4156 m_free(m);
4157 return (ENOBUFS);
4158 }
4159 m->m_len = min(MCLBYTES, sopt_size);
4160 } else {
4161 m->m_len = min(MLEN, sopt_size);
4162 }
4163 sopt_size -= m->m_len;
4164 *mp = m;
4165 m_prev = m;
4166
4167 while (sopt_size > 0) {
4168 MGET(m, how, MT_DATA);
4169 if (m == NULL) {
4170 m_freem(*mp);
4171 return (ENOBUFS);
4172 }
4173 if (sopt_size > MLEN) {
4174 MCLGET(m, how);
4175 if ((m->m_flags & M_EXT) == 0) {
4176 m_freem(*mp);
4177 m_freem(m);
4178 return (ENOBUFS);
4179 }
4180 m->m_len = min(MCLBYTES, sopt_size);
4181 } else {
4182 m->m_len = min(MLEN, sopt_size);
4183 }
4184 sopt_size -= m->m_len;
4185 m_prev->m_next = m;
4186 m_prev = m;
4187 }
4188 return (0);
4189}
4190
4191/* copyin sopt data into mbuf chain */
4192int
4193soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
4194{
4195 struct mbuf *m0 = m;
4196
4197 if (sopt->sopt_val == USER_ADDR_NULL)
4198 return (0);
4199 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
4200 if (sopt->sopt_p != kernproc) {
4201 int error;
4202
4203 error = copyin(sopt->sopt_val, mtod(m, char *),
4204 m->m_len);
4205 if (error != 0) {
4206 m_freem(m0);
4207 return (error);
4208 }
4209 } else {
4210 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
4211 mtod(m, char *), m->m_len);
4212 }
4213 sopt->sopt_valsize -= m->m_len;
4214 sopt->sopt_val += m->m_len;
4215 m = m->m_next;
4216 }
4217 /* enough space should have been allocated at ip6_sooptmcopyin() */
4218 if (m != NULL) {
4219 panic("soopt_mcopyin");
4220 /* NOTREACHED */
4221 }
4222 return (0);
4223}
4224
4225/* copyout mbuf chain data into soopt */
4226int
4227soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
4228{
4229 struct mbuf *m0 = m;
4230 size_t valsize = 0;
4231
4232 if (sopt->sopt_val == USER_ADDR_NULL)
4233 return (0);
4234 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
4235 if (sopt->sopt_p != kernproc) {
4236 int error;
4237
4238 error = copyout(mtod(m, char *), sopt->sopt_val,
4239 m->m_len);
4240 if (error != 0) {
4241 m_freem(m0);
4242 return (error);
4243 }
4244 } else {
4245 bcopy(mtod(m, char *),
4246 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
4247 }
4248 sopt->sopt_valsize -= m->m_len;
4249 sopt->sopt_val += m->m_len;
4250 valsize += m->m_len;
4251 m = m->m_next;
4252 }
4253 if (m != NULL) {
4254 /* user-land should have supplied a large enough soopt buffer */
4255 m_freem(m0);
4256 return (EINVAL);
4257 }
4258 sopt->sopt_valsize = valsize;
4259 return (0);
4260}
4261
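/*
 * Notify the socket owner that out-of-band data has arrived: deliver
 * SIGURG to the process or process group recorded in so_pgid and wake
 * up any threads selecting or polling on the receive buffer.
 */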
4262void
4263sohasoutofband(struct socket *so)
4264{
4265 if (so->so_pgid < 0)
4266 gsignal(-so->so_pgid, SIGURG);
4267 else if (so->so_pgid > 0)
4268 proc_signal(so->so_pgid, SIGURG);
4269 selwakeup(&so->so_rcv.sb_sel);
4270}
4271
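/*
 * poll(2)/select(2) back-end for sockets.  Returns the subset of the
 * requested events that are currently true; if none are, the calling
 * thread is registered via selrecord() on the send and/or receive
 * selinfo so it can be woken once the socket becomes ready.
 */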
4272int
4273sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
4274{
4275#pragma unused(cred)
4276 struct proc *p = current_proc();
4277 int revents = 0;
4278
4279 socket_lock(so, 1);
4280 so_update_last_owner_locked(so, PROC_NULL);
4281 so_update_policy(so);
4282
4283 if (events & (POLLIN | POLLRDNORM))
4284 if (soreadable(so))
4285 revents |= events & (POLLIN | POLLRDNORM);
4286
4287 if (events & (POLLOUT | POLLWRNORM))
4288 if (sowriteable(so))
4289 revents |= events & (POLLOUT | POLLWRNORM);
4290
4291 if (events & (POLLPRI | POLLRDBAND))
4292 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
4293 revents |= events & (POLLPRI | POLLRDBAND);
4294
4295 if (revents == 0) {
4296 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
4297 /*
4298 * Darwin sets the flag first,
4299 * BSD calls selrecord first
4300 */
4301 so->so_rcv.sb_flags |= SB_SEL;
4302 selrecord(p, &so->so_rcv.sb_sel, wql);
4303 }
4304
4305 if (events & (POLLOUT | POLLWRNORM)) {
4306 /*
4307 * Darwin sets the flag first,
4308 * BSD calls selrecord first
4309 */
4310 so->so_snd.sb_flags |= SB_SEL;
4311 selrecord(p, &so->so_snd.sb_sel, wql);
4312 }
4313 }
4314
4315 socket_unlock(so, 1);
4316 return (revents);
4317}
4318
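/*
 * kqueue attach routine for sockets.  Picks the read, write or
 * socket-event filter operations based on kn_filter, links the knote
 * onto the corresponding klist, and marks the socket buffer (or the
 * socket itself for EVFILT_SOCK) as having knotes attached.
 */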
4319int
4320soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
4321{
4322#pragma unused(fp)
4323#if !CONFIG_MACF_SOCKET
4324#pragma unused(ctx)
4325#endif /* !CONFIG_MACF_SOCKET */
4326 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4327 struct klist *skl;
4328
4329 socket_lock(so, 1);
4330 so_update_last_owner_locked(so, PROC_NULL);
4331 so_update_policy(so);
4332
4333#if CONFIG_MACF_SOCKET
4334 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
4335 kn, so) != 0) {
4336 socket_unlock(so, 1);
4337 return (1);
4338 }
4339#endif /* CONFIG_MACF_SOCKET */
4340
4341 switch (kn->kn_filter) {
4342 case EVFILT_READ:
4343 kn->kn_fop = &soread_filtops;
4344 skl = &so->so_rcv.sb_sel.si_note;
4345 break;
4346 case EVFILT_WRITE:
4347 kn->kn_fop = &sowrite_filtops;
4348 skl = &so->so_snd.sb_sel.si_note;
4349 break;
4350 case EVFILT_SOCK:
4351 kn->kn_fop = &sock_filtops;
4352 skl = &so->so_klist;
4353 break;
4354 default:
4355 socket_unlock(so, 1);
4356 return (1);
4357 }
4358
4359 if (KNOTE_ATTACH(skl, kn)) {
4360 switch (kn->kn_filter) {
4361 case EVFILT_READ:
4362 so->so_rcv.sb_flags |= SB_KNOTE;
4363 break;
4364 case EVFILT_WRITE:
4365 so->so_snd.sb_flags |= SB_KNOTE;
4366 break;
4367 case EVFILT_SOCK:
4368 so->so_flags |= SOF_KNOTE;
4369 break;
4370 default:
4371 socket_unlock(so, 1);
4372 return (1);
4373 }
4374 }
4375 socket_unlock(so, 1);
4376 return (0);
4377}
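
/*
 * Illustrative userland registration of a read filter, which reaches the
 * kernel through soo_kqfilter() above (documentation only; not part of
 * the kernel build).  "s" is assumed to be an open socket descriptor.
 *
 *	int kq = kqueue();
 *	struct kevent kev;
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */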
4378
4379static void
4380filt_sordetach(struct knote *kn)
4381{
4382 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4383
4384 socket_lock(so, 1);
4385 if (so->so_rcv.sb_flags & SB_KNOTE)
4386 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
4387 so->so_rcv.sb_flags &= ~SB_KNOTE;
4388 socket_unlock(so, 1);
4389}
4390
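/*
 * EVFILT_READ filter for sockets.  For listening sockets the knote fires
 * when the completed connection queue is non-empty; otherwise it reports
 * the number of bytes available to read, honoring any NOTE_LOWAT
 * low-water mark, out-of-band marks and EOF/error state.
 */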
4391/*ARGSUSED*/
4392static int
4393filt_soread(struct knote *kn, long hint)
4394{
4395 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4396
4397 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4398 socket_lock(so, 1);
4399
4400 if (so->so_options & SO_ACCEPTCONN) {
4401 int is_not_empty;
4402
4403 /*
4404 * Radar 6615193: handle the listen case dynamically for the
4405 * kqueue read filter. This allows listen() to be called after
4406 * the EVFILT_READ knote has been registered.
4407 */
4408
4409 kn->kn_data = so->so_qlen;
4410 is_not_empty = !TAILQ_EMPTY(&so->so_comp);
4411
4412 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4413 socket_unlock(so, 1);
4414
4415 return (is_not_empty);
4416 }
4417
4418 /* socket isn't a listener */
4419
4420 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
4421
4422 if (so->so_oobmark) {
4423 if (kn->kn_flags & EV_OOBAND) {
4424 kn->kn_data -= so->so_oobmark;
4425 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4426 socket_unlock(so, 1);
4427 return (1);
4428 }
4429 kn->kn_data = so->so_oobmark;
4430 kn->kn_flags |= EV_OOBAND;
4431 } else {
4432 if (so->so_state & SS_CANTRCVMORE) {
4433 kn->kn_flags |= EV_EOF;
4434 kn->kn_fflags = so->so_error;
4435 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4436 socket_unlock(so, 1);
4437 return (1);
4438 }
4439 }
4440
4441 if (so->so_state & SS_RCVATMARK) {
4442 if (kn->kn_flags & EV_OOBAND) {
4443 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4444 socket_unlock(so, 1);
4445 return (1);
4446 }
4447 kn->kn_flags |= EV_OOBAND;
4448 } else if (kn->kn_flags & EV_OOBAND) {
4449 kn->kn_data = 0;
4450 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4451 socket_unlock(so, 1);
4452 return (0);
4453 }
4454
4455 if (so->so_error) { /* temporary udp error */
4456 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4457 socket_unlock(so, 1);
4458 return (1);
4459 }
4460
4461 int64_t lowwat = so->so_rcv.sb_lowat;
4462 if (kn->kn_sfflags & NOTE_LOWAT) {
4463 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
4464 lowwat = so->so_rcv.sb_hiwat;
4465 else if (kn->kn_sdata > lowwat)
4466 lowwat = kn->kn_sdata;
4467 }
4468
4469 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4470 socket_unlock(so, 1);
4471
4472 return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
4473}
4474
4475static void
4476filt_sowdetach(struct knote *kn)
4477{
4478 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4479 socket_lock(so, 1);
4480
4481 if (so->so_snd.sb_flags & SB_KNOTE)
4482 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
4483 so->so_snd.sb_flags &= ~SB_KNOTE;
4484 socket_unlock(so, 1);
4485}
4486
4487int
4488so_wait_for_if_feedback(struct socket *so)
4489{
4490 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
4491 (so->so_state & SS_ISCONNECTED)) {
4492 struct inpcb *inp = sotoinpcb(so);
4493 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
4494 return (1);
4495 }
4496 return (0);
4497}
4498
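/*
 * EVFILT_WRITE filter for sockets.  Reports the space remaining in the
 * send buffer and fires once it reaches the (possibly NOTE_LOWAT-
 * adjusted) low-water mark, subject to connection state, pending errors,
 * SOF_NOTSENT_LOWAT and interface-feedback throttling.
 */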
4499/*ARGSUSED*/
4500static int
4501filt_sowrite(struct knote *kn, long hint)
4502{
4503 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4504 int ret = 0;
4505
4506 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4507 socket_lock(so, 1);
4508
4509 kn->kn_data = sbspace(&so->so_snd);
4510 if (so->so_state & SS_CANTSENDMORE) {
4511 kn->kn_flags |= EV_EOF;
4512 kn->kn_fflags = so->so_error;
4513 ret = 1;
4514 goto out;
4515 }
4516 if (so->so_error) { /* temporary udp error */
4517 ret = 1;
4518 goto out;
4519 }
4520 if (((so->so_state & SS_ISCONNECTED) == 0) &&
4521 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4522 ret = 0;
4523 goto out;
4524 }
4525 int64_t lowwat = so->so_snd.sb_lowat;
4526 if (kn->kn_sfflags & NOTE_LOWAT) {
4527 if (kn->kn_sdata > so->so_snd.sb_hiwat)
4528 lowwat = so->so_snd.sb_hiwat;
4529 else if (kn->kn_sdata > lowwat)
4530 lowwat = kn->kn_sdata;
4531 }
4532 if (kn->kn_data >= lowwat) {
4533 if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
4534 ret = tcp_notsent_lowat_check(so);
4535 } else {
4536 ret = 1;
4537 }
4538 }
4539 if (so_wait_for_if_feedback(so))
4540 ret = 0;
4541out:
4542 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4543 socket_unlock(so, 1);
4544 return (ret);
4545}
4546
4547static void
4548filt_sockdetach(struct knote *kn)
4549{
4550 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4551 socket_lock(so, 1);
4552
4553 if ((so->so_flags & SOF_KNOTE) != 0)
4554 if (KNOTE_DETACH(&so->so_klist, kn))
4555 so->so_flags &= ~SOF_KNOTE;
4556 socket_unlock(so, 1);
4557}
4558
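/*
 * EVFILT_SOCK filter.  Translates SO_FILT_HINT_* hints (and current
 * socket state) into the NOTE_* fflags the caller subscribed to, and
 * reports the socket error or connection state through kn_data.
 */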
4559static int
4560filt_sockev(struct knote *kn, long hint)
4561{
4562 int ret = 0, locked = 0;
4563 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4564 long ev_hint = (hint & SO_FILT_HINT_EV);
4565
4566 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
4567 socket_lock(so, 1);
4568 locked = 1;
4569 }
4570
4571 if (ev_hint & SO_FILT_HINT_CONNRESET) {
4572 if (kn->kn_sfflags & NOTE_CONNRESET)
4573 kn->kn_fflags |= NOTE_CONNRESET;
4574 }
4575 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
4576 if (kn->kn_sfflags & NOTE_TIMEOUT)
4577 kn->kn_fflags |= NOTE_TIMEOUT;
4578 }
4579 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
4580 if (kn->kn_sfflags & NOTE_NOSRCADDR)
4581 kn->kn_fflags |= NOTE_NOSRCADDR;
4582 }
4583 if (ev_hint & SO_FILT_HINT_IFDENIED) {
4584 if ((kn->kn_sfflags & NOTE_IFDENIED))
4585 kn->kn_fflags |= NOTE_IFDENIED;
4586 }
4587 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
4588 if (kn->kn_sfflags & NOTE_KEEPALIVE)
4589 kn->kn_fflags |= NOTE_KEEPALIVE;
4590 }
4591 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
4592 if (kn->kn_sfflags & NOTE_ADAPTIVE_WTIMO)
4593 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
4594 }
4595 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
4596 if (kn->kn_sfflags & NOTE_ADAPTIVE_RTIMO)
4597 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
4598 }
4599 if (ev_hint & SO_FILT_HINT_CONNECTED) {
4600 if (kn->kn_sfflags & NOTE_CONNECTED)
4601 kn->kn_fflags |= NOTE_CONNECTED;
4602 }
4603 if (ev_hint & SO_FILT_HINT_DISCONNECTED) {
4604 if (kn->kn_sfflags & NOTE_DISCONNECTED)
4605 kn->kn_fflags |= NOTE_DISCONNECTED;
4606 }
4607 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
4608 if (so->so_proto != NULL &&
4609 (so->so_proto->pr_flags & PR_EVCONNINFO) &&
4610 (kn->kn_sfflags & NOTE_CONNINFO_UPDATED))
4611 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
4612 }
4613
4614 if ((kn->kn_sfflags & NOTE_READCLOSED) &&
4615 (so->so_state & SS_CANTRCVMORE))
4616 kn->kn_fflags |= NOTE_READCLOSED;
4617
4618 if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
4619 (so->so_state & SS_CANTSENDMORE))
4620 kn->kn_fflags |= NOTE_WRITECLOSED;
4621
4622 if ((kn->kn_sfflags & NOTE_SUSPEND) &&
4623 ((ev_hint & SO_FILT_HINT_SUSPEND) ||
4624 (so->so_flags & SOF_SUSPENDED))) {
4625 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
4626 kn->kn_fflags |= NOTE_SUSPEND;
4627 }
4628
4629 if ((kn->kn_sfflags & NOTE_RESUME) &&
4630 ((ev_hint & SO_FILT_HINT_RESUME) ||
4631 (so->so_flags & SOF_SUSPENDED) == 0)) {
4632 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
4633 kn->kn_fflags |= NOTE_RESUME;
4634 }
4635
4636 if (so->so_error != 0) {
4637 ret = 1;
4638 kn->kn_data = so->so_error;
4639 kn->kn_flags |= EV_EOF;
4640 } else {
4641 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
4642 }
4643
4644 if (kn->kn_fflags != 0)
4645 ret = 1;
4646
4647 if (locked)
4648 socket_unlock(so, 1);
4649
4650 return (ret);
4651}
4652
4653void
4654get_sockev_state(struct socket *so, u_int32_t *statep)
4655{
4656 u_int32_t state = *(statep);
4657
4658 if (so->so_state & SS_ISCONNECTED)
4659 state |= SOCKEV_CONNECTED;
4660 else
4661 state &= ~(SOCKEV_CONNECTED);
4662 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
4663 *(statep) = state;
4664}
4665
4666#define SO_LOCK_HISTORY_STR_LEN \
4667 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
4668
4669__private_extern__ const char *
4670solockhistory_nr(struct socket *so)
4671{
4672 size_t n = 0;
4673 int i;
4674 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
4675
4676 bzero(lock_history_str, sizeof (lock_history_str));
4677 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
4678 n += snprintf(lock_history_str + n,
4679 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
4680 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
4681 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
4682 }
4683 return (lock_history_str);
4684}
4685
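/*
 * Lock a socket, taking a reference on it if requested.  Protocols that
 * supply a pr_lock callback get full control; otherwise the per-domain
 * mutex is taken and the caller's return address is recorded in the
 * socket's lock history for debugging.
 */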
4686int
4687socket_lock(struct socket *so, int refcount)
4688{
4689 int error = 0;
4690 void *lr_saved;
4691
4692 lr_saved = __builtin_return_address(0);
4693
4694 if (so->so_proto->pr_lock) {
4695 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
4696 } else {
4697#ifdef MORE_LOCKING_DEBUG
4698 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
4699 LCK_MTX_ASSERT_NOTOWNED);
4700#endif
4701 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
4702 if (refcount)
4703 so->so_usecount++;
4704 so->lock_lr[so->next_lock_lr] = lr_saved;
4705 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
4706 }
4707
4708 return (error);
4709}
4710
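/*
 * Unlock a socket, optionally dropping a reference.  When the last
 * reference is dropped the socket is released via sofreelastref().
 * Protocols that supply a pr_unlock callback handle this themselves.
 */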
4711int
4712socket_unlock(struct socket *so, int refcount)
4713{
4714 int error = 0;
4715 void *lr_saved;
4716 lck_mtx_t *mutex_held;
4717
4718 lr_saved = __builtin_return_address(0);
4719
4720 if (so->so_proto == NULL) {
4721 panic("%s: null so_proto so=%p\n", __func__, so);
4722 /* NOTREACHED */
4723 }
4724
4725 if (so->so_proto->pr_unlock) {
4726 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
4727 } else {
4728 mutex_held = so->so_proto->pr_domain->dom_mtx;
4729#ifdef MORE_LOCKING_DEBUG
4730 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4731#endif
4732 so->unlock_lr[so->next_unlock_lr] = lr_saved;
4733 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
4734
4735 if (refcount) {
4736 if (so->so_usecount <= 0) {
4737 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
4738 "lrh=%s", __func__, so->so_usecount, so,
4739 SOCK_DOM(so), so->so_type,
4740 SOCK_PROTO(so), solockhistory_nr(so));
4741 /* NOTREACHED */
4742 }
4743
4744 so->so_usecount--;
4745 if (so->so_usecount == 0)
4746 sofreelastref(so, 1);
4747 }
4748 lck_mtx_unlock(mutex_held);
4749 }
4750
4751 return (error);
4752}
4753
4754/* Called with socket locked, will unlock socket */
4755void
4756sofree(struct socket *so)
4757{
4758 lck_mtx_t *mutex_held;
4759
4760 if (so->so_proto->pr_getlock != NULL)
4761 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4762 else
4763 mutex_held = so->so_proto->pr_domain->dom_mtx;
4764 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4765
4766 sofreelastref(so, 0);
4767}
4768
4769void
4770soreference(struct socket *so)
4771{
4772 socket_lock(so, 1); /* lock and take one reference on the socket */
4773 socket_unlock(so, 0); /* unlock only */
4774}
4775
4776void
4777sodereference(struct socket *so)
4778{
4779 socket_lock(so, 0);
4780 socket_unlock(so, 1);
4781}
4782
4783/*
4784 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
4785 * possibility of using jumbo clusters. The caller must hold the
4786 * socket lock.
4787 */
4788void
4789somultipages(struct socket *so, boolean_t set)
4790{
4791 if (set)
4792 so->so_flags |= SOF_MULTIPAGES;
4793 else
4794 so->so_flags &= ~SOF_MULTIPAGES;
4795}
4796
4797int
4798so_isdstlocal(struct socket *so)
4799{
4800 struct inpcb *inp = (struct inpcb *)so->so_pcb;
4801
4802 if (SOCK_DOM(so) == PF_INET)
4803 return (inaddr_local(inp->inp_faddr));
4804 else if (SOCK_DOM(so) == PF_INET6)
4805 return (in6addr_local(&inp->in6p_faddr));
4806
4807 return (0);
4808}
4809
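/*
 * Mark a socket as defunct: set SOF_DEFUNCT, stop further data from
 * being queued (SB_DROP) and flush whatever is already buffered.
 * Sockets with SOF_NODEFUNCT are left alone when noforce is set;
 * otherwise they are defuncted by force.
 */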
4810int
4811sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
4812{
4813 struct sockbuf *rcv, *snd;
4814 int err = 0, defunct;
4815
4816 rcv = &so->so_rcv;
4817 snd = &so->so_snd;
4818
4819 defunct = (so->so_flags & SOF_DEFUNCT);
4820 if (defunct) {
4821 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
4822 panic("%s: SB_DROP not set", __func__);
4823 /* NOTREACHED */
4824 }
4825 goto done;
4826 }
4827
4828 if (so->so_flags & SOF_NODEFUNCT) {
4829 if (noforce) {
4830 err = EOPNOTSUPP;
4831 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
4832 "so 0x%llx [%d,%d] is not eligible for defunct "
4833 "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
4834 level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4835 SOCK_DOM(so), SOCK_TYPE(so), err));
4836 return (err);
4837 }
4838 so->so_flags &= ~SOF_NODEFUNCT;
4839 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
4840 "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
4841 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4842 SOCK_DOM(so), SOCK_TYPE(so)));
4843 }
4844
4845 so->so_flags |= SOF_DEFUNCT;
4846
4847 /* Prevent further data from being appended to the socket buffers */
4848 snd->sb_flags |= SB_DROP;
4849 rcv->sb_flags |= SB_DROP;
4850
4851 /* Flush any existing data in the socket buffers */
4852 if (rcv->sb_cc != 0) {
4853 rcv->sb_flags &= ~SB_SEL;
4854 selthreadclear(&rcv->sb_sel);
4855 sbrelease(rcv);
4856 }
4857 if (snd->sb_cc != 0) {
4858 snd->sb_flags &= ~SB_SEL;
4859 selthreadclear(&snd->sb_sel);
4860 sbrelease(snd);
4861 }
4862
4863done:
4864 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
4865 "defunct\n", __func__, proc_selfpid(), proc_pid(p), level,
4866 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
4867 defunct ? "is already" : "marked as"));
4868
4869 return (err);
4870}
4871
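/*
 * Complete the defunct process started by sosetdefunct(): wake up any
 * blocked threads, shut down both data directions, disconnect, flush
 * the socket buffers and set SS_DEFUNCT.  Must be called with
 * SOF_DEFUNCT already set.
 */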
4872int
4873sodefunct(struct proc *p, struct socket *so, int level)
4874{
4875 struct sockbuf *rcv, *snd;
4876
4877 if (!(so->so_flags & SOF_DEFUNCT)) {
4878 panic("%s improperly called", __func__);
4879 /* NOTREACHED */
4880 }
4881 if (so->so_state & SS_DEFUNCT)
4882 goto done;
4883
4884 rcv = &so->so_rcv;
4885 snd = &so->so_snd;
4886
4887 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
4888 char s[MAX_IPv6_STR_LEN];
4889 char d[MAX_IPv6_STR_LEN];
4890 struct inpcb *inp = sotoinpcb(so);
4891
4892 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
4893 "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
4894 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
4895 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4896 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
4897 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
4898 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
4899 s, sizeof (s)), ntohs(inp->in6p_lport),
4900 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
4901 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
4902 d, sizeof (d)), ntohs(inp->in6p_fport),
4903 (uint32_t)rcv->sb_sel.si_flags,
4904 (uint32_t)snd->sb_sel.si_flags,
4905 rcv->sb_flags, snd->sb_flags));
4906 } else {
4907 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
4908 "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
4909 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
4910 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4911 SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
4912 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
4913 snd->sb_flags));
4914 }
4915
4916 /*
4917 * Unwedge threads blocked on sbwait() and sb_lock().
4918 */
4919 sbwakeup(rcv);
4920 sbwakeup(snd);
4921
4922 if (rcv->sb_flags & SB_LOCK)
4923 sbunlock(rcv, TRUE); /* keep socket locked */
4924 if (snd->sb_flags & SB_LOCK)
4925 sbunlock(snd, TRUE); /* keep socket locked */
4926
4927 /*
4928 * Flush the buffers and disconnect. We explicitly call shutdown
4929 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
4930 * states are set for the socket. This would also flush out data
4931 * hanging off the receive list of this socket.
4932 */
4933 (void) soshutdownlock(so, SHUT_RD);
4934 (void) soshutdownlock(so, SHUT_WR);
4935 (void) sodisconnectlocked(so);
4936
4937 /*
4938 * Explicitly handle connectionless-protocol disconnection
4939 * and release any remaining data in the socket buffers.
4940 */
4941 if (!(so->so_state & SS_ISDISCONNECTED))
4942 (void) soisdisconnected(so);
4943
4944 if (so->so_error == 0)
4945 so->so_error = EBADF;
4946
4947 if (rcv->sb_cc != 0) {
4948 rcv->sb_flags &= ~SB_SEL;
4949 selthreadclear(&rcv->sb_sel);
4950 sbrelease(rcv);
4951 }
4952 if (snd->sb_cc != 0) {
4953 snd->sb_flags &= ~SB_SEL;
4954 selthreadclear(&snd->sb_sel);
4955 sbrelease(snd);
4956 }
4957 so->so_state |= SS_DEFUNCT;
4958
4959done:
4960 return (0);
4961}
4962
4963__private_extern__ int
4964so_set_recv_anyif(struct socket *so, int optval)
4965{
4966 int ret = 0;
4967
4968#if INET6
4969 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
4970#else
4971 if (SOCK_DOM(so) == PF_INET) {
4972#endif /* !INET6 */
4973 if (optval)
4974 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
4975 else
4976 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
4977 }
4978
4979 return (ret);
4980}
4981
4982__private_extern__ int
4983so_get_recv_anyif(struct socket *so)
4984{
4985 int ret = 0;
4986
4987#if INET6
4988 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
4989#else
4990 if (SOCK_DOM(so) == PF_INET) {
4991#endif /* !INET6 */
4992 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
4993 }
4994
4995 return (ret);
4996}
4997
4998int
4999so_set_restrictions(struct socket *so, uint32_t vals)
5000{
5001 int nocell_old, nocell_new;
5002 int ret = 0;
5003
5004 /*
5005 * Deny-type restrictions are trapdoors; once set they cannot be
5006 * unset for the lifetime of the socket. This allows them to be
5007 * issued by a framework on behalf of the application without
5008 * having to worry that they can be undone.
5009 *
5010 * Note here that socket-level restrictions override any protocol
5011 * level restrictions. For instance, the SO_RESTRICT_DENY_CELLULAR
5012 * restriction issued on the socket has a higher precedence
5013 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
5014 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
5015 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
5016 */
5017 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
5018 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
5019 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
5020 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
5021
5022 /* other than deny cellular, there's nothing more to do */
5023 if ((nocell_new - nocell_old) == 0)
5024 return (ret);
5025
5026 /* we can only set, not clear restrictions */
5027 VERIFY((nocell_new - nocell_old) > 0);
5028
5029#if INET6
5030 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5031#else
5032 if (SOCK_DOM(so) == PF_INET) {
5033#endif /* !INET6 */
5034 /* if deny cellular is now set, do what's needed for INPCB */
5035 inp_set_nocellular(sotoinpcb(so));
5036 }
5037
5038 return (ret);
5039}
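
/*
 * Illustrative userland use of the restriction trapdoor described above,
 * assuming the private SO_RESTRICTIONS socket option is available to the
 * caller (documentation only; not part of the kernel build).
 *
 *	uint32_t r = SO_RESTRICT_DENY_CELLULAR;
 *	(void) setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS, &r, sizeof (r));
 */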
5040
5041uint32_t
5042so_get_restrictions(struct socket *so)
5043{
5044 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
5045 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
5046}
5047
5048struct sockaddr_entry *
5049sockaddrentry_alloc(int how)
5050{
5051 struct sockaddr_entry *se;
5052
5053 se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
5054 if (se != NULL)
5055 bzero(se, se_zone_size);
5056
5057 return (se);
5058}
5059
5060void
5061sockaddrentry_free(struct sockaddr_entry *se)
5062{
5063 if (se->se_addr != NULL) {
5064 FREE(se->se_addr, M_SONAME);
5065 se->se_addr = NULL;
5066 }
5067 zfree(se_zone, se);
5068}
5069
5070struct sockaddr_entry *
5071sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
5072{
5073 struct sockaddr_entry *dst_se;
5074
5075 dst_se = sockaddrentry_alloc(how);
5076 if (dst_se != NULL) {
5077 int len = src_se->se_addr->sa_len;
5078
5079 MALLOC(dst_se->se_addr, struct sockaddr *,
5080 len, M_SONAME, how | M_ZERO);
5081 if (dst_se->se_addr != NULL) {
5082 bcopy(src_se->se_addr, dst_se->se_addr, len);
5083 } else {
5084 sockaddrentry_free(dst_se);
5085 dst_se = NULL;
5086 }
5087 }
5088
5089 return (dst_se);
5090}
5091
5092struct sockaddr_list *
5093sockaddrlist_alloc(int how)
5094{
5095 struct sockaddr_list *sl;
5096
5097 sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
5098 if (sl != NULL) {
5099 bzero(sl, sl_zone_size);
5100 TAILQ_INIT(&sl->sl_head);
5101 }
5102 return (sl);
5103}
5104
5105void
5106sockaddrlist_free(struct sockaddr_list *sl)
5107{
5108 struct sockaddr_entry *se, *tse;
5109
5110 TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
5111 sockaddrlist_remove(sl, se);
5112 sockaddrentry_free(se);
5113 }
5114 VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
5115 zfree(sl_zone, sl);
5116}
5117
5118void
5119sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
5120{
5121 VERIFY(!(se->se_flags & SEF_ATTACHED));
5122 se->se_flags |= SEF_ATTACHED;
5123 TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
5124 sl->sl_cnt++;
5125 VERIFY(sl->sl_cnt != 0);
5126}
5127
5128void
5129sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
5130{
5131 VERIFY(se->se_flags & SEF_ATTACHED);
5132 se->se_flags &= ~SEF_ATTACHED;
5133 VERIFY(sl->sl_cnt != 0);
5134 sl->sl_cnt--;
5135 TAILQ_REMOVE(&sl->sl_head, se, se_link);
5136}
5137
5138struct sockaddr_list *
5139sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
5140{
5141 struct sockaddr_entry *src_se, *tse;
5142 struct sockaddr_list *dst_sl;
5143
5144 dst_sl = sockaddrlist_alloc(how);
5145 if (dst_sl == NULL)
5146 return (NULL);
5147
5148 TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
5149 struct sockaddr_entry *dst_se;
5150
5151 if (src_se->se_addr == NULL)
5152 continue;
5153
5154 dst_se = sockaddrentry_dup(src_se, how);
5155 if (dst_se == NULL) {
5156 sockaddrlist_free(dst_sl);
5157 return (NULL);
5158 }
5159
5160 sockaddrlist_insert(dst_sl, dst_se);
5161 }
5162 VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
5163
5164 return (dst_sl);
5165}
5166
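/*
 * Associate an effective ("delegated") pid with the socket so that
 * network policy can attribute traffic to the process on whose behalf
 * the socket operates.  Callers may need the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege; delegating the socket
 * to the issuing process itself clears the association.
 */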
5167int
5168so_set_effective_pid(struct socket *so, int epid, struct proc *p)
5169{
5170 struct proc *ep = PROC_NULL;
5171 int error = 0;
5172
5173 /* pid 0 is reserved for kernel */
5174 if (epid == 0) {
5175 error = EINVAL;
5176 goto done;
5177 }
5178
5179 /*
5180 * If this is an in-kernel socket, prevent its delegate
5181 * association from changing unless the socket option is
5182 * coming from within the kernel itself.
5183 */
5184 if (so->last_pid == 0 && p != kernproc) {
5185 error = EACCES;
5186 goto done;
5187 }
5188
5189 /*
5190 * If this is issued by a process that's recorded as the
5191 * real owner of the socket, or if the pid is the same as
5192 * the process's own pid, then proceed. Otherwise ensure
5193 * that the issuing process has the necessary privileges.
5194 */
5195 if (epid != so->last_pid || epid != proc_pid(p)) {
5196 if ((error = priv_check_cred(kauth_cred_get(),
5197 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
5198 error = EACCES;
5199 goto done;
5200 }
5201 }
5202
5203 /* Find the process that corresponds to the effective pid */
5204 if ((ep = proc_find(epid)) == PROC_NULL) {
5205 error = ESRCH;
5206 goto done;
5207 }
5208
5209 /*
5210 * If a process tries to delegate the socket to itself, then
5211 * there's really nothing to do; treat it as a way for the
5212 * delegate association to be cleared. Note that we check
5213 * the passed-in proc rather than calling proc_selfpid(),
5214 * as we need to check the process issuing the socket option
5215 * which could be kernproc. Given that we don't allow 0 for
5216 * effective pid, it means that a delegated in-kernel socket
5217 * stays delegated during its lifetime (which is probably OK.)
5218 */
5219 if (epid == proc_pid(p)) {
5220 so->so_flags &= ~SOF_DELEGATED;
5221 so->e_upid = 0;
5222 so->e_pid = 0;
5223 uuid_clear(so->e_uuid);
5224 } else {
5225 so->so_flags |= SOF_DELEGATED;
5226 so->e_upid = proc_uniqueid(ep);
5227 so->e_pid = proc_pid(ep);
5228 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
5229 }
5230
5231done:
5232 if (error == 0 && net_io_policy_log) {
5233 uuid_string_t buf;
5234
5235 uuid_unparse(so->e_uuid, buf);
5236 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
5237 "euuid %s%s\n", __func__, proc_name_address(p),
5238 proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5239 SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf,
5240 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
5241 } else if (error != 0 && net_io_policy_log) {
5242 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
5243 "ERROR (%d)\n", __func__, proc_name_address(p),
5244 proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5245 SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" :
5246 proc_name_address(ep), error);
5247 }
5248
5249 if (ep != PROC_NULL)
5250 proc_rele(ep);
5251
5252 return (error);
5253}
5254
5255int
5256so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
5257{
5258 uuid_string_t buf;
5259 uuid_t uuid;
5260 int error = 0;
5261
5262 /* UUID must not be all-zeroes (reserved for kernel) */
5263 if (uuid_is_null(euuid)) {
5264 error = EINVAL;
5265 goto done;
5266 }
5267
5268 /*
5269 * If this is an in-kernel socket, prevent its delegate
5270 * association from changing unless the socket option is
5271 * coming from within the kernel itself.
5272 */
5273 if (so->last_pid == 0 && p != kernproc) {
5274 error = EACCES;
5275 goto done;
5276 }
5277
5278 /* Get the UUID of the issuing process */
5279 proc_getexecutableuuid(p, uuid, sizeof (uuid));
5280
5281 /*
5282 * If this is issued by a process that's recorded as the
5283 * real owner of the socket, or if the uuid is the same as
5284 * the process's own uuid, then proceed. Otherwise ensure
5285 * that the issuing process has the necessary privileges.
5286 */
5287 if (uuid_compare(euuid, so->last_uuid) != 0 ||
5288 uuid_compare(euuid, uuid) != 0) {
5289 if ((error = priv_check_cred(kauth_cred_get(),
5290 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
5291 error = EACCES;
5292 goto done;
5293 }
5294 }
5295
5296 /*
5297 * If a process tries to delegate the socket to itself, then
5298 * there's really nothing to do; treat it as a way for the
5299 * delegate association to be cleared. Note that we check
5300 * the uuid of the passed-in proc rather than that of the
5301 * current process, as we need to check the process issuing
5302 * the socket option which could be kernproc itself. Given
5303 * that we don't allow 0 for effective uuid, it means that
5304 * a delegated in-kernel socket stays delegated during its
5305 * lifetime (which is okay.)
5306 */
5307 if (uuid_compare(euuid, uuid) == 0) {
5308 so->so_flags &= ~SOF_DELEGATED;
5309 so->e_upid = 0;
5310 so->e_pid = 0;
5311 uuid_clear(so->e_uuid);
5312 } else {
5313 so->so_flags |= SOF_DELEGATED;
5314 /*
5315 * Unlike so_set_effective_pid(), we only have the UUID
5316 * here and the process ID is not known. Inherit the
5317 * real {pid,upid} of the socket.
5318 */
5319 so->e_upid = so->last_upid;
5320 so->e_pid = so->last_pid;
5321 uuid_copy(so->e_uuid, euuid);
5322 }
5323
5324done:
5325 if (error == 0 && net_io_policy_log) {
5326 uuid_unparse(so->e_uuid, buf);
5327 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
5328 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
5329 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5330 SOCK_TYPE(so), so->e_pid, buf,
5331 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
5332 } else if (error != 0 && net_io_policy_log) {
5333 uuid_unparse(euuid, buf);
5334 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
5335 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
5336 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5337 SOCK_TYPE(so), buf, error);
5338 }
5339
5340 return (error);
5341}
5342
5343void
5344netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
5345 uint32_t ev_datalen)
5346{
5347 struct kev_msg ev_msg;
5348
5349 /*
5350 * A netpolicy event always starts with a netpolicy_event_data
5351 * structure, but the caller can provide a longer event
5352 * structure to post, depending on the event code.
5353 */
5354 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
5355
5356 bzero(&ev_msg, sizeof (ev_msg));
5357 ev_msg.vendor_code = KEV_VENDOR_APPLE;
5358 ev_msg.kev_class = KEV_NETWORK_CLASS;
5359 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
5360 ev_msg.event_code = ev_code;
5361
5362 ev_msg.dv[0].data_ptr = ev_data;
5363 ev_msg.dv[0].data_length = ev_datalen;
5364
5365 kev_post_msg(&ev_msg);
5366}