1/*
2 * Copyright (c) 1998-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/filedesc.h>
73#include <sys/proc.h>
74#include <sys/proc_internal.h>
75#include <sys/kauth.h>
76#include <sys/file_internal.h>
77#include <sys/fcntl.h>
78#include <sys/malloc.h>
79#include <sys/mbuf.h>
80#include <sys/domain.h>
81#include <sys/kernel.h>
82#include <sys/event.h>
83#include <sys/poll.h>
84#include <sys/protosw.h>
85#include <sys/socket.h>
86#include <sys/socketvar.h>
87#include <sys/resourcevar.h>
88#include <sys/signalvar.h>
89#include <sys/sysctl.h>
90#include <sys/syslog.h>
91#include <sys/uio.h>
92#include <sys/ev.h>
93#include <sys/kdebug.h>
94#include <sys/un.h>
95#include <sys/user.h>
96#include <sys/priv.h>
97#include <sys/kern_event.h>
98#include <net/route.h>
99#include <net/init.h>
100#include <net/ntstat.h>
101#include <netinet/in.h>
102#include <netinet/in_pcb.h>
103#include <netinet/ip6.h>
104#include <netinet6/ip6_var.h>
105#include <netinet/flow_divert.h>
106#include <kern/zalloc.h>
107#include <kern/locks.h>
108#include <machine/limits.h>
109#include <libkern/OSAtomic.h>
110#include <pexpert/pexpert.h>
111#include <kern/assert.h>
112#include <kern/task.h>
113#include <sys/kpi_mbuf.h>
114#include <sys/mcache.h>
115
116#if CONFIG_MACF
117#include <security/mac.h>
118#include <security/mac_framework.h>
119#endif /* MAC */
120
121#if MULTIPATH
122#include <netinet/mp_pcb.h>
123#endif /* MULTIPATH */
124
125/* TODO: this should be in a header file somewhere */
126extern char *proc_name_address(void *p);
127
128static u_int32_t so_cache_hw; /* High water mark for socache */
129static u_int32_t so_cache_timeouts; /* number of timeouts */
130static u_int32_t so_cache_max_freed; /* max freed per timeout */
131static u_int32_t cached_sock_count = 0;
132STAILQ_HEAD(, socket) so_cache_head;
133int max_cached_sock_count = MAX_CACHED_SOCKETS;
134static u_int32_t so_cache_time;
135static int socketinit_done;
136static struct zone *so_cache_zone;
137
138static lck_grp_t *so_cache_mtx_grp;
139static lck_attr_t *so_cache_mtx_attr;
140static lck_grp_attr_t *so_cache_mtx_grp_attr;
141static lck_mtx_t *so_cache_mtx;
142
143#include <machine/limits.h>
144
145static void filt_sordetach(struct knote *kn);
146static int filt_soread(struct knote *kn, long hint);
147static void filt_sowdetach(struct knote *kn);
148static int filt_sowrite(struct knote *kn, long hint);
149static void filt_sockdetach(struct knote *kn);
150static int filt_sockev(struct knote *kn, long hint);
151
152static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
153static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
154
155static struct filterops soread_filtops = {
156 .f_isfd = 1,
157 .f_detach = filt_sordetach,
158 .f_event = filt_soread,
159};
160
161static struct filterops sowrite_filtops = {
162 .f_isfd = 1,
163 .f_detach = filt_sowdetach,
164 .f_event = filt_sowrite,
165};
166
167static struct filterops sock_filtops = {
168 .f_isfd = 1,
169 .f_detach = filt_sockdetach,
170 .f_event = filt_sockev,
171};
172
173#define EVEN_MORE_LOCKING_DEBUG 0
174int socket_debug = 0;
175static int socket_zone = M_SOCKET;
176so_gen_t so_gencnt; /* generation count for sockets */
177
178MALLOC_DEFINE(M_SONAME, "soname", "socket name");
179MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
180
181#define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
182#define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
183#define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
184#define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
185#define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
186#define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
187#define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
188
189#define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
190
191SYSCTL_DECL(_kern_ipc);
192
193int somaxconn = SOMAXCONN;
194SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
195 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
196
197/* Should we get a maximum also ??? */
198static int sosendmaxchain = 65536;
199static int sosendminchain = 16384;
200static int sorecvmincopy = 16384;
201SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
202 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
203SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
204 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
205
206/*
207 * Set to enable jumbo clusters (if available) for large writes when
208 * the socket is marked with SOF_MULTIPAGES; see below.
209 */
210int sosendjcl = 1;
211SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
212 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
213
214/*
215 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
216 * writes on the socket for all protocols on any network interfaces,
217 * depending upon sosendjcl above. Be extra careful when setting this
218 * to 1, because sending down packets that cross physical pages down to
219 * broken drivers (those that falsely assume that the physical pages
220 * are contiguous) might lead to system panics or silent data corruption.
221 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
222 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
223 * capable. Set this to 1 only for testing/debugging purposes.
224 */
225int sosendjcl_ignore_capab = 0;
226SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
227 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
228
229int sodefunctlog = 0;
230SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
231 &sodefunctlog, 0, "");
232
233int sothrottlelog = 0;
234SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
235 &sothrottlelog, 0, "");
236
237int sorestrictrecv = 1;
238SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
239 &sorestrictrecv, 0, "Enable inbound interface restrictions");
240
241/*
242 * Socket operation routines.
243 * These routines are called by the routines in
244 * sys_socket.c or from a system process, and
245 * implement the semantics of socket operations by
246 * switching out to the protocol specific routines.
247 */
248
249/* sys_generic.c */
250extern void postevent(struct socket *, struct sockbuf *, int);
251extern void evsofree(struct socket *);
252extern int tcp_notsent_lowat_check(struct socket *so);
253extern struct inpcbinfo tcbinfo;
254
255/* TODO: these should be in header file */
256extern int get_inpcb_str_size(void);
257extern int get_tcp_str_size(void);
258
259static unsigned int sl_zone_size; /* size of sockaddr_list */
260static struct zone *sl_zone; /* zone for sockaddr_list */
261
262static unsigned int se_zone_size; /* size of sockaddr_entry */
263static struct zone *se_zone; /* zone for sockaddr_entry */
264
265vm_size_t so_cache_zone_element_size;
266
267static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, user_ssize_t *);
268static void cached_sock_alloc(struct socket **, int);
269static void cached_sock_free(struct socket *);
270
271/*
272 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
273 * setting the DSCP code on the packet based on the service class; see
274 * <rdar://problem/11277343> for details.
275 */
276__private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
277SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
278 &sotcdb, 0, "");
279
280void
281socketinit(void)
282{
283 if (socketinit_done) {
284 printf("socketinit: already called...\n");
285 return;
286 }
287 socketinit_done = 1;
288
289 PE_parse_boot_argn("socket_debug", &socket_debug,
290 sizeof (socket_debug));
291
292 /*
293 * allocate lock group attribute and group for socket cache mutex
294 */
295 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
296 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
297 so_cache_mtx_grp_attr);
298
299 /*
300 * allocate the lock attribute for socket cache mutex
301 */
302 so_cache_mtx_attr = lck_attr_alloc_init();
303
304 /* cached sockets mutex */
305 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
306 if (so_cache_mtx == NULL) {
307 panic("%s: unable to allocate so_cache_mtx\n", __func__);
308 /* NOTREACHED */
309 }
310 STAILQ_INIT(&so_cache_head);
311
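/*
 * Each cached element is a single block large enough for the socket
 * plus shadow inpcb and tcpcb areas; the extra 4 bytes per structure
 * leave slack for the longword alignment done in cached_sock_alloc().
 */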
312 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
313 + get_inpcb_str_size() + 4 + get_tcp_str_size());
314
315 so_cache_zone = zinit(so_cache_zone_element_size,
316 (120000 * so_cache_zone_element_size), 8192, "socache zone");
317 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
318 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
319
320 sl_zone_size = sizeof (struct sockaddr_list);
321 if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
322 "sockaddr_list")) == NULL) {
323 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
324 /* NOTREACHED */
325 }
326 zone_change(sl_zone, Z_CALLERACCT, FALSE);
327 zone_change(sl_zone, Z_EXPAND, TRUE);
328
329 se_zone_size = sizeof (struct sockaddr_entry);
330 if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
331 "sockaddr_entry")) == NULL) {
332 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
333 /* NOTREACHED */
334 }
335 zone_change(se_zone, Z_CALLERACCT, FALSE);
336 zone_change(se_zone, Z_EXPAND, TRUE);
337
338
339 in_pcbinit();
340 sflt_init();
341 socket_tclass_init();
342#if MULTIPATH
343 mp_pcbinit();
344#endif /* MULTIPATH */
345}
346
347static void
348cached_sock_alloc(struct socket **so, int waitok)
349{
350 caddr_t temp;
351 uintptr_t offset;
352
353 lck_mtx_lock(so_cache_mtx);
354
355 if (!STAILQ_EMPTY(&so_cache_head)) {
356 VERIFY(cached_sock_count > 0);
357
358 *so = STAILQ_FIRST(&so_cache_head);
359 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
360 STAILQ_NEXT((*so), so_cache_ent) = NULL;
361
362 cached_sock_count--;
363 lck_mtx_unlock(so_cache_mtx);
364
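/*
 * Reusing a cached socket: zero the socket structure but preserve
 * its saved pcb pointer, which still refers to the pcb storage
 * embedded in the same cache block.
 */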
365 temp = (*so)->so_saved_pcb;
366 bzero((caddr_t)*so, sizeof (struct socket));
367
368 (*so)->so_saved_pcb = temp;
369 } else {
370
371 lck_mtx_unlock(so_cache_mtx);
372
373 if (waitok)
374 *so = (struct socket *)zalloc(so_cache_zone);
375 else
376 *so = (struct socket *)zalloc_noblock(so_cache_zone);
377
378 if (*so == NULL)
379 return;
380
381 bzero((caddr_t)*so, sizeof (struct socket));
382
383 /*
384 * Define offsets for extra structures into our
385 * single block of memory. Align extra structures
386 * on longword boundaries.
387 */
388
389 offset = (uintptr_t)*so;
390 offset += sizeof (struct socket);
391
392 offset = ALIGN(offset);
393
394 (*so)->so_saved_pcb = (caddr_t)offset;
395 offset += get_inpcb_str_size();
396
397 offset = ALIGN(offset);
398
399 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
400 (caddr_t)offset;
401 }
402
403 (*so)->cached_in_sock_layer = true;
404}
405
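/*
 * Return a socket to the cache if there is room; otherwise free it
 * back to the zone. Cached entries are timestamped so that
 * so_cache_timer() can reap the stale ones.
 */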
406static void
407cached_sock_free(struct socket *so)
408{
409
410 lck_mtx_lock(so_cache_mtx);
411
412 so_cache_time = net_uptime();
413 if (++cached_sock_count > max_cached_sock_count) {
414 --cached_sock_count;
415 lck_mtx_unlock(so_cache_mtx);
416 zfree(so_cache_zone, so);
417 } else {
418 if (so_cache_hw < cached_sock_count)
419 so_cache_hw = cached_sock_count;
420
421 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
422
423 so->cache_timestamp = so_cache_time;
424 lck_mtx_unlock(so_cache_mtx);
425 }
426}
427
428void
429so_update_last_owner_locked(struct socket *so, proc_t self)
430{
431 if (so->last_pid != 0) {
432 /*
433 * last_pid and last_upid should remain zero for sockets
434 * created using sock_socket. The check above achieves that
435 */
436 if (self == PROC_NULL)
437 self = current_proc();
438
439 if (so->last_upid != proc_uniqueid(self) ||
440 so->last_pid != proc_pid(self)) {
441 so->last_upid = proc_uniqueid(self);
442 so->last_pid = proc_pid(self);
443 proc_getexecutableuuid(self, so->last_uuid,
444 sizeof (so->last_uuid));
445 }
446 }
447}
448
449void
450so_update_policy(struct socket *so)
451{
452 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
453 (void) inp_update_policy(sotoinpcb(so));
454}
455
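/*
 * Periodic reaper for the socket cache: frees cached sockets that
 * have been idle for at least SO_CACHE_TIME_LIMIT, in batches of at
 * most SO_CACHE_MAX_FREE_BATCH, and returns TRUE when entries remain
 * so that the caller reschedules the timer.
 */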
456boolean_t
457so_cache_timer(void)
458{
459 struct socket *p;
460 int n_freed = 0;
461 boolean_t rc = FALSE;
462
463 lck_mtx_lock(so_cache_mtx);
464 so_cache_timeouts++;
465 so_cache_time = net_uptime();
466
467 while (!STAILQ_EMPTY(&so_cache_head)) {
468 VERIFY(cached_sock_count > 0);
469 p = STAILQ_FIRST(&so_cache_head);
470 if ((so_cache_time - p->cache_timestamp) <
471 SO_CACHE_TIME_LIMIT)
472 break;
473
474 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
475 --cached_sock_count;
476
477 zfree(so_cache_zone, p);
478
479 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
480 so_cache_max_freed++;
481 break;
482 }
483 }
484
485 /* Schedule again if there is more to cleanup */
486 if (!STAILQ_EMPTY(&so_cache_head))
487 rc = TRUE;
488
489 lck_mtx_unlock(so_cache_mtx);
490 return (rc);
491}
492
493/*
494 * Get a socket structure from our zone, and initialize it.
495 * We don't implement `waitok' yet (see comments in uipc_domain.c).
496 * Note that it would probably be better to allocate socket
497 * and PCB at the same time, but I'm not convinced that all
498 * the protocols can be easily modified to do this.
499 */
500struct socket *
501soalloc(int waitok, int dom, int type)
502{
503 struct socket *so;
504
505 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
506 cached_sock_alloc(&so, waitok);
507 } else {
508 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
509 M_WAITOK);
510 if (so != NULL)
511 bzero(so, sizeof (*so));
512 }
513 if (so != NULL) {
514 so->so_gencnt = ++so_gencnt;
515 so->so_zone = socket_zone;
516#if CONFIG_MACF_SOCKET
517 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
518 if (mac_socket_label_init(so, !waitok) != 0) {
519 sodealloc(so);
520 return (NULL);
521 }
522#endif /* MAC_SOCKET */
523 }
524
525 return (so);
526}
527
528int
529socreate_internal(int dom, struct socket **aso, int type, int proto,
530 struct proc *p, uint32_t flags, struct proc *ep)
531{
532 struct protosw *prp;
533 struct socket *so;
534 int error = 0;
535
536#if TCPDEBUG
537 extern int tcpconsdebug;
538#endif
539
540 VERIFY(aso != NULL);
541 *aso = NULL;
542
543 if (proto != 0)
544 prp = pffindproto(dom, proto, type);
545 else
546 prp = pffindtype(dom, type);
547
548 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
549 if (pffinddomain(dom) == NULL)
550 return (EAFNOSUPPORT);
551 if (proto != 0) {
552 if (pffindprotonotype(dom, proto) != NULL)
553 return (EPROTOTYPE);
554 }
555 return (EPROTONOSUPPORT);
556 }
557 if (prp->pr_type != type)
558 return (EPROTOTYPE);
559 so = soalloc(1, dom, type);
560 if (so == NULL)
561 return (ENOBUFS);
562
563 if (flags & SOCF_ASYNC)
564 so->so_state |= SS_NBIO;
565#if MULTIPATH
566 if (flags & SOCF_MP_SUBFLOW) {
567 /*
568 * A multipath subflow socket is used internally in the kernel,
 569 * therefore it does not have a file descriptor associated by
570 * default.
571 */
572 so->so_state |= SS_NOFDREF;
573 so->so_flags |= SOF_MP_SUBFLOW;
574 }
575#endif /* MULTIPATH */
576
577 TAILQ_INIT(&so->so_incomp);
578 TAILQ_INIT(&so->so_comp);
579 so->so_type = type;
580 so->last_upid = proc_uniqueid(p);
581 so->last_pid = proc_pid(p);
582 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
583
584 if (ep != PROC_NULL && ep != p) {
585 so->e_upid = proc_uniqueid(ep);
586 so->e_pid = proc_pid(ep);
587 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
588 so->so_flags |= SOF_DELEGATED;
589 }
590
591 so->so_cred = kauth_cred_proc_ref(p);
592 if (!suser(kauth_cred_get(), NULL))
593 so->so_state |= SS_PRIV;
594
595 so->so_proto = prp;
596 so->so_rcv.sb_flags |= SB_RECV;
597 so->so_rcv.sb_so = so->so_snd.sb_so = so;
598 so->next_lock_lr = 0;
599 so->next_unlock_lr = 0;
600
601#if CONFIG_MACF_SOCKET
602 mac_socket_label_associate(kauth_cred_get(), so);
603#endif /* MAC_SOCKET */
604
605 /*
 606 * Attachment will create the per-pcb lock if necessary and
 607 * increase the refcount for creation; make sure it's done before
 608 * the socket is inserted in the lists.
609 */
610 so->so_usecount++;
611
612 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
613 if (error != 0) {
614 /*
615 * Warning:
616 * If so_pcb is not zero, the socket will be leaked,
 617 * so the protocol attachment handler must be coded carefully
618 */
619 so->so_state |= SS_NOFDREF;
620 so->so_usecount--;
621 sofreelastref(so, 1); /* will deallocate the socket */
622 return (error);
623 }
624
625 atomic_add_32(&prp->pr_domain->dom_refs, 1);
626 TAILQ_INIT(&so->so_evlist);
627
628 /* Attach socket filters for this protocol */
629 sflt_initsock(so);
630#if TCPDEBUG
631 if (tcpconsdebug == 2)
632 so->so_options |= SO_DEBUG;
633#endif
634 so_set_default_traffic_class(so);
635
636 /*
637 * If this thread or task is marked to create backgrounded sockets,
638 * mark the socket as background.
639 */
640 if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
641 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
642 so->so_background_thread = current_thread();
643 }
644
645 switch (dom) {
646 /*
647 * Don't mark Unix domain, system or multipath sockets as
648 * eligible for defunct by default.
649 */
650 case PF_LOCAL:
651 case PF_SYSTEM:
652 case PF_MULTIPATH:
653 so->so_flags |= SOF_NODEFUNCT;
654 break;
655 default:
656 break;
657 }
658
659 *aso = so;
660
661 return (0);
662}
663
664/*
665 * Returns: 0 Success
666 * EAFNOSUPPORT
667 * EPROTOTYPE
668 * EPROTONOSUPPORT
669 * ENOBUFS
670 * <pru_attach>:ENOBUFS[AF_UNIX]
671 * <pru_attach>:ENOBUFS[TCP]
672 * <pru_attach>:ENOMEM[TCP]
673 * <pru_attach>:??? [other protocol families, IPSEC]
674 */
675int
676socreate(int dom, struct socket **aso, int type, int proto)
677{
678 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
679 PROC_NULL));
680}
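
/*
 * Illustrative (non-authoritative) in-kernel usage of the routines in
 * this file. Kernel clients normally go through the sock_socket() KPI
 * rather than calling these directly; "sin" below stands for a locally
 * prepared sockaddr_in and is only an example:
 *
 *	struct socket *so = NULL;
 *	int error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		error = sobindlock(so, (struct sockaddr *)&sin, 1);
 *		if (error == 0)
 *			error = solisten(so, 5);
 *		soclose(so);
 *	}
 */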
681
682int
683socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
684{
685 int error = 0;
686 struct proc *ep = PROC_NULL;
687
688 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
689 error = ESRCH;
690 goto done;
691 }
692
693 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
694
695 /*
696 * It might not be wise to hold the proc reference when calling
697 * socreate_internal since it calls soalloc with M_WAITOK
698 */
699done:
700 if (ep != PROC_NULL)
701 proc_rele(ep);
702
703 return (error);
704}
705
706/*
707 * Returns: 0 Success
708 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
709 * <pru_bind>:EAFNOSUPPORT Address family not supported
710 * <pru_bind>:EADDRNOTAVAIL Address not available.
711 * <pru_bind>:EINVAL Invalid argument
712 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
713 * <pru_bind>:EACCES Permission denied
714 * <pru_bind>:EADDRINUSE Address in use
715 * <pru_bind>:EAGAIN Resource unavailable, try again
716 * <pru_bind>:EPERM Operation not permitted
717 * <pru_bind>:???
718 * <sf_bind>:???
719 *
720 * Notes: It's not possible to fully enumerate the return codes above,
721 * since socket filter authors and protocol family authors may
722 * not choose to limit their error returns to those listed, even
723 * though this may result in some software operating incorrectly.
724 *
725 * The error codes which are enumerated above are those known to
726 * be returned by the tcp_usr_bind function supplied.
727 */
728int
729sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
730{
731 struct proc *p = current_proc();
732 int error = 0;
733
734 if (dolock)
735 socket_lock(so, 1);
736 VERIFY(so->so_usecount > 1);
737
738 so_update_last_owner_locked(so, p);
739 so_update_policy(so);
740
741 /*
742 * If this is a bind request on a socket that has been marked
743 * as inactive, reject it now before we go any further.
744 */
745 if (so->so_flags & SOF_DEFUNCT) {
746 error = EINVAL;
747 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
748 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
749 SOCK_DOM(so), SOCK_TYPE(so), error));
750 goto out;
751 }
752
753 /* Socket filter */
754 error = sflt_bind(so, nam);
755
756 if (error == 0)
757 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
758out:
759 if (dolock)
760 socket_unlock(so, 1);
761
762 if (error == EJUSTRETURN)
763 error = 0;
764
765 return (error);
766}
767
768void
769sodealloc(struct socket *so)
770{
771 kauth_cred_unref(&so->so_cred);
772
773 /* Remove any filters */
774 sflt_termsock(so);
775
776 /* Delete the state allocated for msg queues on a socket */
777 if (so->so_flags & SOF_ENABLE_MSGS) {
778 FREE(so->so_msg_state, M_TEMP);
779 so->so_msg_state = NULL;
780 }
781 VERIFY(so->so_msg_state == NULL);
782
783 so->so_gencnt = ++so_gencnt;
784
785#if CONFIG_MACF_SOCKET
786 mac_socket_label_destroy(so);
787#endif /* MAC_SOCKET */
788
789 if (so->cached_in_sock_layer) {
790 cached_sock_free(so);
791 } else {
792 FREE_ZONE(so, sizeof (*so), so->so_zone);
793 }
794}
795
796/*
797 * Returns: 0 Success
798 * EINVAL
799 * EOPNOTSUPP
800 * <pru_listen>:EINVAL[AF_UNIX]
801 * <pru_listen>:EINVAL[TCP]
802 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
803 * <pru_listen>:EINVAL[TCP] Invalid argument
804 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
805 * <pru_listen>:EACCES[TCP] Permission denied
806 * <pru_listen>:EADDRINUSE[TCP] Address in use
807 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
808 * <pru_listen>:EPERM[TCP] Operation not permitted
809 * <sf_listen>:???
810 *
811 * Notes: Other <pru_listen> returns depend on the protocol family; all
812 * <sf_listen> returns depend on what the filter author causes
813 * their filter to return.
814 */
815int
816solisten(struct socket *so, int backlog)
817{
818 struct proc *p = current_proc();
819 int error = 0;
820
821 socket_lock(so, 1);
822
823 so_update_last_owner_locked(so, p);
824 so_update_policy(so);
825
826 if (so->so_proto == NULL) {
827 error = EINVAL;
828 goto out;
829 }
830 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
831 error = EOPNOTSUPP;
832 goto out;
833 }
834
835 /*
836 * If the listen request is made on a socket that is not fully
837 * disconnected, or on a socket that has been marked as inactive,
838 * reject the request now.
839 */
840 if ((so->so_state &
841 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
842 (so->so_flags & SOF_DEFUNCT)) {
843 error = EINVAL;
844 if (so->so_flags & SOF_DEFUNCT) {
845 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
846 "(%d)\n", __func__, proc_pid(p),
847 (uint64_t)VM_KERNEL_ADDRPERM(so),
848 SOCK_DOM(so), SOCK_TYPE(so), error));
849 }
850 goto out;
851 }
852
853 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
854 error = EPERM;
855 goto out;
856 }
857
858 error = sflt_listen(so);
859 if (error == 0)
860 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
861
862 if (error) {
863 if (error == EJUSTRETURN)
864 error = 0;
865 goto out;
866 }
867
868 if (TAILQ_EMPTY(&so->so_comp))
869 so->so_options |= SO_ACCEPTCONN;
870 /*
871 * POSIX: The implementation may have an upper limit on the length of
 872 * the listen queue, either global or per accepting socket. If backlog
873 * exceeds this limit, the length of the listen queue is set to the
874 * limit.
875 *
876 * If listen() is called with a backlog argument value that is less
877 * than 0, the function behaves as if it had been called with a backlog
878 * argument value of 0.
879 *
880 * A backlog argument of 0 may allow the socket to accept connections,
881 * in which case the length of the listen queue may be set to an
882 * implementation-defined minimum value.
883 */
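/*
 * Illustrative example: with the default somaxconn of 128, backlog
 * values of -1, 0 and 1024 all cause so_qlimit to be clamped to 128
 * by the check below.
 */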
884 if (backlog <= 0 || backlog > somaxconn)
885 backlog = somaxconn;
886
887 so->so_qlimit = backlog;
888out:
889 socket_unlock(so, 1);
890 return (error);
891}
892
893void
894sofreelastref(struct socket *so, int dealloc)
895{
896 struct socket *head = so->so_head;
897
898 /* Assume socket is locked */
899
900 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
901 selthreadclear(&so->so_snd.sb_sel);
902 selthreadclear(&so->so_rcv.sb_sel);
903 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
904 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
905 so->so_event = NULL;
906 return;
907 }
908 if (head != NULL) {
909 socket_lock(head, 1);
910 if (so->so_state & SS_INCOMP) {
911 TAILQ_REMOVE(&head->so_incomp, so, so_list);
912 head->so_incqlen--;
913 } else if (so->so_state & SS_COMP) {
914 /*
915 * We must not decommission a socket that's
916 * on the accept(2) queue. If we do, then
917 * accept(2) may hang after select(2) indicated
918 * that the listening socket was ready.
919 */
920 selthreadclear(&so->so_snd.sb_sel);
921 selthreadclear(&so->so_rcv.sb_sel);
922 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
923 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
924 so->so_event = NULL;
925 socket_unlock(head, 1);
926 return;
927 } else {
928 panic("sofree: not queued");
929 }
930 head->so_qlen--;
931 so->so_state &= ~SS_INCOMP;
932 so->so_head = NULL;
933 socket_unlock(head, 1);
934 }
935 sowflush(so);
936 sorflush(so);
937
938#if FLOW_DIVERT
939 if (so->so_flags & SOF_FLOW_DIVERT) {
940 flow_divert_detach(so);
941 }
942#endif /* FLOW_DIVERT */
943
944 /* 3932268: disable upcall */
945 so->so_rcv.sb_flags &= ~SB_UPCALL;
946 so->so_snd.sb_flags &= ~SB_UPCALL;
947 so->so_event = NULL;
948
949 if (dealloc)
950 sodealloc(so);
951}
952
953void
954soclose_wait_locked(struct socket *so)
955{
956 lck_mtx_t *mutex_held;
957
958 if (so->so_proto->pr_getlock != NULL)
959 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
960 else
961 mutex_held = so->so_proto->pr_domain->dom_mtx;
962 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
963
964 /*
965 * Double check here and return if there's no outstanding upcall;
966 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
967 */
968 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
969 return;
970 so->so_rcv.sb_flags &= ~SB_UPCALL;
971 so->so_snd.sb_flags &= ~SB_UPCALL;
972 so->so_flags |= SOF_CLOSEWAIT;
973 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
974 "soclose_wait_locked", NULL);
975 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
976 so->so_flags &= ~SOF_CLOSEWAIT;
977}
978
979/*
980 * Close a socket on last file table reference removal.
981 * Initiate disconnect if connected.
982 * Free socket when disconnect complete.
983 */
984int
985soclose_locked(struct socket *so)
986{
987 int error = 0;
988 lck_mtx_t *mutex_held;
989 struct timespec ts;
990
991 if (so->so_usecount == 0) {
992 panic("soclose: so=%p refcount=0\n", so);
993 /* NOTREACHED */
994 }
995
996 sflt_notify(so, sock_evt_closing, NULL);
997
998 if (so->so_upcallusecount)
999 soclose_wait_locked(so);
1000
1001 if ((so->so_options & SO_ACCEPTCONN)) {
1002 struct socket *sp, *sonext;
1003 int socklock = 0;
1004
1005 /*
 1006 * We do not want new connections to be added
 1007 * to the connection queues.
1008 */
1009 so->so_options &= ~SO_ACCEPTCONN;
1010
1011 for (sp = TAILQ_FIRST(&so->so_incomp);
1012 sp != NULL; sp = sonext) {
1013 sonext = TAILQ_NEXT(sp, so_list);
1014
1015 /*
1016 * Radar 5350314
 1017 * Skip sockets thrown away by tcpdropdropblreq;
 1018 * they will get cleaned up by the garbage collection.
 1019 * Otherwise, remove the incomp socket from the queue
 1020 * and let soabort trigger the appropriate cleanup.
1021 */
1022 if (sp->so_flags & SOF_OVERFLOW)
1023 continue;
1024
1025 if (so->so_proto->pr_getlock != NULL) {
1026 /*
 1027 * Lock ordering: for consistency with the
 1028 * rest of the stack, we lock the socket
 1029 * first and then grab the head.
1030 */
1031 socket_unlock(so, 0);
1032 socket_lock(sp, 1);
1033 socket_lock(so, 0);
1034 socklock = 1;
1035 }
1036
1037 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1038 so->so_incqlen--;
1039
1040 if (sp->so_state & SS_INCOMP) {
1041 sp->so_state &= ~SS_INCOMP;
1042 sp->so_head = NULL;
1043
1044 (void) soabort(sp);
1045 }
1046
1047 if (socklock)
1048 socket_unlock(sp, 1);
1049 }
1050
1051 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1052 /* Dequeue from so_comp since sofree() won't do it */
1053 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1054 so->so_qlen--;
1055
1056 if (so->so_proto->pr_getlock != NULL) {
1057 socket_unlock(so, 0);
1058 socket_lock(sp, 1);
1059 }
1060
1061 if (sp->so_state & SS_COMP) {
1062 sp->so_state &= ~SS_COMP;
1063 sp->so_head = NULL;
1064
1065 (void) soabort(sp);
1066 }
1067
1068 if (so->so_proto->pr_getlock != NULL) {
1069 socket_unlock(sp, 1);
1070 socket_lock(so, 0);
1071 }
1072 }
1073 }
1074 if (so->so_pcb == NULL) {
1075 /* 3915887: mark the socket as ready for dealloc */
1076 so->so_flags |= SOF_PCBCLEARING;
1077 goto discard;
1078 }
1079 if (so->so_state & SS_ISCONNECTED) {
1080 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1081 error = sodisconnectlocked(so);
1082 if (error)
1083 goto drop;
1084 }
1085 if (so->so_options & SO_LINGER) {
1086 if ((so->so_state & SS_ISDISCONNECTING) &&
1087 (so->so_state & SS_NBIO))
1088 goto drop;
1089 if (so->so_proto->pr_getlock != NULL)
1090 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1091 else
1092 mutex_held = so->so_proto->pr_domain->dom_mtx;
1093 while (so->so_state & SS_ISCONNECTED) {
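/*
 * so_linger is kept in ticks of 1/100 of a second; convert it to a
 * timespec for msleep(). For example, so_linger == 250 yields
 * ts = { 2 sec, 500000000 nsec }, i.e. 2.5 seconds.
 */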
1094 ts.tv_sec = (so->so_linger/100);
1095 ts.tv_nsec = (so->so_linger % 100) *
1096 NSEC_PER_USEC * 1000 * 10;
1097 error = msleep((caddr_t)&so->so_timeo,
1098 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1099 if (error) {
1100 /*
 1101 * It's OK when the timer fires;
 1102 * don't report an error.
1103 */
1104 if (error == EWOULDBLOCK)
1105 error = 0;
1106 break;
1107 }
1108 }
1109 }
1110 }
1111drop:
1112 if (so->so_usecount == 0) {
1113 panic("soclose: usecount is zero so=%p\n", so);
1114 /* NOTREACHED */
1115 }
1116 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1117 /*
1118 * Let NetworkStatistics know this PCB is going away
1119 * before we detach it.
1120 */
1121 if (nstat_collect &&
1122 (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6))
1123 nstat_pcb_detach(so->so_pcb);
1124
1125 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1126 if (error == 0)
1127 error = error2;
1128 }
1129 if (so->so_usecount <= 0) {
1130 panic("soclose: usecount is zero so=%p\n", so);
1131 /* NOTREACHED */
1132 }
1133discard:
1134 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1135 (so->so_state & SS_NOFDREF)) {
1136 panic("soclose: NOFDREF");
1137 /* NOTREACHED */
1138 }
1139 so->so_state |= SS_NOFDREF;
1140
1141 if (so->so_flags & SOF_MP_SUBFLOW)
1142 so->so_flags &= ~SOF_MP_SUBFLOW;
1143
1144 if ((so->so_flags & SOF_KNOTE) != 0)
1145 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1146
1147 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1148 evsofree(so);
1149
1150 so->so_usecount--;
1151 sofree(so);
1152 return (error);
1153}
1154
1155int
1156soclose(struct socket *so)
1157{
1158 int error = 0;
1159 socket_lock(so, 1);
1160
1161 if (so->so_retaincnt == 0) {
1162 error = soclose_locked(so);
1163 } else {
1164 /*
 1165 * If the FD is going away but the socket is
 1166 * retained in the kernel, just remove its reference.
1167 */
1168 so->so_usecount--;
1169 if (so->so_usecount < 2)
1170 panic("soclose: retaincnt non null and so=%p "
1171 "usecount=%d\n", so, so->so_usecount);
1172 }
1173 socket_unlock(so, 1);
1174 return (error);
1175}
1176
1177/*
1178 * Must be called at splnet...
1179 */
1180/* Should already be locked */
1181int
1182soabort(struct socket *so)
1183{
1184 int error;
1185
1186#ifdef MORE_LOCKING_DEBUG
1187 lck_mtx_t *mutex_held;
1188
1189 if (so->so_proto->pr_getlock != NULL)
1190 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1191 else
1192 mutex_held = so->so_proto->pr_domain->dom_mtx;
1193 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1194#endif
1195
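/*
 * Only call the protocol's abort routine once per socket; SOF_ABORTED
 * guards against a second abort on a socket already being torn down.
 */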
1196 if ((so->so_flags & SOF_ABORTED) == 0) {
1197 so->so_flags |= SOF_ABORTED;
1198 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1199 if (error) {
1200 sofree(so);
1201 return (error);
1202 }
1203 }
1204 return (0);
1205}
1206
1207int
1208soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1209{
1210 int error;
1211
1212 if (dolock)
1213 socket_lock(so, 1);
1214
1215 so_update_last_owner_locked(so, PROC_NULL);
1216 so_update_policy(so);
1217
1218 if ((so->so_state & SS_NOFDREF) == 0)
1219 panic("soaccept: !NOFDREF");
1220 so->so_state &= ~SS_NOFDREF;
1221 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1222
1223 if (dolock)
1224 socket_unlock(so, 1);
1225 return (error);
1226}
1227
1228int
1229soaccept(struct socket *so, struct sockaddr **nam)
1230{
1231 return (soacceptlock(so, nam, 1));
1232}
1233
1234int
1235soacceptfilter(struct socket *so)
1236{
1237 struct sockaddr *local = NULL, *remote = NULL;
1238 int error = 0;
1239 struct socket *head = so->so_head;
1240
1241 /*
1242 * Hold the lock even if this socket has not been made visible
1243 * to the filter(s). For sockets with global locks, this protects
1244 * against the head or peer going away
1245 */
1246 socket_lock(so, 1);
1247 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1248 sogetaddr_locked(so, &local, 0) != 0) {
1249 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1250 so->so_head = NULL;
1251 socket_unlock(so, 1);
1252 soclose(so);
1253 /* Out of resources; try it again next time */
1254 error = ECONNABORTED;
1255 goto done;
1256 }
1257
1258 error = sflt_accept(head, so, local, remote);
1259
1260 /*
1261 * If we get EJUSTRETURN from one of the filters, mark this socket
1262 * as inactive and return it anyway. This newly accepted socket
1263 * will be disconnected later before we hand it off to the caller.
1264 */
1265 if (error == EJUSTRETURN) {
1266 error = 0;
1267 (void) sosetdefunct(current_proc(), so,
1268 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1269 }
1270
1271 if (error != 0) {
1272 /*
1273 * This may seem like a duplication to the above error
1274 * handling part when we return ECONNABORTED, except
1275 * the following is done while holding the lock since
1276 * the socket has been exposed to the filter(s) earlier.
1277 */
1278 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1279 so->so_head = NULL;
1280 socket_unlock(so, 1);
1281 soclose(so);
1282 /* Propagate socket filter's error code to the caller */
1283 } else {
1284 socket_unlock(so, 1);
1285 }
1286done:
1287 /* Callee checks for NULL pointer */
1288 sock_freeaddr(remote);
1289 sock_freeaddr(local);
1290 return (error);
1291}
1292
1293/*
1294 * Returns: 0 Success
1295 * EOPNOTSUPP Operation not supported on socket
1296 * EISCONN Socket is connected
1297 * <pru_connect>:EADDRNOTAVAIL Address not available.
1298 * <pru_connect>:EINVAL Invalid argument
1299 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1300 * <pru_connect>:EACCES Permission denied
1301 * <pru_connect>:EADDRINUSE Address in use
1302 * <pru_connect>:EAGAIN Resource unavailable, try again
1303 * <pru_connect>:EPERM Operation not permitted
1304 * <sf_connect_out>:??? [anything a filter writer might set]
1305 */
1306int
1307soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1308{
1309 int error;
1310 struct proc *p = current_proc();
1311
1312 if (dolock)
1313 socket_lock(so, 1);
1314
1315 so_update_last_owner_locked(so, p);
1316 so_update_policy(so);
1317
1318 /*
1319 * If this is a listening socket or if this is a previously-accepted
1320 * socket that has been marked as inactive, reject the connect request.
1321 */
1322 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1323 error = EOPNOTSUPP;
1324 if (so->so_flags & SOF_DEFUNCT) {
1325 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1326 "(%d)\n", __func__, proc_pid(p),
1327 (uint64_t)VM_KERNEL_ADDRPERM(so),
1328 SOCK_DOM(so), SOCK_TYPE(so), error));
1329 }
1330 if (dolock)
1331 socket_unlock(so, 1);
1332 return (error);
1333 }
1334
1335 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1336 if (dolock)
1337 socket_unlock(so, 1);
1338 return (EPERM);
1339 }
1340
1341 /*
1342 * If protocol is connection-based, can only connect once.
1343 * Otherwise, if connected, try to disconnect first.
1344 * This allows user to disconnect by connecting to, e.g.,
1345 * a null address.
1346 */
1347 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1348 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1349 (error = sodisconnectlocked(so)))) {
1350 error = EISCONN;
1351 } else {
1352 /*
1353 * Run connect filter before calling protocol:
1354 * - non-blocking connect returns before completion;
1355 */
1356 error = sflt_connectout(so, nam);
1357 if (error != 0) {
1358 if (error == EJUSTRETURN)
1359 error = 0;
1360 } else {
1361 error = (*so->so_proto->pr_usrreqs->pru_connect)
1362 (so, nam, p);
1363 }
1364 }
1365 if (dolock)
1366 socket_unlock(so, 1);
1367 return (error);
1368}
1369
1370int
1371soconnect(struct socket *so, struct sockaddr *nam)
1372{
1373 return (soconnectlock(so, nam, 1));
1374}
1375
1376/*
1377 * Returns: 0 Success
1378 * <pru_connect2>:EINVAL[AF_UNIX]
1379 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1380 * <pru_connect2>:??? [other protocol families]
1381 *
1382 * Notes: <pru_connect2> is not supported by [TCP].
1383 */
1384int
1385soconnect2(struct socket *so1, struct socket *so2)
1386{
1387 int error;
1388
1389 socket_lock(so1, 1);
1390 if (so2->so_proto->pr_lock)
1391 socket_lock(so2, 1);
1392
1393 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1394
1395 socket_unlock(so1, 1);
1396 if (so2->so_proto->pr_lock)
1397 socket_unlock(so2, 1);
1398 return (error);
1399}
1400
1401int
1402soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1403 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1404 associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
1405 uint32_t arglen)
1406{
1407 int error;
1408
1409 /*
1410 * If this is a listening socket or if this is a previously-accepted
1411 * socket that has been marked as inactive, reject the connect request.
1412 */
1413 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1414 error = EOPNOTSUPP;
1415 if (so->so_flags & SOF_DEFUNCT) {
1416 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1417 "(%d)\n", __func__, proc_pid(p),
1418 (uint64_t)VM_KERNEL_ADDRPERM(so),
1419 SOCK_DOM(so), SOCK_TYPE(so), error));
1420 }
1421 return (error);
1422 }
1423
1424 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1425 return (EPERM);
1426
1427 /*
1428 * If protocol is connection-based, can only connect once
1429 * unless PR_MULTICONN is set. Otherwise, if connected,
1430 * try to disconnect first. This allows user to disconnect
1431 * by connecting to, e.g., a null address.
1432 */
1433 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1434 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1435 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1436 (error = sodisconnectlocked(so)) != 0)) {
1437 error = EISCONN;
1438 } else {
1439 /*
1440 * Run connect filter before calling protocol:
1441 * - non-blocking connect returns before completion;
1442 */
1443 error = sflt_connectxout(so, dst_sl);
1444 if (error != 0) {
1445 if (error == EJUSTRETURN)
1446 error = 0;
1447 } else {
1448 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1449 (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1450 flags, arg, arglen);
1451 }
1452 }
1453
1454 return (error);
1455}
1456
1457int
1458sodisconnectlocked(struct socket *so)
1459{
1460 int error;
1461
1462 if ((so->so_state & SS_ISCONNECTED) == 0) {
1463 error = ENOTCONN;
1464 goto bad;
1465 }
1466 if (so->so_state & SS_ISDISCONNECTING) {
1467 error = EALREADY;
1468 goto bad;
1469 }
1470
1471 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1472 if (error == 0)
1473 sflt_notify(so, sock_evt_disconnected, NULL);
1474
1475bad:
1476 return (error);
1477}
1478
1479/* Locking version */
1480int
1481sodisconnect(struct socket *so)
1482{
1483 int error;
1484
1485 socket_lock(so, 1);
1486 error = sodisconnectlocked(so);
1487 socket_unlock(so, 1);
1488 return (error);
1489}
1490
1491int
1492sodisconnectxlocked(struct socket *so, associd_t aid, connid_t cid)
1493{
1494 int error;
1495
1496 /*
1497 * Call the protocol disconnectx handler; let it handle all
1498 * matters related to the connection state of this session.
1499 */
1500 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1501 if (error == 0) {
1502 /*
1503 * The event applies only for the session, not for
1504 * the disconnection of individual subflows.
1505 */
1506 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1507 sflt_notify(so, sock_evt_disconnected, NULL);
1508 }
1509 return (error);
1510}
1511
1512int
1513sodisconnectx(struct socket *so, associd_t aid, connid_t cid)
1514{
1515 int error;
1516
1517 socket_lock(so, 1);
1518 error = sodisconnectxlocked(so, aid, cid);
1519 socket_unlock(so, 1);
1520 return (error);
1521}
1522
1523int
1524sopeelofflocked(struct socket *so, associd_t aid, struct socket **psop)
1525{
1526 return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1527}
1528
1529#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1530
1531/*
1532 * sosendcheck will lock the socket buffer if it isn't locked and
1533 * verify that there is space for the data being inserted.
1534 *
1535 * Returns: 0 Success
1536 * EPIPE
1537 * sblock:EWOULDBLOCK
1538 * sblock:EINTR
1539 * sbwait:EBADF
1540 * sbwait:EINTR
1541 * [so_error]:???
1542 */
1543int
1544sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1545 int32_t clen, int32_t atomic, int flags, int *sblocked,
1546 struct mbuf *control)
1547{
1548 int error = 0;
1549 int32_t space;
1550 int assumelock = 0;
1551
1552restart:
1553 if (*sblocked == 0) {
1554 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1555 so->so_send_filt_thread != 0 &&
1556 so->so_send_filt_thread == current_thread()) {
1557 /*
1558 * We're being called recursively from a filter,
1559 * allow this to continue. Radar 4150520.
1560 * Don't set sblocked because we don't want
1561 * to perform an unlock later.
1562 */
1563 assumelock = 1;
1564 } else {
1565 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1566 if (error) {
1567 if (so->so_flags & SOF_DEFUNCT)
1568 goto defunct;
1569 return (error);
1570 }
1571 *sblocked = 1;
1572 }
1573 }
1574
1575 /*
1576 * If a send attempt is made on a socket that has been marked
1577 * as inactive (disconnected), reject the request.
1578 */
1579 if (so->so_flags & SOF_DEFUNCT) {
1580defunct:
1581 error = EPIPE;
1582 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1583 __func__, proc_selfpid(), (uint64_t)VM_KERNEL_ADDRPERM(so),
1584 SOCK_DOM(so), SOCK_TYPE(so), error));
1585 return (error);
1586 }
1587
1588 if (so->so_state & SS_CANTSENDMORE)
1589 return (EPIPE);
1590
1591 if (so->so_error) {
1592 error = so->so_error;
1593 so->so_error = 0;
1594 return (error);
1595 }
1596
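/*
 * Unconnected socket: a connection-oriented protocol must be
 * connected (or confirming, or sending only control data), else
 * ENOTCONN; a connectionless protocol needs an explicit destination
 * address unless MSG_HOLD is in effect, else EDESTADDRREQ.
 */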
1597 if ((so->so_state & SS_ISCONNECTED) == 0) {
1598 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1599 if ((so->so_state & SS_ISCONFIRMING) == 0 &&
1600 !(resid == 0 && clen != 0))
1601 return (ENOTCONN);
1602 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1603 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1604 ENOTCONN : EDESTADDRREQ);
1605 }
1606 }
1607 if (so->so_flags & SOF_ENABLE_MSGS)
1608 space = msgq_sbspace(so, control);
1609 else
1610 space = sbspace(&so->so_snd);
1611
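/*
 * Out-of-band sends are granted 1024 bytes of slack so that urgent
 * data can still be queued when the send buffer is otherwise full.
 */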
1612 if (flags & MSG_OOB)
1613 space += 1024;
1614 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1615 clen > so->so_snd.sb_hiwat)
1616 return (EMSGSIZE);
1617
1618 if ((space < resid + clen &&
1619 (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) ||
1620 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1621 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1622 assumelock) {
1623 return (EWOULDBLOCK);
1624 }
1625 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1626 *sblocked = 0;
1627 error = sbwait(&so->so_snd);
1628 if (error) {
1629 if (so->so_flags & SOF_DEFUNCT)
1630 goto defunct;
1631 return (error);
1632 }
1633 goto restart;
1634 }
1635 return (0);
1636}
1637
1638/*
1639 * Send on a socket.
1640 * If send must go all at once and message is larger than
1641 * send buffering, then hard error.
1642 * Lock against other senders.
1643 * If must go all at once and not enough room now, then
1644 * inform user that this would block and do nothing.
1645 * Otherwise, if nonblocking, send as much as possible.
1646 * The data to be sent is described by "uio" if nonzero,
1647 * otherwise by the mbuf chain "top" (which must be null
1648 * if uio is not). Data provided in mbuf chain must be small
1649 * enough to send all at once.
1650 *
1651 * Returns nonzero on error, timeout or signal; callers
1652 * must check for short counts if EINTR/ERESTART are returned.
1653 * Data and control buffers are freed on return.
1654 * Experiment:
1655 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1656 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1657 * point at the mbuf chain being constructed and go from there.
1658 *
1659 * Returns: 0 Success
1660 * EOPNOTSUPP
1661 * EINVAL
1662 * ENOBUFS
1663 * uiomove:EFAULT
1664 * sosendcheck:EPIPE
1665 * sosendcheck:EWOULDBLOCK
1666 * sosendcheck:EINTR
1667 * sosendcheck:EBADF
1668 * sosendcheck:EINTR
1669 * sosendcheck:??? [value from so_error]
1670 * <pru_send>:ECONNRESET[TCP]
1671 * <pru_send>:EINVAL[TCP]
1672 * <pru_send>:ENOBUFS[TCP]
1673 * <pru_send>:EADDRINUSE[TCP]
1674 * <pru_send>:EADDRNOTAVAIL[TCP]
1675 * <pru_send>:EAFNOSUPPORT[TCP]
1676 * <pru_send>:EACCES[TCP]
1677 * <pru_send>:EAGAIN[TCP]
1678 * <pru_send>:EPERM[TCP]
1679 * <pru_send>:EMSGSIZE[TCP]
1680 * <pru_send>:EHOSTUNREACH[TCP]
1681 * <pru_send>:ENETUNREACH[TCP]
1682 * <pru_send>:ENETDOWN[TCP]
1683 * <pru_send>:ENOMEM[TCP]
1684 * <pru_send>:ENOBUFS[TCP]
1685 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1686 * <pru_send>:EINVAL[AF_UNIX]
1687 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1688 * <pru_send>:EPIPE[AF_UNIX]
1689 * <pru_send>:ENOTCONN[AF_UNIX]
1690 * <pru_send>:EISCONN[AF_UNIX]
1691 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1692 * <sf_data_out>:??? [whatever a filter author chooses]
1693 *
1694 * Notes: Other <pru_send> returns depend on the protocol family; all
1695 * <sf_data_out> returns depend on what the filter author causes
1696 * their filter to return.
1697 */
1698int
1699sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1700 struct mbuf *top, struct mbuf *control, int flags)
1701{
1702 struct mbuf **mp;
1703 struct mbuf *m, *freelist = NULL;
1704 user_ssize_t space, len, resid;
1705 int clen = 0, error, dontroute, mlen, sendflags;
1706 int atomic = sosendallatonce(so) || top;
1707 int sblocked = 0;
1708 struct proc *p = current_proc();
1709 struct mbuf *control_copy = NULL;
1710
1711 if (uio != NULL)
1712 resid = uio_resid(uio);
1713 else
1714 resid = top->m_pkthdr.len;
1715
1716 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1717 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1718
1719 socket_lock(so, 1);
1720 so_update_last_owner_locked(so, p);
1721 so_update_policy(so);
1722
1723 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1724 error = EOPNOTSUPP;
1725 socket_unlock(so, 1);
1726 goto out;
1727 }
1728
1729 /*
1730 * In theory resid should be unsigned.
1731 * However, space must be signed, as it might be less than 0
1732 * if we over-committed, and we must use a signed comparison
1733 * of space and resid. On the other hand, a negative resid
1734 * causes us to loop sending 0-length segments to the protocol.
1735 *
1736 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1737 * But it will be used by sockets doing message delivery.
1738 *
1739 * Note: We limit resid to be a positive 32 bits value as we use
 1740 * imin() to set bytes_to_copy -- rdar://14558484
1741 */
1742 if ((int32_t)resid < 0 || (so->so_type == SOCK_STREAM &&
1743 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1744 error = EINVAL;
1745 socket_unlock(so, 1);
1746 goto out;
1747 }
1748
1749 dontroute = (flags & MSG_DONTROUTE) &&
1750 (so->so_options & SO_DONTROUTE) == 0 &&
1751 (so->so_proto->pr_flags & PR_ATOMIC);
1752 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1753
1754 if (control != NULL)
1755 clen = control->m_len;
1756
1757 do {
1758 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1759 &sblocked, control);
1760 if (error)
1761 goto release;
1762
1763 mp = &top;
1764 if (so->so_flags & SOF_ENABLE_MSGS)
1765 space = msgq_sbspace(so, control);
1766 else
1767 space = sbspace(&so->so_snd) - clen;
1768 space += ((flags & MSG_OOB) ? 1024 : 0);
1769
1770 do {
1771 if (uio == NULL) {
1772 /*
1773 * Data is prepackaged in "top".
1774 */
1775 resid = 0;
1776 if (flags & MSG_EOR)
1777 top->m_flags |= M_EOR;
1778 } else {
1779 int chainlength;
1780 int bytes_to_copy;
1781 boolean_t jumbocl;
1782
1783 bytes_to_copy = imin(resid, space);
1784
1785 if (sosendminchain > 0)
1786 chainlength = 0;
1787 else
1788 chainlength = sosendmaxchain;
1789
1790 /*
1791 * Attempt to use larger than system page-size
1792 * clusters for large writes only if there is
1793 * a jumbo cluster pool and if the socket is
1794 * marked accordingly.
1795 */
1796 jumbocl = sosendjcl && njcl > 0 &&
1797 ((so->so_flags & SOF_MULTIPAGES) ||
1798 sosendjcl_ignore_capab);
1799
1800 socket_unlock(so, 0);
1801
1802 do {
1803 int num_needed;
1804 int hdrs_needed = (top == NULL) ? 1 : 0;
1805
1806 /*
 1807 * Try to maintain a local cache of mbuf
 1808 * clusters needed to complete this
 1809 * write. The list is further limited to
 1810 * the number that are currently needed
 1811 * to fill the socket. This mechanism
 1812 * allows a large number of mbufs/
 1813 * clusters to be grabbed under a single
 1814 * mbuf lock. If we can't get any
 1815 * clusters, then fall back to trying
 1816 * for mbufs. If we fail early (or
 1817 * miscalculate the number needed), make
 1818 * sure to release any clusters we
 1819 * haven't yet consumed.
1820 */
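/*
 * Allocation ladder: prefer 16K jumbo clusters (when jumbocl), then
 * 4K big clusters, then 2K clusters, and finally fall back to a
 * single mbuf.
 */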
1821 if (freelist == NULL &&
1822 bytes_to_copy > MBIGCLBYTES &&
1823 jumbocl) {
1824 num_needed =
1825 bytes_to_copy / M16KCLBYTES;
1826
1827 if ((bytes_to_copy -
1828 (num_needed * M16KCLBYTES))
1829 >= MINCLSIZE)
1830 num_needed++;
1831
1832 freelist =
1833 m_getpackets_internal(
1834 (unsigned int *)&num_needed,
1835 hdrs_needed, M_WAIT, 0,
1836 M16KCLBYTES);
1837 /*
1838 * Fall back to 4K cluster size
1839 * if allocation failed
1840 */
1841 }
1842
1843 if (freelist == NULL &&
1844 bytes_to_copy > MCLBYTES) {
1845 num_needed =
1846 bytes_to_copy / MBIGCLBYTES;
1847
1848 if ((bytes_to_copy -
1849 (num_needed * MBIGCLBYTES)) >=
1850 MINCLSIZE)
1851 num_needed++;
1852
1853 freelist =
1854 m_getpackets_internal(
1855 (unsigned int *)&num_needed,
1856 hdrs_needed, M_WAIT, 0,
1857 MBIGCLBYTES);
1858 /*
1859 * Fall back to cluster size
1860 * if allocation failed
1861 */
1862 }
1863
1864 if (freelist == NULL &&
1865 bytes_to_copy > MINCLSIZE) {
1866 num_needed =
1867 bytes_to_copy / MCLBYTES;
1868
1869 if ((bytes_to_copy -
1870 (num_needed * MCLBYTES)) >=
1871 MINCLSIZE)
1872 num_needed++;
1873
1874 freelist =
1875 m_getpackets_internal(
1876 (unsigned int *)&num_needed,
1877 hdrs_needed, M_WAIT, 0,
1878 MCLBYTES);
1879 /*
1880 * Fall back to a single mbuf
1881 * if allocation failed
1882 */
1883 }
1884
1885 if (freelist == NULL) {
1886 if (top == NULL)
1887 MGETHDR(freelist,
1888 M_WAIT, MT_DATA);
1889 else
1890 MGET(freelist,
1891 M_WAIT, MT_DATA);
1892
1893 if (freelist == NULL) {
1894 error = ENOBUFS;
1895 socket_lock(so, 0);
1896 goto release;
1897 }
1898 /*
1899 * For datagram protocols,
1900 * leave room for protocol
1901 * headers in first mbuf.
1902 */
1903 if (atomic && top == NULL &&
1904 bytes_to_copy < MHLEN) {
1905 MH_ALIGN(freelist,
1906 bytes_to_copy);
1907 }
1908 }
1909 m = freelist;
1910 freelist = m->m_next;
1911 m->m_next = NULL;
1912
1913 if ((m->m_flags & M_EXT))
1914 mlen = m->m_ext.ext_size;
1915 else if ((m->m_flags & M_PKTHDR))
1916 mlen =
1917 MHLEN - m_leadingspace(m);
1918 else
1919 mlen = MLEN;
1920 len = imin(mlen, bytes_to_copy);
1921
1922 chainlength += len;
1923
1924 space -= len;
1925
1926 error = uiomove(mtod(m, caddr_t),
1927 len, uio);
1928
1929 resid = uio_resid(uio);
1930
1931 m->m_len = len;
1932 *mp = m;
1933 top->m_pkthdr.len += len;
1934 if (error)
1935 break;
1936 mp = &m->m_next;
1937 if (resid <= 0) {
1938 if (flags & MSG_EOR)
1939 top->m_flags |= M_EOR;
1940 break;
1941 }
1942 bytes_to_copy = min(resid, space);
1943
1944 } while (space > 0 &&
1945 (chainlength < sosendmaxchain || atomic ||
1946 resid < MINCLSIZE));
1947
1948 socket_lock(so, 0);
1949
1950 if (error)
1951 goto release;
1952 }
1953
1954 if (flags & (MSG_HOLD|MSG_SEND)) {
1955 /* Enqueue for later, go away if HOLD */
1956 struct mbuf *mb1;
1957 if (so->so_temp && (flags & MSG_FLUSH)) {
1958 m_freem(so->so_temp);
1959 so->so_temp = NULL;
1960 }
1961 if (so->so_temp)
1962 so->so_tail->m_next = top;
1963 else
1964 so->so_temp = top;
1965 mb1 = top;
1966 while (mb1->m_next)
1967 mb1 = mb1->m_next;
1968 so->so_tail = mb1;
1969 if (flags & MSG_HOLD) {
1970 top = NULL;
1971 goto release;
1972 }
1973 top = so->so_temp;
1974 }
1975 if (dontroute)
1976 so->so_options |= SO_DONTROUTE;
1977
1978 /* Compute flags here, for pru_send and NKEs */
1979 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
1980 /*
1981 * If the user set MSG_EOF, the protocol
1982 * understands this flag, and there is nothing left
1983 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
1984 */
1985 ((flags & MSG_EOF) &&
1986 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
1987 (resid <= 0)) ? PRUS_EOF :
1988 /* If there is more to send set PRUS_MORETOCOME */
1989 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
1990
1991 /*
1992 * Socket filter processing
1993 */
1994 error = sflt_data_out(so, addr, &top,
1995 &control, (sendflags & MSG_OOB) ?
1996 sock_data_filt_flag_oob : 0);
1997 if (error) {
1998 if (error == EJUSTRETURN) {
1999 error = 0;
2000 clen = 0;
2001 control = NULL;
2002 top = NULL;
2003 }
2004
2005 goto release;
2006 }
2007 /*
2008 * End Socket filter processing
2009 */
2010
2011 if (so->so_flags & SOF_ENABLE_MSGS) {
2012 /*
2013 * Make a copy of control mbuf,
2014 * so that msg priority can be
2015 * passed to subsequent mbufs.
2016 */
2017 control_copy = m_dup(control, M_NOWAIT);
2018 }
2019 error = (*so->so_proto->pr_usrreqs->pru_send)
2020 (so, sendflags, top, addr, control, p);
2021
2022 if (flags & MSG_SEND)
2023 so->so_temp = NULL;
2024
2025 if (dontroute)
2026 so->so_options &= ~SO_DONTROUTE;
2027
2028 clen = 0;
2029 control = control_copy;
2030 control_copy = NULL;
2031 top = NULL;
2032 mp = &top;
2033 if (error)
2034 goto release;
2035 } while (resid && space > 0);
2036 } while (resid);
2037
2038release:
2039 if (sblocked)
2040 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2041 else
2042 socket_unlock(so, 1);
2043out:
2044 if (top != NULL)
2045 m_freem(top);
2046 if (control != NULL)
2047 m_freem(control);
2048 if (freelist != NULL)
2049 m_freem_list(freelist);
2050 if (control_copy != NULL)
2051 m_freem(control_copy);
2052
2053 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
2054 space, error);
2055
2056 return (error);
2057}
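/*
 * Illustrative sketch (not from this file, assumptions noted): a
 * user-space caller exercising the MSG_OOB path above.  sosend()
 * maps MSG_OOB to PRUS_OOB before handing the mbuf chain to the
 * protocol's pru_send.  "fd" is assumed to be a connected TCP socket.
 */
#if 0
#include <sys/socket.h>

static int
send_urgent_byte(int fd)
{
	/* One byte of urgent data; TCP delivers it out-of-band. */
	char mark = '!';

	if (send(fd, &mark, sizeof (mark), MSG_OOB) == -1)
		return (-1);
	return (0);
}
#endif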
2058
2059/*
2060 * Implement receive operations on a socket.
2061 * We depend on the way that records are added to the sockbuf
2062 * by sbappend*. In particular, each record (mbufs linked through m_next)
2063 * must begin with an address if the protocol so specifies,
2064 * followed by an optional mbuf or mbufs containing ancillary data,
2065 * and then zero or more mbufs of data.
2066 * In order to avoid blocking network interrupts for the entire time here,
2067 * we splx() while doing the actual copy to user space.
2068 * Although the sockbuf is locked, new data may still be appended,
2069 * and thus we must maintain consistency of the sockbuf during that time.
2070 *
2071 * The caller may receive the data as a single mbuf chain by supplying
2072 * an mbuf **mp0 for use in returning the chain. The uio is then used
2073 * only for the count in uio_resid.
2074 *
2075 * Returns: 0 Success
2076 * ENOBUFS
2077 * ENOTCONN
2078 * EWOULDBLOCK
2079 * uiomove:EFAULT
2080 * sblock:EWOULDBLOCK
2081 * sblock:EINTR
2082 * sbwait:EBADF
2083 * sbwait:EINTR
2084 * sodelayed_copy:EFAULT
2085 * <pru_rcvoob>:EINVAL[TCP]
2086 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2087 * <pru_rcvoob>:???
2088 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2089 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2090 * <pr_domain->dom_externalize>:???
2091 *
2092 * Notes: Additional return values from calls through <pru_rcvoob> and
2093 * <pr_domain->dom_externalize> depend on protocols other than
2094 * TCP or AF_UNIX, which are documented above.
2095 */
2096int
2097soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2098 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2099{
2100 struct mbuf *m, **mp, *ml = NULL;
2101 struct mbuf *nextrecord, *free_list;
2102 int flags, error, offset;
2103 user_ssize_t len;
2104 struct protosw *pr = so->so_proto;
2105 int moff, type = 0;
2106 user_ssize_t orig_resid = uio_resid(uio);
2107 user_ssize_t delayed_copy_len;
2108 int can_delay;
2109 int need_event;
2110 struct proc *p = current_proc();
2111
2112 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
2113 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
2114
2115 socket_lock(so, 1);
2116 so_update_last_owner_locked(so, p);
2117 so_update_policy(so);
2118
2119#ifdef MORE_LOCKING_DEBUG
2120 if (so->so_usecount == 1) {
2121 panic("%s: so=%x no other reference on socket\n", __func__, so);
2122 /* NOTREACHED */
2123 }
2124#endif
2125 mp = mp0;
2126 if (psa != NULL)
2127 *psa = NULL;
2128 if (controlp != NULL)
2129 *controlp = NULL;
2130 if (flagsp != NULL)
2131 flags = *flagsp &~ MSG_EOR;
2132 else
2133 flags = 0;
2134
2135 /*
2136 * If a recv attempt is made on a previously-accepted socket
2137 * that has been marked as inactive (disconnected), reject
2138 * the request.
2139 */
2140 if (so->so_flags & SOF_DEFUNCT) {
2141 struct sockbuf *sb = &so->so_rcv;
2142
2143 error = ENOTCONN;
2144 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2145 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
2146 SOCK_DOM(so), SOCK_TYPE(so), error));
2147 /*
2148 * This socket should have been disconnected and flushed
2149 * prior to being returned from sodefunct(); there should
2150 * be no data on its receive list, so panic otherwise.
2151 */
2152 if (so->so_state & SS_DEFUNCT)
2153 sb_empty_assert(sb, __func__);
2154 socket_unlock(so, 1);
2155 return (error);
2156 }
2157
2158 /*
2159 * When SO_WANTOOBFLAG is set we try to get out-of-band data
2160 * regardless of the flags argument. Here is the case where
2161 * out-of-band data is not inline.
2162 */
2163 if ((flags & MSG_OOB) ||
2164 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2165 (so->so_options & SO_OOBINLINE) == 0 &&
2166 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
2167 m = m_get(M_WAIT, MT_DATA);
2168 if (m == NULL) {
2169 socket_unlock(so, 1);
2170 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
2171 ENOBUFS, 0, 0, 0, 0);
2172 return (ENOBUFS);
2173 }
2174 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
2175 if (error)
2176 goto bad;
2177 socket_unlock(so, 0);
2178 do {
2179 error = uiomove(mtod(m, caddr_t),
2180 imin(uio_resid(uio), m->m_len), uio);
2181 m = m_free(m);
2182 } while (uio_resid(uio) && error == 0 && m != NULL);
2183 socket_lock(so, 0);
2184bad:
2185 if (m != NULL)
2186 m_freem(m);
2187
2188 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
2189 if (error == EWOULDBLOCK || error == EINVAL) {
2190 /*
2191 * Let's try to get normal data:
2192 * EWOULDBLOCK: out-of-band data not
2193 * received yet. EINVAL: out-of-band data
2194 * already read.
2195 */
2196 error = 0;
2197 goto nooob;
2198 } else if (error == 0 && flagsp != NULL) {
2199 *flagsp |= MSG_OOB;
2200 }
2201 }
2202 socket_unlock(so, 1);
2203 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2204 0, 0, 0, 0);
2205
2206 return (error);
2207 }
2208nooob:
2209 if (mp != NULL)
2210 *mp = NULL;
2211 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio))
2212 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
2213
2214 free_list = NULL;
2215 delayed_copy_len = 0;
2216restart:
2217#ifdef MORE_LOCKING_DEBUG
2218 if (so->so_usecount <= 1)
2219 printf("soreceive: sblock so=%p ref=%d on socket\n",
2220 so, so->so_usecount);
2221#endif
2222 /*
2223 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2224 * and if so just return to the caller. This could happen when
2225 * soreceive() is called by a socket upcall function during the
2226 * time the socket is freed. The socket buffer would have been
2227 * locked across the upcall, therefore we cannot put this thread
2228 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2229 * we may livelock), because the lock on the socket buffer will
2230 * only be released when the upcall routine returns to its caller.
2231 * Because the socket has been officially closed, there can be
2232 * no further read on it.
2233 *
2234 * A multipath subflow socket would have its SS_NOFDREF set by
2235 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2236 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2237 */
2238 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2239 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2240 socket_unlock(so, 1);
2241 return (0);
2242 }
2243
2244 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2245 if (error) {
2246 socket_unlock(so, 1);
2247 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2248 0, 0, 0, 0);
2249 return (error);
2250 }
2251
2252 m = so->so_rcv.sb_mb;
2253 /*
2254 * If we have less data than requested, block awaiting more
2255 * (subject to any timeout) if:
2256 * 1. the current count is less than the low water mark, or
2257 * 2. MSG_WAITALL is set, and it is possible to do the entire
2258 * receive operation at once if we block (resid <= hiwat).
2259 * 3. MSG_DONTWAIT is not set
2260 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2261 * we have to do the receive in sections, and thus risk returning
2262 * a short count if a timeout or signal occurs after we start.
2263 */
2264 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
2265 so->so_rcv.sb_cc < uio_resid(uio)) &&
2266 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2267 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2268 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2269 /*
2270 * Panic if we notice inconsistencies in the socket's
2271 * receive list; both sb_mb and sb_cc should correctly
2272 * reflect the contents of the list, otherwise we may
2273 * end up with false positives during select() or poll()
2274 * which could put the application in a bad state.
2275 */
2276 SB_MB_CHECK(&so->so_rcv);
2277
2278 if (so->so_error) {
2279 if (m != NULL)
2280 goto dontblock;
2281 error = so->so_error;
2282 if ((flags & MSG_PEEK) == 0)
2283 so->so_error = 0;
2284 goto release;
2285 }
2286 if (so->so_state & SS_CANTRCVMORE) {
2287 if (m != NULL)
2288 goto dontblock;
2289 else
2290 goto release;
2291 }
2292 for (; m != NULL; m = m->m_next)
2293 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2294 m = so->so_rcv.sb_mb;
2295 goto dontblock;
2296 }
2297 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2298 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2299 error = ENOTCONN;
2300 goto release;
2301 }
2302 if (uio_resid(uio) == 0)
2303 goto release;
2304 if ((so->so_state & SS_NBIO) ||
2305 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2306 error = EWOULDBLOCK;
2307 goto release;
2308 }
2309 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2310 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2311 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
2312#if EVEN_MORE_LOCKING_DEBUG
2313 if (socket_debug)
2314 printf("Waiting for socket data\n");
2315#endif
2316
2317 error = sbwait(&so->so_rcv);
2318#if EVEN_MORE_LOCKING_DEBUG
2319 if (socket_debug)
2320 printf("SORECEIVE - sbwait returned %d\n", error);
2321#endif
2322 if (so->so_usecount < 1) {
2323 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
2324 __func__, so, so->so_usecount);
2325 /* NOTREACHED */
2326 }
2327 if (error) {
2328 socket_unlock(so, 1);
2329 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2330 0, 0, 0, 0);
2331 return (error);
2332 }
2333 goto restart;
2334 }
2335dontblock:
2336 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2337 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2338 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2339 nextrecord = m->m_nextpkt;
2340 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2341 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2342#if CONFIG_MACF_SOCKET_SUBSET
2343 /*
2344 * Call the MAC framework for policy checking if we're in
2345 * the user process context and the socket isn't connected.
2346 */
2347 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2348 struct mbuf *m0 = m;
2349 /*
2350 * Dequeue this record (temporarily) from the receive
2351 * list since we're about to drop the socket's lock
2352 * where a new record may arrive and be appended to
2353 * the list. Upon MAC policy failure, the record
2354 * will be freed. Otherwise, we'll add it back to
2355 * the head of the list. We cannot rely on SB_LOCK
2356 * because the append operation uses the socket's lock.
2357 */
2358 do {
2359 m->m_nextpkt = NULL;
2360 sbfree(&so->so_rcv, m);
2361 m = m->m_next;
2362 } while (m != NULL);
2363 m = m0;
2364 so->so_rcv.sb_mb = nextrecord;
2365 SB_EMPTY_FIXUP(&so->so_rcv);
2366 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2367 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2368 socket_unlock(so, 0);
2369 if (mac_socket_check_received(proc_ucred(p), so,
2370 mtod(m, struct sockaddr *)) != 0) {
2371 /*
2372 * MAC policy failure; free this record and
2373 * process the next record (or block until
2374 * one is available). We have adjusted sb_cc
2375 * and sb_mbcnt above so there is no need to
2376 * call sbfree() again.
2377 */
2378 do {
2379 m = m_free(m);
2380 } while (m != NULL);
2381 /*
2382 * Clear SB_LOCK but don't unlock the socket.
2383 * Process the next record or wait for one.
2384 */
2385 socket_lock(so, 0);
2386 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2387 goto restart;
2388 }
2389 socket_lock(so, 0);
2390 /*
2391 * If the socket has been defunct'd, drop it.
2392 */
2393 if (so->so_flags & SOF_DEFUNCT) {
2394 m_freem(m);
2395 error = ENOTCONN;
2396 goto release;
2397 }
2398 /*
2399 * Re-adjust the socket receive list and re-enqueue
2400 * the record in front of any packets which may have
2401 * been appended while we dropped the lock.
2402 */
2403 for (m = m0; m->m_next != NULL; m = m->m_next)
2404 sballoc(&so->so_rcv, m);
2405 sballoc(&so->so_rcv, m);
2406 if (so->so_rcv.sb_mb == NULL) {
2407 so->so_rcv.sb_lastrecord = m0;
2408 so->so_rcv.sb_mbtail = m;
2409 }
2410 m = m0;
2411 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2412 so->so_rcv.sb_mb = m;
2413 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2414 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2415 }
2416#endif /* CONFIG_MACF_SOCKET_SUBSET */
2417 orig_resid = 0;
2418 if (psa != NULL) {
2419 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
2420 mp0 == NULL);
2421 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2422 error = EWOULDBLOCK;
2423 goto release;
2424 }
2425 }
2426 if (flags & MSG_PEEK) {
2427 m = m->m_next;
2428 } else {
2429 sbfree(&so->so_rcv, m);
2430 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2431 panic("%s: about to create invalid socketbuf",
2432 __func__);
2433 /* NOTREACHED */
2434 }
2435 MFREE(m, so->so_rcv.sb_mb);
2436 m = so->so_rcv.sb_mb;
2437 if (m != NULL) {
2438 m->m_nextpkt = nextrecord;
2439 } else {
2440 so->so_rcv.sb_mb = nextrecord;
2441 SB_EMPTY_FIXUP(&so->so_rcv);
2442 }
2443 }
2444 }
2445
2446 /*
2447 * Process one or more MT_CONTROL mbufs present before any data mbufs
2448 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2449 * just copy the data; if !MSG_PEEK, we call into the protocol to
2450 * perform externalization.
2451 */
2452 if (m != NULL && m->m_type == MT_CONTROL) {
2453 struct mbuf *cm = NULL, *cmn;
2454 struct mbuf **cme = &cm;
2455 struct sockbuf *sb_rcv = &so->so_rcv;
2456 struct mbuf **msgpcm = NULL;
2457
2458 /*
2459 * Externalizing the control messages would require us to
2460 * drop the socket's lock below. Once we re-acquire the
2461 * lock, the mbuf chain might change. In order to preserve
2462 * consistency, we unlink all control messages from the
2463 * first mbuf chain in one shot and link them separately
2464 * onto a different chain.
2465 */
2466 do {
2467 if (flags & MSG_PEEK) {
2468 if (controlp != NULL) {
2469 if (*controlp == NULL) {
2470 msgpcm = controlp;
2471 }
2472 *controlp = m_copy(m, 0, m->m_len);
2473
2474 /*
2475 * If we failed to allocate an mbuf,
2476 * release any previously allocated
2477 * mbufs for control data. Return
2478 * an error. Keep the mbufs in the
2479 * socket as this is using the
2480 * MSG_PEEK flag.
2481 */
2482 if (*controlp == NULL) {
2483 m_freem(*msgpcm);
2484 error = ENOBUFS;
2485 goto release;
2486 }
2487 controlp = &(*controlp)->m_next;
2488 }
2489 m = m->m_next;
2490 } else {
2491 m->m_nextpkt = NULL;
2492 sbfree(sb_rcv, m);
2493 sb_rcv->sb_mb = m->m_next;
2494 m->m_next = NULL;
2495 *cme = m;
2496 cme = &(*cme)->m_next;
2497 m = sb_rcv->sb_mb;
2498 }
2499 } while (m != NULL && m->m_type == MT_CONTROL);
2500
2501 if (!(flags & MSG_PEEK)) {
2502 if (sb_rcv->sb_mb != NULL) {
2503 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2504 } else {
2505 sb_rcv->sb_mb = nextrecord;
2506 SB_EMPTY_FIXUP(sb_rcv);
2507 }
2508 if (nextrecord == NULL)
2509 sb_rcv->sb_lastrecord = m;
2510 }
2511
2512 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2513 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2514
2515 while (cm != NULL) {
2516 int cmsg_type;
2517
2518 cmn = cm->m_next;
2519 cm->m_next = NULL;
2520 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2521
2522 /*
2523 * Call the protocol to externalize SCM_RIGHTS message
2524 * and return the modified message to the caller upon
2525 * success. Otherwise, all other control messages are
2526 * returned unmodified to the caller. Note that we
2527 * only get into this loop if MSG_PEEK is not set.
2528 */
2529 if (pr->pr_domain->dom_externalize != NULL &&
2530 cmsg_type == SCM_RIGHTS) {
2531 /*
2532 * Release socket lock: see 3903171. This
2533 * would also allow more records to be appended
2534 * to the socket buffer. We still have SB_LOCK
2535 * set on it, so we can be sure that the head
2536 * of the mbuf chain won't change.
2537 */
2538 socket_unlock(so, 0);
2539 error = (*pr->pr_domain->dom_externalize)(cm);
2540 socket_lock(so, 0);
2541 } else {
2542 error = 0;
2543 }
2544
2545 if (controlp != NULL && error == 0) {
2546 *controlp = cm;
2547 controlp = &(*controlp)->m_next;
2548 orig_resid = 0;
2549 } else {
2550 (void) m_free(cm);
2551 }
2552 cm = cmn;
2553 }
2554 /*
2555 * Update the value of nextrecord in case we received new
2556 * records when the socket was unlocked above for
2557 * externalizing SCM_RIGHTS.
2558 */
2559 if (m != NULL)
2560 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2561 else
2562 nextrecord = sb_rcv->sb_mb;
2563 orig_resid = 0;
2564 }
2565
2566 /*
2567 * If the socket is a TCP socket with message delivery
2568 * enabled, then create a control msg to deliver the
2569 * relative TCP sequence number for this data. Waiting
2570 * until this point will protect against failures to
2571 * allocate an mbuf for control msgs.
2572 */
2573 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
2574 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
2575 struct mbuf *seq_cm;
2576
2577 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
2578 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
2579 if (seq_cm == NULL) {
2580 /* unable to allocate a control mbuf */
2581 error = ENOBUFS;
2582 goto release;
2583 }
2584 *controlp = seq_cm;
2585 controlp = &seq_cm->m_next;
2586 }
2587
2588 if (m != NULL) {
2589 if (!(flags & MSG_PEEK)) {
2590 /*
2591 * We get here because m points to an mbuf following
2592 * any MT_SONAME or MT_CONTROL mbufs which have been
2593 * processed above. In any case, m should be pointing
2594 * to the head of the mbuf chain, and the nextrecord
2595 * should be either NULL or equal to m->m_nextpkt.
2596 * See comments above about SB_LOCK.
2597 */
2598 if (m != so->so_rcv.sb_mb ||
2599 m->m_nextpkt != nextrecord) {
2600 panic("%s: post-control !sync so=%p m=%p "
2601 "nextrecord=%p\n", __func__, so, m,
2602 nextrecord);
2603 /* NOTREACHED */
2604 }
2605 if (nextrecord == NULL)
2606 so->so_rcv.sb_lastrecord = m;
2607 }
2608 type = m->m_type;
2609 if (type == MT_OOBDATA)
2610 flags |= MSG_OOB;
2611 } else {
2612 if (!(flags & MSG_PEEK)) {
2613 SB_EMPTY_FIXUP(&so->so_rcv);
2614 }
2615 }
2616 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
2617 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
2618
2619 moff = 0;
2620 offset = 0;
2621
2622 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
2623 can_delay = 1;
2624 else
2625 can_delay = 0;
2626
2627 need_event = 0;
2628
2629 while (m != NULL &&
2630 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
2631 if (m->m_type == MT_OOBDATA) {
2632 if (type != MT_OOBDATA)
2633 break;
2634 } else if (type == MT_OOBDATA) {
2635 break;
2636 }
2637 /*
2638 * Make sure to always set MSG_OOB event when getting
2639 * out of band data inline.
2640 */
2641 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2642 (so->so_options & SO_OOBINLINE) != 0 &&
2643 (so->so_state & SS_RCVATMARK) != 0) {
2644 flags |= MSG_OOB;
2645 }
2646 so->so_state &= ~SS_RCVATMARK;
2647 len = uio_resid(uio) - delayed_copy_len;
2648 if (so->so_oobmark && len > so->so_oobmark - offset)
2649 len = so->so_oobmark - offset;
2650 if (len > m->m_len - moff)
2651 len = m->m_len - moff;
2652 /*
2653 * If mp is set, just pass back the mbufs.
2654 * Otherwise copy them out via the uio, then free.
2655 * Sockbuf must be consistent here (sb_mb points to the current
2656 * mbuf, its m_nextpkt to the next record) when we drop priority;
2657 * we must note any additions to the sockbuf when we
2658 * block interrupts again.
2659 */
2660 if (mp == NULL) {
2661 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
2662 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
2663 if (can_delay && len == m->m_len) {
2664 /*
2665 * only delay the copy if we're consuming the
2666 * mbuf and we're NOT in MSG_PEEK mode
2666 * and we have enough data to make it worthwhile
2667 * to drop and retake the lock... can_delay
2668 * reflects the state of the latter two
2669 * constraints; moff should always be zero
2670 * in these cases.
2672 */
2673 delayed_copy_len += len;
2674 } else {
2675 if (delayed_copy_len) {
2676 error = sodelayed_copy(so, uio,
2677 &free_list, &delayed_copy_len);
2678
2679 if (error) {
2680 goto release;
2681 }
2682 /*
2683 * We can only get here if MSG_PEEK is not
2684 * set; therefore, m should point at the
2685 * head of the rcv queue. If it doesn't,
2686 * it means something drastically
2687 * changed while we were out from behind
2688 * the lock in sodelayed_copy, perhaps
2689 * a RST on the stream. In any event,
2690 * the stream has been interrupted. It's
2691 * probably best just to return whatever
2692 * data we've moved and let the caller
2693 * sort it out...
2694 */
2695 if (m != so->so_rcv.sb_mb) {
2696 break;
2697 }
2698 }
2699 socket_unlock(so, 0);
2700 error = uiomove(mtod(m, caddr_t) + moff,
2701 (int)len, uio);
2702 socket_lock(so, 0);
2703
2704 if (error)
2705 goto release;
2706 }
2707 } else {
2708 uio_setresid(uio, (uio_resid(uio) - len));
2709 }
2710 if (len == m->m_len - moff) {
2711 if (m->m_flags & M_EOR)
2712 flags |= MSG_EOR;
2713 if (flags & MSG_PEEK) {
2714 m = m->m_next;
2715 moff = 0;
2716 } else {
2717 nextrecord = m->m_nextpkt;
2718 sbfree(&so->so_rcv, m);
2719 m->m_nextpkt = NULL;
2720
2721 /*
2722 * If this packet is an unordered packet
2723 * (indicated by M_UNORDERED_DATA flag), remove
2724 * the additional bytes added to the
2725 * receive socket buffer size.
2726 */
2727 if ((so->so_flags & SOF_ENABLE_MSGS) &&
2728 m->m_len &&
2729 (m->m_flags & M_UNORDERED_DATA) &&
2730 sbreserve(&so->so_rcv,
2731 so->so_rcv.sb_hiwat - m->m_len)) {
2732 if (so->so_msg_state->msg_uno_bytes >
2733 m->m_len) {
2734 so->so_msg_state->
2735 msg_uno_bytes -= m->m_len;
2736 } else {
2737 so->so_msg_state->
2738 msg_uno_bytes = 0;
2739 }
2740 m->m_flags &= ~M_UNORDERED_DATA;
2741 }
2742
2743 if (mp != NULL) {
2744 *mp = m;
2745 mp = &m->m_next;
2746 so->so_rcv.sb_mb = m = m->m_next;
2747 *mp = NULL;
2748 } else {
2749 if (free_list == NULL)
2750 free_list = m;
2751 else
2752 ml->m_next = m;
2753 ml = m;
2754 so->so_rcv.sb_mb = m = m->m_next;
2755 ml->m_next = NULL;
2756 }
2757 if (m != NULL) {
2758 m->m_nextpkt = nextrecord;
2759 if (nextrecord == NULL)
2760 so->so_rcv.sb_lastrecord = m;
2761 } else {
2762 so->so_rcv.sb_mb = nextrecord;
2763 SB_EMPTY_FIXUP(&so->so_rcv);
2764 }
2765 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
2766 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
2767 }
2768 } else {
2769 if (flags & MSG_PEEK) {
2770 moff += len;
2771 } else {
2772 if (mp != NULL) {
2773 int copy_flag;
2774
2775 if (flags & MSG_DONTWAIT)
2776 copy_flag = M_DONTWAIT;
2777 else
2778 copy_flag = M_WAIT;
2779 *mp = m_copym(m, 0, len, copy_flag);
2780 /*
2781 * Failed to allocate an mbuf?
2782 * Adjust uio_resid back, it was
2783 * adjusted down by len bytes which
2784 * we didn't copy over.
2785 */
2786 if (*mp == NULL) {
2787 uio_setresid(uio,
2788 (uio_resid(uio) + len));
2789 break;
2790 }
2791 }
2792 m->m_data += len;
2793 m->m_len -= len;
2794 so->so_rcv.sb_cc -= len;
2795 }
2796 }
2797 if (so->so_oobmark) {
2798 if ((flags & MSG_PEEK) == 0) {
2799 so->so_oobmark -= len;
2800 if (so->so_oobmark == 0) {
2801 so->so_state |= SS_RCVATMARK;
2802 /*
2803 * delay posting the actual event until
2804 * after any delayed copy processing
2805 * has finished
2806 */
2807 need_event = 1;
2808 break;
2809 }
2810 } else {
2811 offset += len;
2812 if (offset == so->so_oobmark)
2813 break;
2814 }
2815 }
2816 if (flags & MSG_EOR)
2817 break;
2818 /*
2819 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
2820 * (for non-atomic socket), we must not quit until
2821 * "uio->uio_resid == 0" or an error termination.
2822 * If a signal/timeout occurs, return with a short
2823 * count but without error. Keep sockbuf locked
2824 * against other readers.
2825 */
2826 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
2827 (uio_resid(uio) - delayed_copy_len) > 0 &&
2828 !sosendallatonce(so) && !nextrecord) {
2829 if (so->so_error || so->so_state & SS_CANTRCVMORE)
2830 goto release;
2831
2832 /*
2833 * Depending on the protocol (e.g. TCP), the following
2834 * might cause the socket lock to be dropped and later
2835 * be reacquired, and more data could have arrived and
2836 * have been appended to the receive socket buffer by
2837 * the time it returns. Therefore, we sleep in
2838 * sbwait() below only if the socket buffer is
2839 * empty, in order to avoid a false sleep.
2840 */
2841 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
2842 (((struct inpcb *)so->so_pcb)->inp_state !=
2843 INPCB_STATE_DEAD))
2844 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2845
2846 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
2847 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
2848
2849 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
2850 error = 0;
2851 goto release;
2852 }
2853 /*
2854 * have to wait until after we get back from the sbwait
2855 * to do the copy because we will drop the lock if we
2856 * have enough data that has been delayed... by dropping
2857 * the lock we open up a window allowing the netisr
2858 * thread to process the incoming packets and to change
2859 * the state of this socket... we're issuing the sbwait
2860 * because the socket is empty and we're expecting the
2861 * netisr thread to wake us up when more packets arrive;
2862 * if we allow that processing to happen and then sbwait
2863 * we could stall forever with packets sitting in the
2864 * socket if no further packets arrive from the remote
2865 * side.
2866 *
2867 * We want to copy before we've collected all the data
2868 * to satisfy this request, to allow the copy to overlap
2869 * the incoming packet processing on an MP system.
2870 */
2871 if (delayed_copy_len > sorecvmincopy &&
2872 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
2873 error = sodelayed_copy(so, uio,
2874 &free_list, &delayed_copy_len);
2875
2876 if (error)
2877 goto release;
2878 }
2879 m = so->so_rcv.sb_mb;
2880 if (m != NULL) {
2881 nextrecord = m->m_nextpkt;
2882 }
2883 SB_MB_CHECK(&so->so_rcv);
2884 }
2885 }
2886#ifdef MORE_LOCKING_DEBUG
2887 if (so->so_usecount <= 1) {
2888 panic("%s: after big while so=%p ref=%d on socket\n",
2889 __func__, so, so->so_usecount);
2890 /* NOTREACHED */
2891 }
2892#endif
2893
2894 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
2895 if (so->so_options & SO_DONTTRUNC) {
2896 flags |= MSG_RCVMORE;
2897 } else {
2898 flags |= MSG_TRUNC;
2899 if ((flags & MSG_PEEK) == 0)
2900 (void) sbdroprecord(&so->so_rcv);
2901 }
2902 }
2903
2904 /*
2905 * pru_rcvd below (for TCP) may cause more data to be received
2906 * if the socket lock is dropped prior to sending the ACK; some
2907 * legacy OpenTransport applications don't handle this well
2908 * (if it receives less data than requested while MSG_HAVEMORE
2909 * is set), and so we set the flag now based on what we know
2910 * prior to calling pru_rcvd.
2911 */
2912 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
2913 flags |= MSG_HAVEMORE;
2914
2915 if ((flags & MSG_PEEK) == 0) {
2916 if (m == NULL) {
2917 so->so_rcv.sb_mb = nextrecord;
2918 /*
2919 * First part is an inline SB_EMPTY_FIXUP(). Second
2920 * part makes sure sb_lastrecord is up-to-date if
2921 * there is still data in the socket buffer.
2922 */
2923 if (so->so_rcv.sb_mb == NULL) {
2924 so->so_rcv.sb_mbtail = NULL;
2925 so->so_rcv.sb_lastrecord = NULL;
2926 } else if (nextrecord->m_nextpkt == NULL) {
2927 so->so_rcv.sb_lastrecord = nextrecord;
2928 }
2929 SB_MB_CHECK(&so->so_rcv);
2930 }
2931 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
2932 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
2933 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
2934 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
2935 }
2936
2937 if (delayed_copy_len) {
2938 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2939 if (error)
2940 goto release;
2941 }
2942 if (free_list != NULL) {
2943 m_freem_list(free_list);
2944 free_list = NULL;
2945 }
2946 if (need_event)
2947 postevent(so, 0, EV_OOB);
2948
2949 if (orig_resid == uio_resid(uio) && orig_resid &&
2950 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
2951 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
2952 goto restart;
2953 }
2954
2955 if (flagsp != NULL)
2956 *flagsp |= flags;
2957release:
2958#ifdef MORE_LOCKING_DEBUG
2959 if (so->so_usecount <= 1) {
2960 panic("%s: release so=%p ref=%d on socket\n", __func__,
2961 so, so->so_usecount);
2962 /* NOTREACHED */
2963 }
2964#endif
2965 if (delayed_copy_len)
2966 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
2967
2968 if (free_list != NULL)
2969 m_freem_list(free_list);
2970
2971 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
2972
2973 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
2974 so->so_rcv.sb_cc, 0, error);
2975
2976 return (error);
2977}
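/*
 * Illustrative sketch (not from this file): the user-space side of the
 * MT_CONTROL handling above.  For AF_UNIX, soreceive() drops the socket
 * lock and calls dom_externalize to turn an SCM_RIGHTS message into a
 * descriptor in the receiver; recvmsg(2) then sees it as ancillary data.
 * "fd" is assumed to be a connected AF_UNIX stream socket.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

static int
recv_one_fd(int fd)
{
	char data;
	char cbuf[CMSG_SPACE(sizeof (int))];
	struct iovec iov = { &data, sizeof (data) };
	struct msghdr msg;
	struct cmsghdr *cmsg;
	int newfd = -1;

	memset(&msg, 0, sizeof (msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof (cbuf);

	if (recvmsg(fd, &msg, 0) == -1)
		return (-1);

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			memcpy(&newfd, CMSG_DATA(cmsg), sizeof (int));
			break;
		}
	}
	return (newfd);
}
#endif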
2978
2979/*
2980 * Returns: 0 Success
2981 * uiomove:EFAULT
2982 */
2983static int
2984sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
2985 user_ssize_t *resid)
2986{
2987 int error = 0;
2988 struct mbuf *m;
2989
2990 m = *free_list;
2991
2992 socket_unlock(so, 0);
2993
2994 while (m != NULL && error == 0) {
2995 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
2996 m = m->m_next;
2997 }
2998 m_freem_list(*free_list);
2999
3000 *free_list = NULL;
3001 *resid = 0;
3002
3003 socket_lock(so, 0);
3004
3005 return (error);
3006}
3007
3008/*
3009 * Returns: 0 Success
3010 * EINVAL
3011 * ENOTCONN
3012 * <pru_shutdown>:EINVAL
3013 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
3014 * <pru_shutdown>:ENOBUFS[TCP]
3015 * <pru_shutdown>:EMSGSIZE[TCP]
3016 * <pru_shutdown>:EHOSTUNREACH[TCP]
3017 * <pru_shutdown>:ENETUNREACH[TCP]
3018 * <pru_shutdown>:ENETDOWN[TCP]
3019 * <pru_shutdown>:ENOMEM[TCP]
3020 * <pru_shutdown>:EACCES[TCP]
3023 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
3024 * <pru_shutdown>:??? [other protocol families]
3025 */
3026int
3027soshutdown(struct socket *so, int how)
3028{
3029 int error;
3030
3031 switch (how) {
3032 case SHUT_RD:
3033 case SHUT_WR:
3034 case SHUT_RDWR:
3035 socket_lock(so, 1);
3036 if ((so->so_state &
3037 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
3038 error = ENOTCONN;
3039 } else {
3040 error = soshutdownlock(so, how);
3041 }
3042 socket_unlock(so, 1);
3043 break;
3044 default:
3045 error = EINVAL;
3046 break;
3047 }
3048
3049 return (error);
3050}
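/*
 * Illustrative sketch (not from this file): the user-space half-close
 * that reaches soshutdown() above.  SHUT_WR shuts down only the write
 * side (for TCP this sends a FIN via pru_shutdown), leaving the read
 * side open so any data still in flight from the peer can be drained.
 */
#if 0
#include <sys/socket.h>
#include <unistd.h>

static void
half_close_and_drain(int fd)
{
	char buf[512];

	(void) shutdown(fd, SHUT_WR);		/* write side only */
	while (read(fd, buf, sizeof (buf)) > 0)
		;				/* drain until peer closes */
	(void) close(fd);
}
#endif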
3051
3052int
3053soshutdownlock(struct socket *so, int how)
3054{
3055 struct protosw *pr = so->so_proto;
3056 int error = 0;
3057
3058 sflt_notify(so, sock_evt_shutdown, &how);
3059
3060 if (how != SHUT_WR) {
3061 if ((so->so_state & SS_CANTRCVMORE) != 0) {
3062 /* read already shut down */
3063 error = ENOTCONN;
3064 goto done;
3065 }
3066 sorflush(so);
3067 postevent(so, 0, EV_RCLOSED);
3068 }
3069 if (how != SHUT_RD) {
3070 if ((so->so_state & SS_CANTSENDMORE) != 0) {
3071 /* write already shut down */
3072 error = ENOTCONN;
3073 goto done;
3074 }
3075 error = (*pr->pr_usrreqs->pru_shutdown)(so);
3076 postevent(so, 0, EV_WCLOSED);
3077 }
3078done:
3079 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, 0, 0, 0, 0, 0);
3080 return (error);
3081}
3082
3083void
3084sowflush(struct socket *so)
3085{
3086 struct sockbuf *sb = &so->so_snd;
3087#ifdef notyet
3088 lck_mtx_t *mutex_held;
3089 /*
3090 * XXX: This code is currently commented out, because we may get here
3091 * as part of sofreelastref(), and at that time, pr_getlock() may no
3092 * longer be able to return us the lock; this will be fixed in future.
3093 */
3094 if (so->so_proto->pr_getlock != NULL)
3095 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3096 else
3097 mutex_held = so->so_proto->pr_domain->dom_mtx;
3098
3099 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3100#endif /* notyet */
3101
3102 /*
3103 * Obtain lock on the socket buffer (SB_LOCK). This is required
3104 * to prevent the socket buffer from being unexpectedly altered
3105 * while it is used by another thread in socket send/receive.
3106 *
3107 * sblock() must not fail here, hence the assertion.
3108 */
3109 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
3110 VERIFY(sb->sb_flags & SB_LOCK);
3111
3112 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
3113 sb->sb_flags |= SB_DROP;
3114 sb->sb_upcall = NULL;
3115 sb->sb_upcallarg = NULL;
3116
3117 sbunlock(sb, TRUE); /* keep socket locked */
3118
3119 selthreadclear(&sb->sb_sel);
3120 sbrelease(sb);
3121}
3122
3123void
3124sorflush(struct socket *so)
3125{
3126 struct sockbuf *sb = &so->so_rcv;
3127 struct protosw *pr = so->so_proto;
3128 struct sockbuf asb;
3129#ifdef notyet
3130 lck_mtx_t *mutex_held;
3131 /*
3132 * XXX: This code is currently commented out, because we may get here
3133 * as part of sofreelastref(), and at that time, pr_getlock() may no
3134 * longer be able to return us the lock; this will be fixed in future.
3135 */
3136 if (so->so_proto->pr_getlock != NULL)
3137 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3138 else
3139 mutex_held = so->so_proto->pr_domain->dom_mtx;
3140
3141 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3142#endif /* notyet */
3143
3144 sflt_notify(so, sock_evt_flush_read, NULL);
3145
3146 socantrcvmore(so);
3147
3148 /*
3149 * Obtain lock on the socket buffer (SB_LOCK). This is required
3150 * to prevent the socket buffer from being unexpectedly altered
3151 * while it is used by another thread in socket send/receive.
3152 *
3153 * sblock() must not fail here, hence the assertion.
3154 */
3155 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
3156 VERIFY(sb->sb_flags & SB_LOCK);
3157
3158 /*
3159 * Copy only the relevant fields from "sb" to "asb" which we
3160 * need for sbrelease() to function. In particular, skip
3161 * sb_sel as it contains the wait queue linkage, which would
3162 * wreak havoc if we were to issue selthreadclear() on "asb".
3163 * Make sure to not carry over SB_LOCK in "asb", as we need
3164 * to acquire it later as part of sbrelease().
3165 */
3166 bzero(&asb, sizeof (asb));
3167 asb.sb_cc = sb->sb_cc;
3168 asb.sb_hiwat = sb->sb_hiwat;
3169 asb.sb_mbcnt = sb->sb_mbcnt;
3170 asb.sb_mbmax = sb->sb_mbmax;
3171 asb.sb_ctl = sb->sb_ctl;
3172 asb.sb_lowat = sb->sb_lowat;
3173 asb.sb_mb = sb->sb_mb;
3174 asb.sb_mbtail = sb->sb_mbtail;
3175 asb.sb_lastrecord = sb->sb_lastrecord;
3176 asb.sb_so = sb->sb_so;
3177 asb.sb_flags = sb->sb_flags;
3178 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
3179 asb.sb_flags |= SB_DROP;
3180
3181 /*
3182 * Ideally we'd bzero() these and preserve the ones we need;
3183 * but to do that we'd need to shuffle things around in the
3184 * sockbuf, and we can't do it now because there are KEXTS
3185 * that are directly referring to the socket structure.
3186 *
3187 * Setting SB_DROP acts as a barrier to prevent further appends.
3188 * Clearing SB_SEL is done for selthreadclear() below.
3189 */
3190 sb->sb_cc = 0;
3191 sb->sb_hiwat = 0;
3192 sb->sb_mbcnt = 0;
3193 sb->sb_mbmax = 0;
3194 sb->sb_ctl = 0;
3195 sb->sb_lowat = 0;
3196 sb->sb_mb = NULL;
3197 sb->sb_mbtail = NULL;
3198 sb->sb_lastrecord = NULL;
3199 sb->sb_timeo.tv_sec = 0;
3200 sb->sb_timeo.tv_usec = 0;
3201 sb->sb_upcall = NULL;
3202 sb->sb_upcallarg = NULL;
3203 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
3204 sb->sb_flags |= SB_DROP;
3205
3206 sbunlock(sb, TRUE); /* keep socket locked */
3207
3208 /*
3209 * Note that selthreadclear() is called on the original "sb" and
3210 * not the local "asb" because of the way wait queue linkage is
3211 * implemented. Given that selwakeup() may be triggered, SB_SEL
3212 * should no longer be set (cleared above.)
3213 */
3214 selthreadclear(&sb->sb_sel);
3215
3216 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
3217 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
3218
3219 sbrelease(&asb);
3220}
3221
3222/*
3223 * Perhaps this routine, and sooptcopyout(), below, ought to come in
3224 * an additional variant to handle the case where the option value needs
3225 * to be some kind of integer, but not a specific size.
3226 * In addition to their use here, these functions are also called by the
3227 * protocol-level pr_ctloutput() routines.
3228 *
3229 * Returns: 0 Success
3230 * EINVAL
3231 * copyin:EFAULT
3232 */
3233int
3234sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
3235{
3236 size_t valsize;
3237
3238 /*
3239 * If the user gives us more than we wanted, we ignore it,
3240 * but if we don't get the minimum length the caller
3241 * wants, we return EINVAL. On success, sopt->sopt_valsize
3242 * is set to however much we actually retrieved.
3243 */
3244 if ((valsize = sopt->sopt_valsize) < minlen)
3245 return (EINVAL);
3246 if (valsize > len)
3247 sopt->sopt_valsize = valsize = len;
3248
3249 if (sopt->sopt_p != kernproc)
3250 return (copyin(sopt->sopt_val, buf, valsize));
3251
3252 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
3253 return (0);
3254}
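/*
 * Illustrative sketch (not from this file): how a protocol-level
 * pr_ctloutput() routine typically consumes an option value with
 * sooptcopyin(), per the comment above.  The option MYPROTO_OPT_FOO
 * and the destination field are hypothetical; only the copy-in and
 * validation pattern is taken from this file.
 */
#if 0
static int
myproto_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
#pragma unused(so)
	int error, optval;

	switch (sopt->sopt_name) {
	case MYPROTO_OPT_FOO:			/* hypothetical option */
		error = sooptcopyin(sopt, &optval, sizeof (optval),
		    sizeof (optval));
		if (error != 0)
			break;
		if (optval < 0) {		/* validate before use */
			error = EINVAL;
			break;
		}
		/* ... store optval in the protocol control block ... */
		break;
	default:
		error = ENOPROTOOPT;
		break;
	}
	return (error);
}
#endif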
3255
3256/*
3257 * sooptcopyin_timeval
3258 * Copy in a timeval value into tv_p, taking into account whether the
3259 * calling process is 64-bit or 32-bit. Moved the sanity checking
3260 * code here so that we can verify the 64-bit tv_sec value before we lose
3261 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
3262 */
3263static int
3264sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
3265{
3266 int error;
3267
3268 if (proc_is64bit(sopt->sopt_p)) {
3269 struct user64_timeval tv64;
3270
3271 if (sopt->sopt_valsize < sizeof (tv64))
3272 return (EINVAL);
3273
3274 sopt->sopt_valsize = sizeof (tv64);
3275 if (sopt->sopt_p != kernproc) {
3276 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
3277 if (error != 0)
3278 return (error);
3279 } else {
3280 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
3281 sizeof (tv64));
3282 }
3283 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
3284 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
3285 return (EDOM);
3286
3287 tv_p->tv_sec = tv64.tv_sec;
3288 tv_p->tv_usec = tv64.tv_usec;
3289 } else {
3290 struct user32_timeval tv32;
3291
3292 if (sopt->sopt_valsize < sizeof (tv32))
3293 return (EINVAL);
3294
3295 sopt->sopt_valsize = sizeof (tv32);
3296 if (sopt->sopt_p != kernproc) {
3297 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
3298 if (error != 0) {
3299 return (error);
3300 }
3301 } else {
3302 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
3303 sizeof (tv32));
3304 }
3305#ifndef __LP64__
3306 /*
3307 * K64todo "comparison is always false due to
3308 * limited range of data type"
3309 */
3310 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
3311 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
3312 return (EDOM);
3313#endif
3314 tv_p->tv_sec = tv32.tv_sec;
3315 tv_p->tv_usec = tv32.tv_usec;
3316 }
3317 return (0);
3318}
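/*
 * Illustrative sketch (not from this file): the user-space side of the
 * timeval copy-in above.  setsockopt(SO_RCVTIMEO) passes a native
 * struct timeval; sooptcopyin_timeval() selects the user32/user64
 * layout from the calling process and rejects out-of-range fields
 * with EDOM.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int
set_recv_timeout(int fd, long seconds)
{
	struct timeval tv;

	tv.tv_sec = seconds;
	tv.tv_usec = 0;		/* must be in [0, 1000000) */

	return (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)));
}
#endif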
3319
3320/*
3321 * Returns: 0 Success
3322 * EINVAL
3323 * ENOPROTOOPT
3324 * ENOBUFS
3325 * EDOM
3326 * sooptcopyin:EINVAL
3327 * sooptcopyin:EFAULT
3328 * sooptcopyin_timeval:EINVAL
3329 * sooptcopyin_timeval:EFAULT
3330 * sooptcopyin_timeval:EDOM
3331 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3332 * <pr_ctloutput>:???
3333 * sflt_attach_private:??? [whatever a filter author chooses]
3334 * <sf_setoption>:??? [whatever a filter author chooses]
3335 *
3336 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
3337 * <sf_setoption> returns depend on what the filter author causes
3338 * their filter to return.
3339 */
3340int
3341sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
3342{
3343 int error, optval;
3344 struct linger l;
3345 struct timeval tv;
3346#if CONFIG_MACF_SOCKET
3347 struct mac extmac;
3348#endif /* MAC_SOCKET */
3349
3350 if (sopt->sopt_dir != SOPT_SET)
3351 sopt->sopt_dir = SOPT_SET;
3352
3353 if (dolock)
3354 socket_lock(so, 1);
3355
3356 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
3357 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
3358 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
3359 /* the socket has been shutdown, no more sockopt's */
3360 error = EINVAL;
3361 goto out;
3362 }
3363
3364 error = sflt_setsockopt(so, sopt);
3365 if (error != 0) {
3366 if (error == EJUSTRETURN)
3367 error = 0;
3368 goto out;
3369 }
3370
3371 if (sopt->sopt_level != SOL_SOCKET) {
3372 if (so->so_proto != NULL &&
3373 so->so_proto->pr_ctloutput != NULL) {
3374 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3375 goto out;
3376 }
3377 error = ENOPROTOOPT;
3378 } else {
3379 /*
3380 * Allow socket-level (SOL_SOCKET) options to be filtered by
3381 * the protocol layer, if needed. A zero value returned from
3382 * the handler means use default socket-level processing as
3383 * done by the rest of this routine. Otherwise, any other
3384 * return value indicates that the option is unsupported.
3385 */
3386 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
3387 pru_socheckopt(so, sopt)) != 0)
3388 goto out;
3389
3390 error = 0;
3391 switch (sopt->sopt_name) {
3392 case SO_LINGER:
3393 case SO_LINGER_SEC:
3394 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
3395 if (error != 0)
3396 goto out;
3397
3398 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
3399 l.l_linger : l.l_linger * hz;
3400 if (l.l_onoff != 0)
3401 so->so_options |= SO_LINGER;
3402 else
3403 so->so_options &= ~SO_LINGER;
3404 break;
3405
3406 case SO_DEBUG:
3407 case SO_KEEPALIVE:
3408 case SO_DONTROUTE:
3409 case SO_USELOOPBACK:
3410 case SO_BROADCAST:
3411 case SO_REUSEADDR:
3412 case SO_REUSEPORT:
3413 case SO_OOBINLINE:
3414 case SO_TIMESTAMP:
3415 case SO_TIMESTAMP_MONOTONIC:
3416 case SO_DONTTRUNC:
3417 case SO_WANTMORE:
3418 case SO_WANTOOBFLAG:
3419 error = sooptcopyin(sopt, &optval, sizeof (optval),
3420 sizeof (optval));
3421 if (error != 0)
3422 goto out;
3423 if (optval)
3424 so->so_options |= sopt->sopt_name;
3425 else
3426 so->so_options &= ~sopt->sopt_name;
3427 break;
3428
3429 case SO_SNDBUF:
3430 case SO_RCVBUF:
3431 case SO_SNDLOWAT:
3432 case SO_RCVLOWAT:
3433 error = sooptcopyin(sopt, &optval, sizeof (optval),
3434 sizeof (optval));
3435 if (error != 0)
3436 goto out;
3437
3438 /*
3439 * Values < 1 make no sense for any of these
3440 * options, so disallow them.
3441 */
3442 if (optval < 1) {
3443 error = EINVAL;
3444 goto out;
3445 }
3446
3447 switch (sopt->sopt_name) {
3448 case SO_SNDBUF:
3449 case SO_RCVBUF: {
3450 struct sockbuf *sb =
3451 (sopt->sopt_name == SO_SNDBUF) ?
3452 &so->so_snd : &so->so_rcv;
3453 if (sbreserve(sb, (u_int32_t)optval) == 0) {
3454 error = ENOBUFS;
3455 goto out;
3456 }
3457 sb->sb_flags |= SB_USRSIZE;
3458 sb->sb_flags &= ~SB_AUTOSIZE;
3459 sb->sb_idealsize = (u_int32_t)optval;
3460 break;
3461 }
3462 /*
3463 * Make sure the low-water is never greater than
3464 * the high-water.
3465 */
3466 case SO_SNDLOWAT:
3467 so->so_snd.sb_lowat =
3468 (optval > so->so_snd.sb_hiwat) ?
3469 so->so_snd.sb_hiwat : optval;
3470 break;
3471 case SO_RCVLOWAT:
3472 so->so_rcv.sb_lowat =
3473 (optval > so->so_rcv.sb_hiwat) ?
3474 so->so_rcv.sb_hiwat : optval;
3475 break;
3476 }
3477 break;
3478
3479 case SO_SNDTIMEO:
3480 case SO_RCVTIMEO:
3481 error = sooptcopyin_timeval(sopt, &tv);
3482 if (error != 0)
3483 goto out;
3484
3485 switch (sopt->sopt_name) {
3486 case SO_SNDTIMEO:
3487 so->so_snd.sb_timeo = tv;
3488 break;
3489 case SO_RCVTIMEO:
3490 so->so_rcv.sb_timeo = tv;
3491 break;
3492 }
3493 break;
3494
3495 case SO_NKE: {
3496 struct so_nke nke;
3497
3498 error = sooptcopyin(sopt, &nke, sizeof (nke),
3499 sizeof (nke));
3500 if (error != 0)
3501 goto out;
3502
3503 error = sflt_attach_internal(so, nke.nke_handle);
3504 break;
3505 }
3506
3507 case SO_NOSIGPIPE:
3508 error = sooptcopyin(sopt, &optval, sizeof (optval),
3509 sizeof (optval));
3510 if (error != 0)
3511 goto out;
3512 if (optval != 0)
3513 so->so_flags |= SOF_NOSIGPIPE;
3514 else
3515 so->so_flags &= ~SOF_NOSIGPIPE;
3516 break;
3517
3518 case SO_NOADDRERR:
3519 error = sooptcopyin(sopt, &optval, sizeof (optval),
3520 sizeof (optval));
3521 if (error != 0)
3522 goto out;
3523 if (optval != 0)
3524 so->so_flags |= SOF_NOADDRAVAIL;
3525 else
3526 so->so_flags &= ~SOF_NOADDRAVAIL;
3527 break;
3528
3529 case SO_REUSESHAREUID:
3530 error = sooptcopyin(sopt, &optval, sizeof (optval),
3531 sizeof (optval));
3532 if (error != 0)
3533 goto out;
3534 if (optval != 0)
3535 so->so_flags |= SOF_REUSESHAREUID;
3536 else
3537 so->so_flags &= ~SOF_REUSESHAREUID;
3538 break;
3539
3540 case SO_NOTIFYCONFLICT:
3541 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
3542 error = EPERM;
3543 goto out;
3544 }
3545 error = sooptcopyin(sopt, &optval, sizeof (optval),
3546 sizeof (optval));
3547 if (error != 0)
3548 goto out;
3549 if (optval != 0)
3550 so->so_flags |= SOF_NOTIFYCONFLICT;
3551 else
3552 so->so_flags &= ~SOF_NOTIFYCONFLICT;
3553 break;
3554
3555 case SO_RESTRICTIONS:
3556 error = sooptcopyin(sopt, &optval, sizeof (optval),
3557 sizeof (optval));
3558 if (error != 0)
3559 goto out;
3560
3561 error = so_set_restrictions(so, optval);
3562 break;
3563
3564 case SO_LABEL:
3565#if CONFIG_MACF_SOCKET
3566 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
3567 sizeof (extmac))) != 0)
3568 goto out;
3569
3570 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
3571 so, &extmac);
3572#else
3573 error = EOPNOTSUPP;
3574#endif /* MAC_SOCKET */
3575 break;
3576
3577 case SO_UPCALLCLOSEWAIT:
3578 error = sooptcopyin(sopt, &optval, sizeof (optval),
3579 sizeof (optval));
3580 if (error != 0)
3581 goto out;
3582 if (optval != 0)
3583 so->so_flags |= SOF_UPCALLCLOSEWAIT;
3584 else
3585 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
3586 break;
3587
3588 case SO_RANDOMPORT:
3589 error = sooptcopyin(sopt, &optval, sizeof (optval),
3590 sizeof (optval));
3591 if (error != 0)
3592 goto out;
3593 if (optval != 0)
3594 so->so_flags |= SOF_BINDRANDOMPORT;
3595 else
3596 so->so_flags &= ~SOF_BINDRANDOMPORT;
3597 break;
3598
3599 case SO_NP_EXTENSIONS: {
3600 struct so_np_extensions sonpx;
3601
3602 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
3603 sizeof (sonpx));
3604 if (error != 0)
3605 goto out;
3606 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
3607 error = EINVAL;
3608 goto out;
3609 }
3610 /*
3611 * Only one bit defined for now
3612 */
3613 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
3614 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
3615 so->so_flags |= SOF_NPX_SETOPTSHUT;
3616 else
3617 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
3618 }
3619 break;
3620 }
3621
3622 case SO_TRAFFIC_CLASS: {
3623 error = sooptcopyin(sopt, &optval, sizeof (optval),
3624 sizeof (optval));
3625 if (error != 0)
3626 goto out;
3627 error = so_set_traffic_class(so, optval);
3628 if (error != 0)
3629 goto out;
3630 break;
3631 }
3632
3633 case SO_RECV_TRAFFIC_CLASS: {
3634 error = sooptcopyin(sopt, &optval, sizeof (optval),
3635 sizeof (optval));
3636 if (error != 0)
3637 goto out;
3638 if (optval == 0)
3639 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
3640 else
3641 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
3642 break;
3643 }
3644
3645 case SO_TRAFFIC_CLASS_DBG: {
3646 struct so_tcdbg so_tcdbg;
3647
3648 error = sooptcopyin(sopt, &so_tcdbg,
3649 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
3650 if (error != 0)
3651 goto out;
3652 error = so_set_tcdbg(so, &so_tcdbg);
3653 if (error != 0)
3654 goto out;
3655 break;
3656 }
3657
3658 case SO_PRIVILEGED_TRAFFIC_CLASS:
3659 error = priv_check_cred(kauth_cred_get(),
3660 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
3661 if (error != 0)
3662 goto out;
3663 error = sooptcopyin(sopt, &optval, sizeof (optval),
3664 sizeof (optval));
3665 if (error != 0)
3666 goto out;
3667 if (optval == 0)
3668 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
3669 else
3670 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
3671 break;
3672
3673 case SO_DEFUNCTOK:
3674 error = sooptcopyin(sopt, &optval, sizeof (optval),
3675 sizeof (optval));
3676 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
3677 if (error == 0)
3678 error = EBADF;
3679 goto out;
3680 }
3681 /*
3682 * Any process can set SO_DEFUNCTOK (clear
3683 * SOF_NODEFUNCT), but only root can clear
3684 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
3685 */
3686 if (optval == 0 &&
3687 kauth_cred_issuser(kauth_cred_get()) == 0) {
3688 error = EPERM;
3689 goto out;
3690 }
3691 if (optval)
3692 so->so_flags &= ~SOF_NODEFUNCT;
3693 else
3694 so->so_flags |= SOF_NODEFUNCT;
3695
3696 if (SOCK_DOM(so) == PF_INET ||
3697 SOCK_DOM(so) == PF_INET6) {
3698 char s[MAX_IPv6_STR_LEN];
3699 char d[MAX_IPv6_STR_LEN];
3700 struct inpcb *inp = sotoinpcb(so);
3701
3702 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
3703 "%s:%d] is now marked as %seligible for "
3704 "defunct\n", __func__, proc_selfpid(),
3705 (uint64_t)VM_KERNEL_ADDRPERM(so),
3706 (SOCK_TYPE(so) == SOCK_STREAM) ?
3707 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
3708 ((SOCK_DOM(so) == PF_INET) ?
3709 (void *)&inp->inp_laddr.s_addr :
3710 (void *)&inp->in6p_laddr), s, sizeof (s)),
3711 ntohs(inp->in6p_lport),
3712 inet_ntop(SOCK_DOM(so),
3713 (SOCK_DOM(so) == PF_INET) ?
3714 (void *)&inp->inp_faddr.s_addr :
3715 (void *)&inp->in6p_faddr, d, sizeof (d)),
3716 ntohs(inp->in6p_fport),
3717 (so->so_flags & SOF_NODEFUNCT) ?
3718 "not " : ""));
3719 } else {
3720 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
3721 "now marked as %seligible for defunct\n",
3722 __func__, proc_selfpid(),
3723 (uint64_t)VM_KERNEL_ADDRPERM(so),
3724 SOCK_DOM(so), SOCK_TYPE(so),
3725 (so->so_flags & SOF_NODEFUNCT) ?
3726 "not " : ""));
3727 }
3728 break;
3729
3730 case SO_ISDEFUNCT:
3731 /* This option is not settable */
3732 error = EINVAL;
3733 break;
3734
3735 case SO_OPPORTUNISTIC:
3736 error = sooptcopyin(sopt, &optval, sizeof (optval),
3737 sizeof (optval));
3738 if (error == 0)
3739 error = so_set_opportunistic(so, optval);
3740 break;
3741
3742 case SO_FLUSH:
3743 /* This option is handled by lower layer(s) */
3744 error = 0;
3745 break;
3746
3747 case SO_RECV_ANYIF:
3748 error = sooptcopyin(sopt, &optval, sizeof (optval),
3749 sizeof (optval));
3750 if (error == 0)
3751 error = so_set_recv_anyif(so, optval);
3752 break;
3753
3754 case SO_TRAFFIC_MGT_BACKGROUND: {
3755 /* This option is handled by lower layer(s) */
3756 error = 0;
3757 break;
3758 }
3759
3760#if FLOW_DIVERT
3761 case SO_FLOW_DIVERT_TOKEN:
3762 error = flow_divert_token_set(so, sopt);
3763 break;
3764#endif /* FLOW_DIVERT */
3765
3766
3767 case SO_DELEGATED:
3768 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
3769 sizeof (optval))) != 0)
3770 break;
3771
3772 error = so_set_effective_pid(so, optval, sopt->sopt_p);
3773 break;
3774
3775 case SO_DELEGATED_UUID: {
3776 uuid_t euuid;
3777
3778 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
3779 sizeof (euuid))) != 0)
3780 break;
3781
3782 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
3783 break;
3784 }
3785
3786 default:
3787 error = ENOPROTOOPT;
3788 break;
3789 }
3790 if (error == 0 && so->so_proto != NULL &&
3791 so->so_proto->pr_ctloutput != NULL) {
3792 (void) so->so_proto->pr_ctloutput(so, sopt);
3793 }
3794 }
3795out:
3796 if (dolock)
3797 socket_unlock(so, 1);
3798 return (error);
3799}
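/*
 * Illustrative sketch (not from this file): user-space callers of two
 * SOL_SOCKET options handled above.  SO_LINGER_SEC takes its interval
 * in seconds (sosetoptlock() scales it by hz), and SO_NOSIGPIPE sets
 * SOF_NOSIGPIPE so writes on a dead connection fail with EPIPE instead
 * of raising SIGPIPE.
 */
#if 0
#include <sys/socket.h>

static int
configure_socket(int fd)
{
	struct linger l = { 1, 5 };	/* linger on close for 5 seconds */
	int one = 1;

	if (setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof (l)) == -1)
		return (-1);
	if (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &one, sizeof (one)) == -1)
		return (-1);
	return (0);
}
#endif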
3800
3801/* Helper routines for getsockopt */
3802int
3803sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
3804{
3805 int error;
3806 size_t valsize;
3807
3808 error = 0;
3809
3810 /*
3811 * Documented get behavior is that we always return a value,
3812 * possibly truncated to fit in the user's buffer.
3813 * Traditional behavior is that we always tell the user
3814 * precisely how much we copied, rather than something useful
3815 * like the total amount we had available for her.
3816 * Note that this interface is not idempotent; the entire answer must
3817 * be generated ahead of time.
3818 */
3819 valsize = min(len, sopt->sopt_valsize);
3820 sopt->sopt_valsize = valsize;
3821 if (sopt->sopt_val != USER_ADDR_NULL) {
3822 if (sopt->sopt_p != kernproc)
3823 error = copyout(buf, sopt->sopt_val, valsize);
3824 else
3825 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3826 }
3827 return (error);
3828}
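/*
 * Illustrative sketch (not from this file): a user-space getsockopt()
 * whose result comes back through sooptcopyout() above.  SO_NREAD
 * (handled in sogetoptlock() below) reports the bytes available to
 * read; as documented above, the value is truncated to fit the
 * caller's buffer.
 */
#if 0
#include <sys/socket.h>

static int
bytes_readable(int fd)
{
	int nread = 0;
	socklen_t len = sizeof (nread);

	if (getsockopt(fd, SOL_SOCKET, SO_NREAD, &nread, &len) == -1)
		return (-1);
	return (nread);
}
#endif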
3829
3830static int
3831sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
3832{
3833 int error;
3834 size_t len;
3835 struct user64_timeval tv64;
3836 struct user32_timeval tv32;
3837 const void * val;
3838 size_t valsize;
3839
3840 error = 0;
3841 if (proc_is64bit(sopt->sopt_p)) {
3842 len = sizeof (tv64);
3843 tv64.tv_sec = tv_p->tv_sec;
3844 tv64.tv_usec = tv_p->tv_usec;
3845 val = &tv64;
3846 } else {
3847 len = sizeof (tv32);
3848 tv32.tv_sec = tv_p->tv_sec;
3849 tv32.tv_usec = tv_p->tv_usec;
3850 val = &tv32;
3851 }
3852 valsize = min(len, sopt->sopt_valsize);
3853 sopt->sopt_valsize = valsize;
3854 if (sopt->sopt_val != USER_ADDR_NULL) {
3855 if (sopt->sopt_p != kernproc)
3856 error = copyout(val, sopt->sopt_val, valsize);
3857 else
3858 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
3859 }
3860 return (error);
3861}
3862
3863/*
3864 * Return: 0 Success
3865 * ENOPROTOOPT
3866 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
3867 * <pr_ctloutput>:???
3868 * <sf_getoption>:???
3869 */
3870int
3871sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
3872{
3873 int error, optval;
3874 struct linger l;
3875 struct timeval tv;
3876#if CONFIG_MACF_SOCKET
3877 struct mac extmac;
3878#endif /* MAC_SOCKET */
3879
3880 if (sopt->sopt_dir != SOPT_GET)
3881 sopt->sopt_dir = SOPT_GET;
3882
3883 if (dolock)
3884 socket_lock(so, 1);
3885
3886 error = sflt_getsockopt(so, sopt);
3887 if (error != 0) {
3888 if (error == EJUSTRETURN)
3889 error = 0;
3890 goto out;
3891 }
3892
3893 if (sopt->sopt_level != SOL_SOCKET) {
3894 if (so->so_proto != NULL &&
3895 so->so_proto->pr_ctloutput != NULL) {
3896 error = (*so->so_proto->pr_ctloutput)(so, sopt);
3897 goto out;
3898 }
3899 error = ENOPROTOOPT;
3900 } else {
3901 /*
3902 * Allow socket-level (SOL_SOCKET) options to be filtered by
3903 * the protocol layer, if needed. A zero value returned from
3904 * the handler means use default socket-level processing as
3905 * done by the rest of this routine. Otherwise, any other
3906 * return value indicates that the option is unsupported.
3907 */
3908 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
3909 pru_socheckopt(so, sopt)) != 0)
3910 goto out;
3911
3912 error = 0;
3913 switch (sopt->sopt_name) {
3914 case SO_LINGER:
3915 case SO_LINGER_SEC:
3916 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
3917 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
3918 so->so_linger : so->so_linger / hz;
3919 error = sooptcopyout(sopt, &l, sizeof (l));
3920 break;
3921
3922 case SO_USELOOPBACK:
3923 case SO_DONTROUTE:
3924 case SO_DEBUG:
3925 case SO_KEEPALIVE:
3926 case SO_REUSEADDR:
3927 case SO_REUSEPORT:
3928 case SO_BROADCAST:
3929 case SO_OOBINLINE:
3930 case SO_TIMESTAMP:
3931 case SO_TIMESTAMP_MONOTONIC:
3932 case SO_DONTTRUNC:
3933 case SO_WANTMORE:
3934 case SO_WANTOOBFLAG:
3935 optval = so->so_options & sopt->sopt_name;
3936integer:
3937 error = sooptcopyout(sopt, &optval, sizeof (optval));
3938 break;
3939
3940 case SO_TYPE:
3941 optval = so->so_type;
3942 goto integer;
3943
3944 case SO_NREAD:
3945 if (so->so_proto->pr_flags & PR_ATOMIC) {
3946 int pkt_total;
3947 struct mbuf *m1;
3948
3949 pkt_total = 0;
3950 m1 = so->so_rcv.sb_mb;
3951 while (m1 != NULL) {
3952 if (m1->m_type == MT_DATA ||
3953 m1->m_type == MT_HEADER ||
3954 m1->m_type == MT_OOBDATA)
3955 pkt_total += m1->m_len;
3956 m1 = m1->m_next;
3957 }
3958 optval = pkt_total;
3959 } else {
3960 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
3961 }
3962 goto integer;
3963
3964 case SO_NWRITE:
3965 optval = so->so_snd.sb_cc;
3966 goto integer;
3967
3968 case SO_ERROR:
3969 optval = so->so_error;
3970 so->so_error = 0;
3971 goto integer;
3972
3973 case SO_SNDBUF:
3974 optval = so->so_snd.sb_hiwat;
3975 goto integer;
3976
3977 case SO_RCVBUF:
3978 optval = so->so_rcv.sb_hiwat;
3979 goto integer;
3980
3981 case SO_SNDLOWAT:
3982 optval = so->so_snd.sb_lowat;
3983 goto integer;
3984
3985 case SO_RCVLOWAT:
3986 optval = so->so_rcv.sb_lowat;
3987 goto integer;
3988
3989 case SO_SNDTIMEO:
3990 case SO_RCVTIMEO:
3991 tv = (sopt->sopt_name == SO_SNDTIMEO ?
3992 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
3993
3994 error = sooptcopyout_timeval(sopt, &tv);
3995 break;
3996
3997 case SO_NOSIGPIPE:
3998 optval = (so->so_flags & SOF_NOSIGPIPE);
3999 goto integer;
4000
4001 case SO_NOADDRERR:
4002 optval = (so->so_flags & SOF_NOADDRAVAIL);
4003 goto integer;
4004
4005 case SO_REUSESHAREUID:
4006 optval = (so->so_flags & SOF_REUSESHAREUID);
4007 goto integer;
4008
4009
4010 case SO_NOTIFYCONFLICT:
4011 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
4012 goto integer;
4013
4014 case SO_RESTRICTIONS:
4015 optval = so_get_restrictions(so);
4016 goto integer;
4017
4018 case SO_LABEL:
4019#if CONFIG_MACF_SOCKET
4020 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4021 sizeof (extmac))) != 0 ||
4022 (error = mac_socket_label_get(proc_ucred(
4023 sopt->sopt_p), so, &extmac)) != 0)
4024 break;
4025
4026 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
4027#else
4028 error = EOPNOTSUPP;
4029#endif /* CONFIG_MACF_SOCKET */
4030 break;
4031
4032 case SO_PEERLABEL:
4033#if CONFIG_MACF_SOCKET
4034 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4035 sizeof (extmac))) != 0 ||
4036 (error = mac_socketpeer_label_get(proc_ucred(
4037 sopt->sopt_p), so, &extmac)) != 0)
4038 break;
4039
4040 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
4041#else
4042 error = EOPNOTSUPP;
4043#endif /* CONFIG_MACF_SOCKET */
4044 break;
4045
4046#ifdef __APPLE_API_PRIVATE
4047 case SO_UPCALLCLOSEWAIT:
4048 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
4049 goto integer;
4050#endif
4051 case SO_RANDOMPORT:
4052 optval = (so->so_flags & SOF_BINDRANDOMPORT);
4053 goto integer;
4054
4055 case SO_NP_EXTENSIONS: {
4056 struct so_np_extensions sonpx;
4057
4058 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
4059 SONPX_SETOPTSHUT : 0;
4060 sonpx.npx_mask = SONPX_MASK_VALID;
4061
4062 error = sooptcopyout(sopt, &sonpx,
4063 sizeof (struct so_np_extensions));
4064 break;
4065 }
4066
4067 case SO_TRAFFIC_CLASS:
4068 optval = so->so_traffic_class;
4069 goto integer;
4070
4071 case SO_RECV_TRAFFIC_CLASS:
4072 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
4073 goto integer;
4074
4075 case SO_TRAFFIC_CLASS_STATS:
4076 error = sooptcopyout(sopt, &so->so_tc_stats,
4077 sizeof (so->so_tc_stats));
4078 break;
4079
4080 case SO_TRAFFIC_CLASS_DBG:
4081 error = sogetopt_tcdbg(so, sopt);
4082 break;
4083
4084 case SO_PRIVILEGED_TRAFFIC_CLASS:
4085 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
4086 goto integer;
4087
4088 case SO_DEFUNCTOK:
4089 optval = !(so->so_flags & SOF_NODEFUNCT);
4090 goto integer;
4091
4092 case SO_ISDEFUNCT:
4093 optval = (so->so_flags & SOF_DEFUNCT);
4094 goto integer;
4095
4096 case SO_OPPORTUNISTIC:
4097 optval = so_get_opportunistic(so);
4098 goto integer;
4099
4100 case SO_FLUSH:
4101 /* This option is not gettable */
4102 error = EINVAL;
4103 break;
4104
4105 case SO_RECV_ANYIF:
4106 optval = so_get_recv_anyif(so);
4107 goto integer;
4108
4109 case SO_TRAFFIC_MGT_BACKGROUND:
4110 /* This option is handled by lower layer(s) */
4111 if (so->so_proto != NULL &&
4112 so->so_proto->pr_ctloutput != NULL) {
4113 (void) so->so_proto->pr_ctloutput(so, sopt);
4114 }
4115 break;
4116
4117#if FLOW_DIVERT
4118 case SO_FLOW_DIVERT_TOKEN:
4119 error = flow_divert_token_get(so, sopt);
4120 break;
4121#endif /* FLOW_DIVERT */
4122
4123 default:
4124 error = ENOPROTOOPT;
4125 break;
4126 }
4127 }
4128out:
4129 if (dolock)
4130 socket_unlock(so, 1);
4131 return (error);
4132}
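
/*
 * Illustrative userland use of one of the Darwin-specific options handled
 * above (documentation only; not part of this file or the kernel build).
 * The descriptor "s" is assumed to be an open socket.
 *
 *	int nread = 0;
 *	socklen_t len = sizeof (nread);
 *	if (getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len) == 0)
 *		printf("%d bytes available to read\n", nread);
 */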
4133
4134/*
4135 * The size limit on our soopt_getm() differs from FreeBSD's: we limit
4136 * the size of options to MCLBYTES. This will have to change if we ever
4137 * need to define options that require more space than MCLBYTES.
4138 */
4139int
4140soopt_getm(struct sockopt *sopt, struct mbuf **mp)
4141{
4142 struct mbuf *m, *m_prev;
4143 int sopt_size = sopt->sopt_valsize;
4144 int how;
4145
4146 if (sopt_size <= 0 || sopt_size > MCLBYTES)
4147 return (EMSGSIZE);
4148
4149 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
4150 MGET(m, how, MT_DATA);
4151 if (m == NULL)
4152 return (ENOBUFS);
4153 if (sopt_size > MLEN) {
4154 MCLGET(m, how);
4155 if ((m->m_flags & M_EXT) == 0) {
4156 m_free(m);
4157 return (ENOBUFS);
4158 }
4159 m->m_len = min(MCLBYTES, sopt_size);
4160 } else {
4161 m->m_len = min(MLEN, sopt_size);
4162 }
4163 sopt_size -= m->m_len;
4164 *mp = m;
4165 m_prev = m;
4166
4167 while (sopt_size > 0) {
4168 MGET(m, how, MT_DATA);
4169 if (m == NULL) {
4170 m_freem(*mp);
4171 return (ENOBUFS);
4172 }
4173 if (sopt_size > MLEN) {
4174 MCLGET(m, how);
4175 if ((m->m_flags & M_EXT) == 0) {
4176 m_freem(*mp);
4177 m_freem(m);
4178 return (ENOBUFS);
4179 }
4180 m->m_len = min(MCLBYTES, sopt_size);
4181 } else {
4182 m->m_len = min(MLEN, sopt_size);
4183 }
4184 sopt_size -= m->m_len;
4185 m_prev->m_next = m;
4186 m_prev = m;
4187 }
4188 return (0);
4189}
4190
4191/* copyin sopt data into mbuf chain */
4192int
4193soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
4194{
4195 struct mbuf *m0 = m;
4196
4197 if (sopt->sopt_val == USER_ADDR_NULL)
4198 return (0);
4199 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
4200 if (sopt->sopt_p != kernproc) {
4201 int error;
4202
4203 error = copyin(sopt->sopt_val, mtod(m, char *),
4204 m->m_len);
4205 if (error != 0) {
4206 m_freem(m0);
4207 return (error);
4208 }
4209 } else {
4210 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
4211 mtod(m, char *), m->m_len);
4212 }
4213 sopt->sopt_valsize -= m->m_len;
4214 sopt->sopt_val += m->m_len;
4215 m = m->m_next;
4216 }
4217 /* enough space should have been allocated at ip6_sooptmcopyin() */
4218 if (m != NULL) {
4219 panic("soopt_mcopyin");
4220 /* NOTREACHED */
4221 }
4222 return (0);
4223}
4224
4225/* copyout mbuf chain data into soopt */
4226int
4227soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
4228{
4229 struct mbuf *m0 = m;
4230 size_t valsize = 0;
4231
4232 if (sopt->sopt_val == USER_ADDR_NULL)
4233 return (0);
4234 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
4235 if (sopt->sopt_p != kernproc) {
4236 int error;
4237
4238 error = copyout(mtod(m, char *), sopt->sopt_val,
4239 m->m_len);
4240 if (error != 0) {
4241 m_freem(m0);
4242 return (error);
4243 }
4244 } else {
4245 bcopy(mtod(m, char *),
4246 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
4247 }
4248 sopt->sopt_valsize -= m->m_len;
4249 sopt->sopt_val += m->m_len;
4250 valsize += m->m_len;
4251 m = m->m_next;
4252 }
4253 if (m != NULL) {
4254 /* user-land should have supplied a large enough soopt buffer */
4255 m_freem(m0);
4256 return (EINVAL);
4257 }
4258 sopt->sopt_valsize = valsize;
4259 return (0);
4260}
4261
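/*
 * Notify the socket owner that out-of-band data has arrived: deliver
 * SIGURG to the process or process group recorded in so_pgid and wake
 * up any threads selecting or polling on the receive buffer.
 */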
4262void
4263sohasoutofband(struct socket *so)
4264{
4265 if (so->so_pgid < 0)
4266 gsignal(-so->so_pgid, SIGURG);
4267 else if (so->so_pgid > 0)
4268 proc_signal(so->so_pgid, SIGURG);
4269 selwakeup(&so->so_rcv.sb_sel);
4270}
4271
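/*
 * poll(2)/select(2) back-end for sockets.  Returns the subset of the
 * requested events that are currently true; if none are, the calling
 * thread is registered via selrecord() on the send and/or receive
 * selinfo so it can be woken once the socket becomes ready.
 */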
4272int
4273sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
4274{
4275#pragma unused(cred)
4276 struct proc *p = current_proc();
4277 int revents = 0;
4278
4279 socket_lock(so, 1);
4280 so_update_last_owner_locked(so, PROC_NULL);
4281 so_update_policy(so);
4282
4283 if (events & (POLLIN | POLLRDNORM))
4284 if (soreadable(so))
4285 revents |= events & (POLLIN | POLLRDNORM);
4286
4287 if (events & (POLLOUT | POLLWRNORM))
4288 if (sowriteable(so))
4289 revents |= events & (POLLOUT | POLLWRNORM);
4290
4291 if (events & (POLLPRI | POLLRDBAND))
4292 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
4293 revents |= events & (POLLPRI | POLLRDBAND);
4294
4295 if (revents == 0) {
4296 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
4297 /*
4298 * Darwin sets the flag first,
4299 * BSD calls selrecord first
4300 */
4301 so->so_rcv.sb_flags |= SB_SEL;
4302 selrecord(p, &so->so_rcv.sb_sel, wql);
4303 }
4304
4305 if (events & (POLLOUT | POLLWRNORM)) {
4306 /*
4307 * Darwin sets the flag first,
4308 * BSD calls selrecord first
4309 */
4310 so->so_snd.sb_flags |= SB_SEL;
4311 selrecord(p, &so->so_snd.sb_sel, wql);
4312 }
4313 }
4314
4315 socket_unlock(so, 1);
4316 return (revents);
4317}
4318
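/*
 * kqueue attach routine for sockets.  Picks the read, write or
 * socket-event filter operations based on kn_filter, links the knote
 * onto the corresponding klist, and marks the socket buffer (or the
 * socket itself for EVFILT_SOCK) as having knotes attached.
 */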
4319int
4320soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
4321{
4322#pragma unused(fp)
4323#if !CONFIG_MACF_SOCKET
4324#pragma unused(ctx)
4325#endif /* !CONFIG_MACF_SOCKET */
4326 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4327 struct klist *skl;
4328
4329 socket_lock(so, 1);
4330 so_update_last_owner_locked(so, PROC_NULL);
4331 so_update_policy(so);
4332
4333#if CONFIG_MACF_SOCKET
4334 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
4335 kn, so) != 0) {
4336 socket_unlock(so, 1);
4337 return (1);
4338 }
4339#endif /* CONFIG_MACF_SOCKET */
4340
4341 switch (kn->kn_filter) {
4342 case EVFILT_READ:
4343 kn->kn_fop = &soread_filtops;
4344 skl = &so->so_rcv.sb_sel.si_note;
4345 break;
4346 case EVFILT_WRITE:
4347 kn->kn_fop = &sowrite_filtops;
4348 skl = &so->so_snd.sb_sel.si_note;
4349 break;
4350 case EVFILT_SOCK:
4351 kn->kn_fop = &sock_filtops;
4352 skl = &so->so_klist;
4353 break;
4354 default:
4355 socket_unlock(so, 1);
4356 return (1);
4357 }
4358
4359 if (KNOTE_ATTACH(skl, kn)) {
4360 switch (kn->kn_filter) {
4361 case EVFILT_READ:
4362 so->so_rcv.sb_flags |= SB_KNOTE;
4363 break;
4364 case EVFILT_WRITE:
4365 so->so_snd.sb_flags |= SB_KNOTE;
4366 break;
4367 case EVFILT_SOCK:
4368 so->so_flags |= SOF_KNOTE;
4369 break;
4370 default:
4371 socket_unlock(so, 1);
4372 return (1);
4373 }
4374 }
4375 socket_unlock(so, 1);
4376 return (0);
4377}
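
/*
 * Illustrative userland registration of a read filter, which reaches the
 * kernel through soo_kqfilter() above (documentation only; not part of
 * the kernel build).  "s" is assumed to be an open socket descriptor.
 *
 *	int kq = kqueue();
 *	struct kevent kev;
 *	EV_SET(&kev, s, EVFILT_READ, EV_ADD, 0, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */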
4378
4379static void
4380filt_sordetach(struct knote *kn)
4381{
4382 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4383
4384 socket_lock(so, 1);
4385 if (so->so_rcv.sb_flags & SB_KNOTE)
4386 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
4387 so->so_rcv.sb_flags &= ~SB_KNOTE;
4388 socket_unlock(so, 1);
4389}
4390
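/*
 * EVFILT_READ filter for sockets.  For listening sockets the knote fires
 * when the completed connection queue is non-empty; otherwise it reports
 * the number of bytes available to read, honoring any NOTE_LOWAT
 * low-water mark, out-of-band marks and EOF/error state.
 */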
4391/*ARGSUSED*/
4392static int
4393filt_soread(struct knote *kn, long hint)
4394{
4395 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4396
4397 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4398 socket_lock(so, 1);
4399
4400 if (so->so_options & SO_ACCEPTCONN) {
4401 int is_not_empty;
4402
4403 /*
4404 * Radar 6615193: handle the listen case dynamically for the
4405 * kqueue read filter. This allows listen() to be called after
4406 * the EVFILT_READ knote has been registered.
4407 */
4408
4409 kn->kn_data = so->so_qlen;
4410 is_not_empty = !TAILQ_EMPTY(&so->so_comp);
4411
4412 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4413 socket_unlock(so, 1);
4414
4415 return (is_not_empty);
4416 }
4417
4418 /* socket isn't a listener */
4419
4420 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
4421
4422 if (so->so_oobmark) {
4423 if (kn->kn_flags & EV_OOBAND) {
4424 kn->kn_data -= so->so_oobmark;
4425 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4426 socket_unlock(so, 1);
4427 return (1);
4428 }
4429 kn->kn_data = so->so_oobmark;
4430 kn->kn_flags |= EV_OOBAND;
4431 } else {
4432 if (so->so_state & SS_CANTRCVMORE) {
4433 kn->kn_flags |= EV_EOF;
4434 kn->kn_fflags = so->so_error;
4435 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4436 socket_unlock(so, 1);
4437 return (1);
4438 }
4439 }
4440
4441 if (so->so_state & SS_RCVATMARK) {
4442 if (kn->kn_flags & EV_OOBAND) {
4443 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4444 socket_unlock(so, 1);
4445 return (1);
4446 }
4447 kn->kn_flags |= EV_OOBAND;
4448 } else if (kn->kn_flags & EV_OOBAND) {
4449 kn->kn_data = 0;
4450 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4451 socket_unlock(so, 1);
4452 return (0);
4453 }
4454
4455 if (so->so_error) { /* temporary udp error */
4456 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4457 socket_unlock(so, 1);
4458 return (1);
4459 }
4460
4461 int64_t lowwat = so->so_rcv.sb_lowat;
4462 if (kn->kn_sfflags & NOTE_LOWAT) {
4463 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
4464 lowwat = so->so_rcv.sb_hiwat;
4465 else if (kn->kn_sdata > lowwat)
4466 lowwat = kn->kn_sdata;
4467 }
4468
4469 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4470 socket_unlock(so, 1);
4471
4472 return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
4473}
4474
4475static void
4476filt_sowdetach(struct knote *kn)
4477{
4478 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4479 socket_lock(so, 1);
4480
4481 if (so->so_snd.sb_flags & SB_KNOTE)
4482 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
4483 so->so_snd.sb_flags &= ~SB_KNOTE;
4484 socket_unlock(so, 1);
4485}
4486
4487int
4488so_wait_for_if_feedback(struct socket *so)
4489{
4490 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
4491 (so->so_state & SS_ISCONNECTED)) {
4492 struct inpcb *inp = sotoinpcb(so);
4493 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
4494 return (1);
4495 }
4496 return (0);
4497}
4498
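/*
 * EVFILT_WRITE filter for sockets.  Reports the space remaining in the
 * send buffer and fires once it reaches the (possibly NOTE_LOWAT-
 * adjusted) low-water mark, subject to connection state, pending errors,
 * SOF_NOTSENT_LOWAT and interface-feedback throttling.
 */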
4499/*ARGSUSED*/
4500static int
4501filt_sowrite(struct knote *kn, long hint)
4502{
4503 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4504 int ret = 0;
4505
4506 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4507 socket_lock(so, 1);
4508
4509 kn->kn_data = sbspace(&so->so_snd);
4510 if (so->so_state & SS_CANTSENDMORE) {
4511 kn->kn_flags |= EV_EOF;
4512 kn->kn_fflags = so->so_error;
4513 ret = 1;
4514 goto out;
4515 }
4516 if (so->so_error) { /* temporary udp error */
4517 ret = 1;
4518 goto out;
4519 }
4520 if (((so->so_state & SS_ISCONNECTED) == 0) &&
4521 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4522 ret = 0;
4523 goto out;
4524 }
4525 int64_t lowwat = so->so_snd.sb_lowat;
4526 if (kn->kn_sfflags & NOTE_LOWAT) {
4527 if (kn->kn_sdata > so->so_snd.sb_hiwat)
4528 lowwat = so->so_snd.sb_hiwat;
4529 else if (kn->kn_sdata > lowwat)
4530 lowwat = kn->kn_sdata;
4531 }
4532 if (kn->kn_data >= lowwat) {
4533 if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
4534 ret = tcp_notsent_lowat_check(so);
4535 } else {
4536 ret = 1;
4537 }
4538 }
4539 if (so_wait_for_if_feedback(so))
4540 ret = 0;
4541out:
4542 if ((hint & SO_FILT_HINT_LOCKED) == 0)
4543 socket_unlock(so, 1);
4544 return (ret);
4545}
4546
4547static void
4548filt_sockdetach(struct knote *kn)
4549{
4550 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4551 socket_lock(so, 1);
4552
4553 if ((so->so_flags & SOF_KNOTE) != 0)
4554 if (KNOTE_DETACH(&so->so_klist, kn))
4555 so->so_flags &= ~SOF_KNOTE;
4556 socket_unlock(so, 1);
4557}
4558
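/*
 * EVFILT_SOCK filter.  Translates SO_FILT_HINT_* hints (and current
 * socket state) into the NOTE_* fflags the caller subscribed to, and
 * reports the socket error or connection state through kn_data.
 */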
4559static int
4560filt_sockev(struct knote *kn, long hint)
4561{
4562 int ret = 0, locked = 0;
4563 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
4564 long ev_hint = (hint & SO_FILT_HINT_EV);
4565
4566 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
4567 socket_lock(so, 1);
4568 locked = 1;
4569 }
4570
4571 if (ev_hint & SO_FILT_HINT_CONNRESET) {
4572 if (kn->kn_sfflags & NOTE_CONNRESET)
4573 kn->kn_fflags |= NOTE_CONNRESET;
4574 }
4575 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
4576 if (kn->kn_sfflags & NOTE_TIMEOUT)
4577 kn->kn_fflags |= NOTE_TIMEOUT;
4578 }
4579 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
4580 if (kn->kn_sfflags & NOTE_NOSRCADDR)
4581 kn->kn_fflags |= NOTE_NOSRCADDR;
4582 }
4583 if (ev_hint & SO_FILT_HINT_IFDENIED) {
4584 if ((kn->kn_sfflags & NOTE_IFDENIED))
4585 kn->kn_fflags |= NOTE_IFDENIED;
4586 }
4587 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
4588 if (kn->kn_sfflags & NOTE_KEEPALIVE)
4589 kn->kn_fflags |= NOTE_KEEPALIVE;
4590 }
4591 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
4592 if (kn->kn_sfflags & NOTE_ADAPTIVE_WTIMO)
4593 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
4594 }
4595 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
4596 if (kn->kn_sfflags & NOTE_ADAPTIVE_RTIMO)
4597 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
4598 }
4599 if (ev_hint & SO_FILT_HINT_CONNECTED) {
4600 if (kn->kn_sfflags & NOTE_CONNECTED)
4601 kn->kn_fflags |= NOTE_CONNECTED;
4602 }
4603 if (ev_hint & SO_FILT_HINT_DISCONNECTED) {
4604 if (kn->kn_sfflags & NOTE_DISCONNECTED)
4605 kn->kn_fflags |= NOTE_DISCONNECTED;
4606 }
4607 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
4608 if (so->so_proto != NULL &&
4609 (so->so_proto->pr_flags & PR_EVCONNINFO) &&
4610 (kn->kn_sfflags & NOTE_CONNINFO_UPDATED))
4611 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
4612 }
4613
4614 if ((kn->kn_sfflags & NOTE_READCLOSED) &&
4615 (so->so_state & SS_CANTRCVMORE))
4616 kn->kn_fflags |= NOTE_READCLOSED;
4617
4618 if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
4619 (so->so_state & SS_CANTSENDMORE))
4620 kn->kn_fflags |= NOTE_WRITECLOSED;
4621
4622 if ((kn->kn_sfflags & NOTE_SUSPEND) &&
4623 ((ev_hint & SO_FILT_HINT_SUSPEND) ||
4624 (so->so_flags & SOF_SUSPENDED))) {
4625 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
4626 kn->kn_fflags |= NOTE_SUSPEND;
4627 }
4628
4629 if ((kn->kn_sfflags & NOTE_RESUME) &&
4630 ((ev_hint & SO_FILT_HINT_RESUME) ||
4631 (so->so_flags & SOF_SUSPENDED) == 0)) {
4632 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
4633 kn->kn_fflags |= NOTE_RESUME;
4634 }
4635
4636 if (so->so_error != 0) {
4637 ret = 1;
4638 kn->kn_data = so->so_error;
4639 kn->kn_flags |= EV_EOF;
4640 } else {
4641 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
4642 }
4643
4644 if (kn->kn_fflags != 0)
4645 ret = 1;
4646
4647 if (locked)
4648 socket_unlock(so, 1);
4649
4650 return (ret);
4651}
4652
4653void
4654get_sockev_state(struct socket *so, u_int32_t *statep)
4655{
4656 u_int32_t state = *(statep);
4657
4658 if (so->so_state & SS_ISCONNECTED)
4659 state |= SOCKEV_CONNECTED;
4660 else
4661 state &= ~(SOCKEV_CONNECTED);
4662 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
4663 *(statep) = state;
4664}
4665
4666#define SO_LOCK_HISTORY_STR_LEN \
4667 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
4668
4669__private_extern__ const char *
4670solockhistory_nr(struct socket *so)
4671{
4672 size_t n = 0;
4673 int i;
4674 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
4675
4676 bzero(lock_history_str, sizeof (lock_history_str));
4677 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
4678 n += snprintf(lock_history_str + n,
4679 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
4680 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
4681 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
4682 }
4683 return (lock_history_str);
4684}
4685
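/*
 * Lock a socket, taking a reference on it if requested.  Protocols that
 * supply a pr_lock callback get full control; otherwise the per-domain
 * mutex is taken and the caller's return address is recorded in the
 * socket's lock history for debugging.
 */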
4686int
4687socket_lock(struct socket *so, int refcount)
4688{
4689 int error = 0;
4690 void *lr_saved;
4691
4692 lr_saved = __builtin_return_address(0);
4693
4694 if (so->so_proto->pr_lock) {
4695 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
4696 } else {
4697#ifdef MORE_LOCKING_DEBUG
4698 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
4699 LCK_MTX_ASSERT_NOTOWNED);
4700#endif
4701 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
4702 if (refcount)
4703 so->so_usecount++;
4704 so->lock_lr[so->next_lock_lr] = lr_saved;
4705 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
4706 }
4707
4708 return (error);
4709}
4710
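/*
 * Unlock a socket, optionally dropping a reference.  When the last
 * reference is dropped the socket is released via sofreelastref().
 * Protocols that supply a pr_unlock callback handle this themselves.
 */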
4711int
4712socket_unlock(struct socket *so, int refcount)
4713{
4714 int error = 0;
4715 void *lr_saved;
4716 lck_mtx_t *mutex_held;
4717
4718 lr_saved = __builtin_return_address(0);
4719
4720 if (so->so_proto == NULL) {
4721 panic("%s: null so_proto so=%p\n", __func__, so);
4722 /* NOTREACHED */
4723 }
4724
4725 if (so->so_proto->pr_unlock) {
4726 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
4727 } else {
4728 mutex_held = so->so_proto->pr_domain->dom_mtx;
4729#ifdef MORE_LOCKING_DEBUG
4730 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4731#endif
4732 so->unlock_lr[so->next_unlock_lr] = lr_saved;
4733 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
4734
4735 if (refcount) {
4736 if (so->so_usecount <= 0) {
4737 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
4738 "lrh=%s", __func__, so->so_usecount, so,
4739 SOCK_DOM(so), so->so_type,
4740 SOCK_PROTO(so), solockhistory_nr(so));
4741 /* NOTREACHED */
4742 }
4743
4744 so->so_usecount--;
4745 if (so->so_usecount == 0)
4746 sofreelastref(so, 1);
4747 }
4748 lck_mtx_unlock(mutex_held);
4749 }
4750
4751 return (error);
4752}
4753
4754/* Called with socket locked, will unlock socket */
4755void
4756sofree(struct socket *so)
4757{
4758 lck_mtx_t *mutex_held;
4759
4760 if (so->so_proto->pr_getlock != NULL)
4761 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4762 else
4763 mutex_held = so->so_proto->pr_domain->dom_mtx;
4764 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4765
4766 sofreelastref(so, 0);
4767}
4768
4769void
4770soreference(struct socket *so)
4771{
4772 socket_lock(so, 1); /* lock and take one reference on the socket */
4773 socket_unlock(so, 0); /* unlock only */
4774}
4775
4776void
4777sodereference(struct socket *so)
4778{
4779 socket_lock(so, 0);
4780 socket_unlock(so, 1);
4781}
4782
4783/*
4784 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
4785 * possibility of using jumbo clusters. The caller must hold the
4786 * socket lock.
4787 */
4788void
4789somultipages(struct socket *so, boolean_t set)
4790{
4791 if (set)
4792 so->so_flags |= SOF_MULTIPAGES;
4793 else
4794 so->so_flags &= ~SOF_MULTIPAGES;
4795}
4796
4797int
4798so_isdstlocal(struct socket *so)
4799{
4800 struct inpcb *inp = (struct inpcb *)so->so_pcb;
4801
4802 if (SOCK_DOM(so) == PF_INET)
4803 return (inaddr_local(inp->inp_faddr));
4804 else if (SOCK_DOM(so) == PF_INET6)
4805 return (in6addr_local(&inp->in6p_faddr));
4806
4807 return (0);
4808}
4809
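/*
 * Mark a socket as defunct: set SOF_DEFUNCT, stop further data from
 * being queued (SB_DROP) and flush whatever is already buffered.
 * Sockets with SOF_NODEFUNCT are left alone when noforce is set;
 * otherwise they are defuncted by force.
 */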
4810int
4811sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
4812{
4813 struct sockbuf *rcv, *snd;
4814 int err = 0, defunct;
4815
4816 rcv = &so->so_rcv;
4817 snd = &so->so_snd;
4818
4819 defunct = (so->so_flags & SOF_DEFUNCT);
4820 if (defunct) {
4821 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
4822 panic("%s: SB_DROP not set", __func__);
4823 /* NOTREACHED */
4824 }
4825 goto done;
4826 }
4827
4828 if (so->so_flags & SOF_NODEFUNCT) {
4829 if (noforce) {
4830 err = EOPNOTSUPP;
4831 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
4832 "so 0x%llx [%d,%d] is not eligible for defunct "
4833 "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
4834 level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4835 SOCK_DOM(so), SOCK_TYPE(so), err));
4836 return (err);
4837 }
4838 so->so_flags &= ~SOF_NODEFUNCT;
4839 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
4840 "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
4841 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4842 SOCK_DOM(so), SOCK_TYPE(so)));
4843 }
4844
4845 so->so_flags |= SOF_DEFUNCT;
4846
4847 /* Prevent further data from being appended to the socket buffers */
4848 snd->sb_flags |= SB_DROP;
4849 rcv->sb_flags |= SB_DROP;
4850
4851 /* Flush any existing data in the socket buffers */
4852 if (rcv->sb_cc != 0) {
4853 rcv->sb_flags &= ~SB_SEL;
4854 selthreadclear(&rcv->sb_sel);
4855 sbrelease(rcv);
4856 }
4857 if (snd->sb_cc != 0) {
4858 snd->sb_flags &= ~SB_SEL;
4859 selthreadclear(&snd->sb_sel);
4860 sbrelease(snd);
4861 }
4862
4863done:
4864 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
4865 "defunct\n", __func__, proc_selfpid(), proc_pid(p), level,
4866 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
4867 defunct ? "is already" : "marked as"));
4868
4869 return (err);
4870}
4871
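/*
 * Complete the defunct process started by sosetdefunct(): wake up any
 * blocked threads, shut down both data directions, disconnect, flush
 * the socket buffers and set SS_DEFUNCT.  Must be called with
 * SOF_DEFUNCT already set.
 */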
4872int
4873sodefunct(struct proc *p, struct socket *so, int level)
4874{
4875 struct sockbuf *rcv, *snd;
4876
4877 if (!(so->so_flags & SOF_DEFUNCT)) {
4878 panic("%s improperly called", __func__);
4879 /* NOTREACHED */
4880 }
4881 if (so->so_state & SS_DEFUNCT)
4882 goto done;
4883
4884 rcv = &so->so_rcv;
4885 snd = &so->so_snd;
4886
4887 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
4888 char s[MAX_IPv6_STR_LEN];
4889 char d[MAX_IPv6_STR_LEN];
4890 struct inpcb *inp = sotoinpcb(so);
4891
4892 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
4893 "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
4894 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
4895 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4896 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
4897 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
4898 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
4899 s, sizeof (s)), ntohs(inp->in6p_lport),
4900 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
4901 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
4902 d, sizeof (d)), ntohs(inp->in6p_fport),
4903 (uint32_t)rcv->sb_sel.si_flags,
4904 (uint32_t)snd->sb_sel.si_flags,
4905 rcv->sb_flags, snd->sb_flags));
4906 } else {
4907 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
4908 "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
4909 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
4910 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
4911 SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
4912 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
4913 snd->sb_flags));
4914 }
4915
4916 /*
4917 * Unwedge threads blocked on sbwait() and sb_lock().
4918 */
4919 sbwakeup(rcv);
4920 sbwakeup(snd);
4921
4922 if (rcv->sb_flags & SB_LOCK)
4923 sbunlock(rcv, TRUE); /* keep socket locked */
4924 if (snd->sb_flags & SB_LOCK)
4925 sbunlock(snd, TRUE); /* keep socket locked */
4926
4927 /*
4928 * Flush the buffers and disconnect. We explicitly call shutdown
4929 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
4930 * states are set for the socket. This would also flush out data
4931 * hanging off the receive list of this socket.
4932 */
4933 (void) soshutdownlock(so, SHUT_RD);
4934 (void) soshutdownlock(so, SHUT_WR);
4935 (void) sodisconnectlocked(so);
4936
4937 /*
4938 * Explicitly handle connectionless-protocol disconnection
4939 * and release any remaining data in the socket buffers.
4940 */
4941 if (!(so->so_state & SS_ISDISCONNECTED))
4942 (void) soisdisconnected(so);
4943
4944 if (so->so_error == 0)
4945 so->so_error = EBADF;
4946
4947 if (rcv->sb_cc != 0) {
4948 rcv->sb_flags &= ~SB_SEL;
4949 selthreadclear(&rcv->sb_sel);
4950 sbrelease(rcv);
4951 }
4952 if (snd->sb_cc != 0) {
4953 snd->sb_flags &= ~SB_SEL;
4954 selthreadclear(&snd->sb_sel);
4955 sbrelease(snd);
4956 }
4957 so->so_state |= SS_DEFUNCT;
4958
4959done:
4960 return (0);
4961}
4962
4963__private_extern__ int
4964so_set_recv_anyif(struct socket *so, int optval)
4965{
4966 int ret = 0;
4967
4968#if INET6
4969 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
4970#else
4971 if (SOCK_DOM(so) == PF_INET) {
4972#endif /* !INET6 */
4973 if (optval)
4974 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
4975 else
4976 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
4977 }
4978
4979 return (ret);
4980}
4981
4982__private_extern__ int
4983so_get_recv_anyif(struct socket *so)
4984{
4985 int ret = 0;
4986
4987#if INET6
4988 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
4989#else
4990 if (SOCK_DOM(so) == PF_INET) {
4991#endif /* !INET6 */
4992 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
4993 }
4994
4995 return (ret);
4996}
4997
4998int
4999so_set_restrictions(struct socket *so, uint32_t vals)
5000{
5001 int nocell_old, nocell_new;
5002 int ret = 0;
5003
5004 /*
5005 * Deny-type restrictions are trapdoors; once set they cannot be
5006 * unset for the lifetime of the socket. This allows them to be
5007 * issued by a framework on behalf of the application without
5008 * having to worry that they can be undone.
5009 *
5010 * Note here that socket-level restrictions override any protocol
5011 * level restrictions. For instance, the SO_RESTRICT_DENY_CELLULAR
5012 * restriction issued on the socket has a higher precedence
5013 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
5014 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
5015 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
5016 */
5017 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
5018 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
5019 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
5020 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
5021
5022 /* other than deny cellular, there's nothing more to do */
5023 if ((nocell_new - nocell_old) == 0)
5024 return (ret);
5025
5026 /* we can only set, not clear restrictions */
5027 VERIFY((nocell_new - nocell_old) > 0);
5028
5029#if INET6
5030 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5031#else
5032 if (SOCK_DOM(so) == PF_INET) {
5033#endif /* !INET6 */
5034 /* if deny cellular is now set, do what's needed for INPCB */
5035 inp_set_nocellular(sotoinpcb(so));
5036 }
5037
5038 return (ret);
5039}
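
/*
 * Illustrative userland use of the restriction trapdoor described above,
 * assuming the private SO_RESTRICTIONS socket option is available to the
 * caller (documentation only; not part of the kernel build).
 *
 *	uint32_t r = SO_RESTRICT_DENY_CELLULAR;
 *	(void) setsockopt(s, SOL_SOCKET, SO_RESTRICTIONS, &r, sizeof (r));
 */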
5040
5041uint32_t
5042so_get_restrictions(struct socket *so)
5043{
5044 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
5045 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR));
5046}
5047
5048struct sockaddr_entry *
5049sockaddrentry_alloc(int how)
5050{
5051 struct sockaddr_entry *se;
5052
5053 se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
5054 if (se != NULL)
5055 bzero(se, se_zone_size);
5056
5057 return (se);
5058}
5059
5060void
5061sockaddrentry_free(struct sockaddr_entry *se)
5062{
5063 if (se->se_addr != NULL) {
5064 FREE(se->se_addr, M_SONAME);
5065 se->se_addr = NULL;
5066 }
5067 zfree(se_zone, se);
5068}
5069
5070struct sockaddr_entry *
5071sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
5072{
5073 struct sockaddr_entry *dst_se;
5074
5075 dst_se = sockaddrentry_alloc(how);
5076 if (dst_se != NULL) {
5077 int len = src_se->se_addr->sa_len;
5078
5079 MALLOC(dst_se->se_addr, struct sockaddr *,
5080 len, M_SONAME, how | M_ZERO);
5081 if (dst_se->se_addr != NULL) {
5082 bcopy(src_se->se_addr, dst_se->se_addr, len);
5083 } else {
5084 sockaddrentry_free(dst_se);
5085 dst_se = NULL;
5086 }
5087 }
5088
5089 return (dst_se);
5090}
5091
5092struct sockaddr_list *
5093sockaddrlist_alloc(int how)
5094{
5095 struct sockaddr_list *sl;
5096
5097 sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
5098 if (sl != NULL) {
5099 bzero(sl, sl_zone_size);
5100 TAILQ_INIT(&sl->sl_head);
5101 }
5102 return (sl);
5103}
5104
5105void
5106sockaddrlist_free(struct sockaddr_list *sl)
5107{
5108 struct sockaddr_entry *se, *tse;
5109
5110 TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
5111 sockaddrlist_remove(sl, se);
5112 sockaddrentry_free(se);
5113 }
5114 VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
5115 zfree(sl_zone, sl);
5116}
5117
5118void
5119sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
5120{
5121 VERIFY(!(se->se_flags & SEF_ATTACHED));
5122 se->se_flags |= SEF_ATTACHED;
5123 TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
5124 sl->sl_cnt++;
5125 VERIFY(sl->sl_cnt != 0);
5126}
5127
5128void
5129sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
5130{
5131 VERIFY(se->se_flags & SEF_ATTACHED);
5132 se->se_flags &= ~SEF_ATTACHED;
5133 VERIFY(sl->sl_cnt != 0);
5134 sl->sl_cnt--;
5135 TAILQ_REMOVE(&sl->sl_head, se, se_link);
5136}
5137
5138struct sockaddr_list *
5139sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
5140{
5141 struct sockaddr_entry *src_se, *tse;
5142 struct sockaddr_list *dst_sl;
5143
5144 dst_sl = sockaddrlist_alloc(how);
5145 if (dst_sl == NULL)
5146 return (NULL);
5147
5148 TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
5149 struct sockaddr_entry *dst_se;
5150
5151 if (src_se->se_addr == NULL)
5152 continue;
5153
5154 dst_se = sockaddrentry_dup(src_se, how);
5155 if (dst_se == NULL) {
5156 sockaddrlist_free(dst_sl);
5157 return (NULL);
5158 }
5159
5160 sockaddrlist_insert(dst_sl, dst_se);
5161 }
5162 VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
5163
5164 return (dst_sl);
5165}
5166
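/*
 * Associate an effective ("delegated") pid with the socket so that
 * network policy can attribute traffic to the process on whose behalf
 * the socket operates.  Callers may need the
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE privilege; delegating the socket
 * to the issuing process itself clears the association.
 */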
5167int
5168so_set_effective_pid(struct socket *so, int epid, struct proc *p)
5169{
5170 struct proc *ep = PROC_NULL;
5171 int error = 0;
5172
5173 /* pid 0 is reserved for kernel */
5174 if (epid == 0) {
5175 error = EINVAL;
5176 goto done;
5177 }
5178
5179 /*
5180 * If this is an in-kernel socket, prevent its delegate
5181 * association from changing unless the socket option is
5182 * coming from within the kernel itself.
5183 */
5184 if (so->last_pid == 0 && p != kernproc) {
5185 error = EACCES;
5186 goto done;
5187 }
5188
5189 /*
5190 * If this is issued by a process that's recorded as the
5191 * real owner of the socket, or if the pid is the same as
5192 * the process's own pid, then proceed. Otherwise ensure
5193 * that the issuing process has the necessary privileges.
5194 */
5195 if (epid != so->last_pid || epid != proc_pid(p)) {
5196 if ((error = priv_check_cred(kauth_cred_get(),
5197 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
5198 error = EACCES;
5199 goto done;
5200 }
5201 }
5202
5203 /* Find the process that corresponds to the effective pid */
5204 if ((ep = proc_find(epid)) == PROC_NULL) {
5205 error = ESRCH;
5206 goto done;
5207 }
5208
5209 /*
5210 * If a process tries to delegate the socket to itself, then
5211 * there's really nothing to do; treat it as a way for the
5212 * delegate association to be cleared. Note that we check
5213 * the passed-in proc rather than calling proc_selfpid(),
5214 * as we need to check the process issuing the socket option
5215 * which could be kernproc. Given that we don't allow 0 for
5216 * effective pid, it means that a delegated in-kernel socket
5217 * stays delegated during its lifetime (which is probably OK.)
5218 */
5219 if (epid == proc_pid(p)) {
5220 so->so_flags &= ~SOF_DELEGATED;
5221 so->e_upid = 0;
5222 so->e_pid = 0;
5223 uuid_clear(so->e_uuid);
5224 } else {
5225 so->so_flags |= SOF_DELEGATED;
5226 so->e_upid = proc_uniqueid(ep);
5227 so->e_pid = proc_pid(ep);
5228 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
5229 }
5230
5231done:
5232 if (error == 0 && net_io_policy_log) {
5233 uuid_string_t buf;
5234
5235 uuid_unparse(so->e_uuid, buf);
5236 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
5237 "euuid %s%s\n", __func__, proc_name_address(p),
5238 proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5239 SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf,
5240 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
5241 } else if (error != 0 && net_io_policy_log) {
5242 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
5243 "ERROR (%d)\n", __func__, proc_name_address(p),
5244 proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5245 SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" :
5246 proc_name_address(ep), error);
5247 }
5248
5249 if (ep != PROC_NULL)
5250 proc_rele(ep);
5251
5252 return (error);
5253}
5254
5255int
5256so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
5257{
5258 uuid_string_t buf;
5259 uuid_t uuid;
5260 int error = 0;
5261
5262 /* UUID must not be all-zeroes (reserved for kernel) */
5263 if (uuid_is_null(euuid)) {
5264 error = EINVAL;
5265 goto done;
5266 }
5267
5268 /*
5269 * If this is an in-kernel socket, prevent its delegate
5270 * association from changing unless the socket option is
5271 * coming from within the kernel itself.
5272 */
5273 if (so->last_pid == 0 && p != kernproc) {
5274 error = EACCES;
5275 goto done;
5276 }
5277
5278 /* Get the UUID of the issuing process */
5279 proc_getexecutableuuid(p, uuid, sizeof (uuid));
5280
5281 /*
5282 * If this is issued by a process that's recorded as the
5283 * real owner of the socket, or if the uuid is the same as
5284 * the process's own uuid, then proceed. Otherwise ensure
5285 * that the issuing process has the necessary privileges.
5286 */
5287 if (uuid_compare(euuid, so->last_uuid) != 0 ||
5288 uuid_compare(euuid, uuid) != 0) {
5289 if ((error = priv_check_cred(kauth_cred_get(),
5290 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
5291 error = EACCES;
5292 goto done;
5293 }
5294 }
5295
5296 /*
5297 * If a process tries to delegate the socket to itself, then
5298 * there's really nothing to do; treat it as a way for the
5299 * delegate association to be cleared. Note that we check
5300 * the uuid of the passed-in proc rather than that of the
5301 * current process, as we need to check the process issuing
5302 * the socket option which could be kernproc itself. Given
5303 * that we don't allow 0 for effective uuid, it means that
5304 * a delegated in-kernel socket stays delegated during its
5305 * lifetime (which is okay.)
5306 */
5307 if (uuid_compare(euuid, uuid) == 0) {
5308 so->so_flags &= ~SOF_DELEGATED;
5309 so->e_upid = 0;
5310 so->e_pid = 0;
5311 uuid_clear(so->e_uuid);
5312 } else {
5313 so->so_flags |= SOF_DELEGATED;
5314 /*
5315 * Unlike so_set_effective_pid(), we only have the UUID
5316 * here and the process ID is not known. Inherit the
5317 * real {pid,upid} of the socket.
5318 */
5319 so->e_upid = so->last_upid;
5320 so->e_pid = so->last_pid;
5321 uuid_copy(so->e_uuid, euuid);
5322 }
5323
5324done:
5325 if (error == 0 && net_io_policy_log) {
5326 uuid_unparse(so->e_uuid, buf);
5327 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
5328 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
5329 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5330 SOCK_TYPE(so), so->e_pid, buf,
5331 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
5332 } else if (error != 0 && net_io_policy_log) {
5333 uuid_unparse(euuid, buf);
5334 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
5335 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
5336 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
5337 SOCK_TYPE(so), buf, error);
5338 }
5339
5340 return (error);
5341}
5342
5343void
5344netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
5345 uint32_t ev_datalen)
5346{
5347 struct kev_msg ev_msg;
5348
5349 /*
5350 * A netpolicy event always starts with a netpolicy_event_data
5351 * structure, but the caller can provide a longer event
5352 * structure to post, depending on the event code.
5353 */
5354 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
5355
5356 bzero(&ev_msg, sizeof (ev_msg));
5357 ev_msg.vendor_code = KEV_VENDOR_APPLE;
5358 ev_msg.kev_class = KEV_NETWORK_CLASS;
5359 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
5360 ev_msg.event_code = ev_code;
5361
5362 ev_msg.dv[0].data_ptr = ev_data;
5363 ev_msg.dv[0].data_length = ev_datalen;
5364
5365 kev_post_msg(&ev_msg);
5366}