1 /*
2 * Copyright (c) 1998-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/ntstat.h>
102 #include <net/content_filter.h>
103 #include <netinet/in.h>
104 #include <netinet/in_pcb.h>
105 #include <netinet/ip6.h>
106 #include <netinet6/ip6_var.h>
107 #include <netinet/flow_divert.h>
108 #include <kern/zalloc.h>
109 #include <kern/locks.h>
110 #include <machine/limits.h>
111 #include <libkern/OSAtomic.h>
112 #include <pexpert/pexpert.h>
113 #include <kern/assert.h>
114 #include <kern/task.h>
115 #include <sys/kpi_mbuf.h>
116 #include <sys/mcache.h>
117 #include <sys/unpcb.h>
118
119 #if CONFIG_MACF
120 #include <security/mac.h>
121 #include <security/mac_framework.h>
122 #endif /* MAC */
123
124 #if MULTIPATH
125 #include <netinet/mp_pcb.h>
126 #include <netinet/mptcp_var.h>
127 #endif /* MULTIPATH */
128
129 /* TODO: this should be in a header file somewhere */
130 extern char *proc_name_address(void *p);
131
132 static u_int32_t so_cache_hw; /* High water mark for socache */
133 static u_int32_t so_cache_timeouts; /* number of timeouts */
134 static u_int32_t so_cache_max_freed; /* max freed per timeout */
135 static u_int32_t cached_sock_count = 0;
136 STAILQ_HEAD(, socket) so_cache_head;
137 int max_cached_sock_count = MAX_CACHED_SOCKETS;
138 static u_int32_t so_cache_time;
139 static int socketinit_done;
140 static struct zone *so_cache_zone;
141
142 static lck_grp_t *so_cache_mtx_grp;
143 static lck_attr_t *so_cache_mtx_attr;
144 static lck_grp_attr_t *so_cache_mtx_grp_attr;
145 static lck_mtx_t *so_cache_mtx;
146
147 #include <machine/limits.h>
148
149 static void filt_sordetach(struct knote *kn);
150 static int filt_soread(struct knote *kn, long hint);
151 static void filt_sowdetach(struct knote *kn);
152 static int filt_sowrite(struct knote *kn, long hint);
153 static void filt_sockdetach(struct knote *kn);
154 static int filt_sockev(struct knote *kn, long hint);
155
156 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
157 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
158
159 static struct filterops soread_filtops = {
160 .f_isfd = 1,
161 .f_detach = filt_sordetach,
162 .f_event = filt_soread,
163 };
164
165 static struct filterops sowrite_filtops = {
166 .f_isfd = 1,
167 .f_detach = filt_sowdetach,
168 .f_event = filt_sowrite,
169 };
170
171 static struct filterops sock_filtops = {
172 .f_isfd = 1,
173 .f_detach = filt_sockdetach,
174 .f_event = filt_sockev,
175 };
176
177 SYSCTL_DECL(_kern_ipc);
178
179 #define EVEN_MORE_LOCKING_DEBUG 0
180
181 int socket_debug = 0;
182 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
183 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
184
185 static int socket_zone = M_SOCKET;
186 so_gen_t so_gencnt; /* generation count for sockets */
187
188 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
189 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
190
191 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
192 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
193 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
194 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
195 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
196 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
197 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
198 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
199 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
200
201 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
202
203 int somaxconn = SOMAXCONN;
204 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
205 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
206
207 /* Should we get a maximum also ??? */
208 static int sosendmaxchain = 65536;
209 static int sosendminchain = 16384;
210 static int sorecvmincopy = 16384;
211 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
212 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
213 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
214 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
215
216 /*
217 * Set to enable jumbo clusters (if available) for large writes when
218 * the socket is marked with SOF_MULTIPAGES; see below.
219 */
220 int sosendjcl = 1;
221 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
222 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
223
224 /*
225 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
226 * writes on the socket for all protocols on any network interfaces,
227 * depending upon sosendjcl above. Be extra careful when setting this
228 * to 1, because sending packets that cross physical pages down to
229 * broken drivers (those that falsely assume that the physical pages
230 * are contiguous) might lead to system panics or silent data corruption.
231 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
232 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
233 * capable. Set this to 1 only for testing/debugging purposes.
234 */
235 int sosendjcl_ignore_capab = 0;
236 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
237 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
238
239 int sosendbigcl_ignore_capab = 0;
240 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
241 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
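
/*
 * Illustrative note (not part of the original file): the SYSCTL_INT entries
 * above surface under the kern.ipc namespace, so a userland tool could read
 * the jumbo-cluster tunables with sysctlbyname(3), e.g.:
 *
 *	int val;
 *	size_t len = sizeof (val);
 *
 *	if (sysctlbyname("kern.ipc.sosendjcl", &val, &len, NULL, 0) == 0)
 *		printf("sosendjcl=%d\n", val);
 */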
242
243 int sodefunctlog = 0;
244 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
245 &sodefunctlog, 0, "");
246
247 int sothrottlelog = 0;
248 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
249 &sothrottlelog, 0, "");
250
251 int sorestrictrecv = 1;
252 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
253 &sorestrictrecv, 0, "Enable inbound interface restrictions");
254
255 int sorestrictsend = 1;
256 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
257 &sorestrictsend, 0, "Enable outbound interface restrictions");
258
259 extern struct inpcbinfo tcbinfo;
260
261 /* TODO: these should be in header file */
262 extern int get_inpcb_str_size(void);
263 extern int get_tcp_str_size(void);
264
265 static unsigned int sl_zone_size; /* size of sockaddr_list */
266 static struct zone *sl_zone; /* zone for sockaddr_list */
267
268 static unsigned int se_zone_size; /* size of sockaddr_entry */
269 static struct zone *se_zone; /* zone for sockaddr_entry */
270
271 vm_size_t so_cache_zone_element_size;
272
273 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, user_ssize_t *);
274 static void cached_sock_alloc(struct socket **, int);
275 static void cached_sock_free(struct socket *);
276
277 /*
278 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
279 * setting the DSCP code on the packet based on the service class; see
280 * <rdar://problem/11277343> for details.
281 */
282 __private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
283 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
284 &sotcdb, 0, "");
285
286 void
287 socketinit(void)
288 {
289 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
290 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
291
292 if (socketinit_done) {
293 printf("socketinit: already called...\n");
294 return;
295 }
296 socketinit_done = 1;
297
298 PE_parse_boot_argn("socket_debug", &socket_debug,
299 sizeof (socket_debug));
300
301 /*
302 * allocate lock group attribute and group for socket cache mutex
303 */
304 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
305 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
306 so_cache_mtx_grp_attr);
307
308 /*
309 * allocate the lock attribute for socket cache mutex
310 */
311 so_cache_mtx_attr = lck_attr_alloc_init();
312
313 /* cached sockets mutex */
314 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
315 if (so_cache_mtx == NULL) {
316 panic("%s: unable to allocate so_cache_mtx\n", __func__);
317 /* NOTREACHED */
318 }
319 STAILQ_INIT(&so_cache_head);
320
321 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
322 + get_inpcb_str_size() + 4 + get_tcp_str_size());
323
324 so_cache_zone = zinit(so_cache_zone_element_size,
325 (120000 * so_cache_zone_element_size), 8192, "socache zone");
326 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
327 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
328
329 sl_zone_size = sizeof (struct sockaddr_list);
330 if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
331 "sockaddr_list")) == NULL) {
332 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
333 /* NOTREACHED */
334 }
335 zone_change(sl_zone, Z_CALLERACCT, FALSE);
336 zone_change(sl_zone, Z_EXPAND, TRUE);
337
338 se_zone_size = sizeof (struct sockaddr_entry);
339 if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
340 "sockaddr_entry")) == NULL) {
341 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
342 /* NOTREACHED */
343 }
344 zone_change(se_zone, Z_CALLERACCT, FALSE);
345 zone_change(se_zone, Z_EXPAND, TRUE);
346
347
348 in_pcbinit();
349 sflt_init();
350 socket_tclass_init();
351 #if MULTIPATH
352 mp_pcbinit();
353 #endif /* MULTIPATH */
354 }
355
356 static void
357 cached_sock_alloc(struct socket **so, int waitok)
358 {
359 caddr_t temp;
360 uintptr_t offset;
361
362 lck_mtx_lock(so_cache_mtx);
363
364 if (!STAILQ_EMPTY(&so_cache_head)) {
365 VERIFY(cached_sock_count > 0);
366
367 *so = STAILQ_FIRST(&so_cache_head);
368 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
369 STAILQ_NEXT((*so), so_cache_ent) = NULL;
370
371 cached_sock_count--;
372 lck_mtx_unlock(so_cache_mtx);
373
374 temp = (*so)->so_saved_pcb;
375 bzero((caddr_t)*so, sizeof (struct socket));
376
377 (*so)->so_saved_pcb = temp;
378 } else {
379
380 lck_mtx_unlock(so_cache_mtx);
381
382 if (waitok)
383 *so = (struct socket *)zalloc(so_cache_zone);
384 else
385 *so = (struct socket *)zalloc_noblock(so_cache_zone);
386
387 if (*so == NULL)
388 return;
389
390 bzero((caddr_t)*so, sizeof (struct socket));
391
392 /*
393 * Define offsets for extra structures into our
394 * single block of memory. Align extra structures
395 * on longword boundaries.
396 */
397
398 offset = (uintptr_t)*so;
399 offset += sizeof (struct socket);
400
401 offset = ALIGN(offset);
402
403 (*so)->so_saved_pcb = (caddr_t)offset;
404 offset += get_inpcb_str_size();
405
406 offset = ALIGN(offset);
407
408 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
409 (caddr_t)offset;
410 }
411
412 (*so)->cached_in_sock_layer = true;
413 }
414
415 static void
416 cached_sock_free(struct socket *so)
417 {
418
419 lck_mtx_lock(so_cache_mtx);
420
421 so_cache_time = net_uptime();
422 if (++cached_sock_count > max_cached_sock_count) {
423 --cached_sock_count;
424 lck_mtx_unlock(so_cache_mtx);
425 zfree(so_cache_zone, so);
426 } else {
427 if (so_cache_hw < cached_sock_count)
428 so_cache_hw = cached_sock_count;
429
430 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
431
432 so->cache_timestamp = so_cache_time;
433 lck_mtx_unlock(so_cache_mtx);
434 }
435 }
436
437 void
438 so_update_last_owner_locked(struct socket *so, proc_t self)
439 {
440 if (so->last_pid != 0) {
441 /*
442 * last_pid and last_upid should remain zero for sockets
443 * created using sock_socket. The check above achieves that.
444 */
445 if (self == PROC_NULL)
446 self = current_proc();
447
448 if (so->last_upid != proc_uniqueid(self) ||
449 so->last_pid != proc_pid(self)) {
450 so->last_upid = proc_uniqueid(self);
451 so->last_pid = proc_pid(self);
452 proc_getexecutableuuid(self, so->last_uuid,
453 sizeof (so->last_uuid));
454 }
455 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
456 }
457 }
458
459 void
460 so_update_policy(struct socket *so)
461 {
462 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
463 (void) inp_update_policy(sotoinpcb(so));
464 }
465
466 #if NECP
467 static void
468 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr, struct sockaddr *override_remote_addr)
469 {
470 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
471 inp_update_necp_policy(sotoinpcb(so), override_local_addr, override_remote_addr, 0);
472 }
473 #endif /* NECP */
474
475 boolean_t
476 so_cache_timer(void)
477 {
478 struct socket *p;
479 int n_freed = 0;
480 boolean_t rc = FALSE;
481
482 lck_mtx_lock(so_cache_mtx);
483 so_cache_timeouts++;
484 so_cache_time = net_uptime();
485
486 while (!STAILQ_EMPTY(&so_cache_head)) {
487 VERIFY(cached_sock_count > 0);
488 p = STAILQ_FIRST(&so_cache_head);
489 if ((so_cache_time - p->cache_timestamp) <
490 SO_CACHE_TIME_LIMIT)
491 break;
492
493 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
494 --cached_sock_count;
495
496 zfree(so_cache_zone, p);
497
498 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
499 so_cache_max_freed++;
500 break;
501 }
502 }
503
504 /* Schedule again if there is more to cleanup */
505 if (!STAILQ_EMPTY(&so_cache_head))
506 rc = TRUE;
507
508 lck_mtx_unlock(so_cache_mtx);
509 return (rc);
510 }
511
512 /*
513 * Get a socket structure from our zone, and initialize it.
514 * We don't implement `waitok' yet (see comments in uipc_domain.c).
515 * Note that it would probably be better to allocate socket
516 * and PCB at the same time, but I'm not convinced that all
517 * the protocols can be easily modified to do this.
518 */
519 struct socket *
520 soalloc(int waitok, int dom, int type)
521 {
522 struct socket *so;
523
524 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
525 cached_sock_alloc(&so, waitok);
526 } else {
527 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
528 M_WAITOK);
529 if (so != NULL)
530 bzero(so, sizeof (*so));
531 }
532 if (so != NULL) {
533 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
534 so->so_zone = socket_zone;
535 #if CONFIG_MACF_SOCKET
536 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
537 if (mac_socket_label_init(so, !waitok) != 0) {
538 sodealloc(so);
539 return (NULL);
540 }
541 #endif /* MAC_SOCKET */
542 }
543
544 return (so);
545 }
546
547 int
548 socreate_internal(int dom, struct socket **aso, int type, int proto,
549 struct proc *p, uint32_t flags, struct proc *ep)
550 {
551 struct protosw *prp;
552 struct socket *so;
553 int error = 0;
554
555 #if TCPDEBUG
556 extern int tcpconsdebug;
557 #endif
558
559 VERIFY(aso != NULL);
560 *aso = NULL;
561
562 if (proto != 0)
563 prp = pffindproto(dom, proto, type);
564 else
565 prp = pffindtype(dom, type);
566
567 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
568 if (pffinddomain(dom) == NULL)
569 return (EAFNOSUPPORT);
570 if (proto != 0) {
571 if (pffindprotonotype(dom, proto) != NULL)
572 return (EPROTOTYPE);
573 }
574 return (EPROTONOSUPPORT);
575 }
576 if (prp->pr_type != type)
577 return (EPROTOTYPE);
578 so = soalloc(1, dom, type);
579 if (so == NULL)
580 return (ENOBUFS);
581
582 if (flags & SOCF_ASYNC)
583 so->so_state |= SS_NBIO;
584 #if MULTIPATH
585 if (flags & SOCF_MP_SUBFLOW) {
586 /*
587 * A multipath subflow socket is used internally in the kernel,
588 * therefore it does not have a file descriptor associated by
589 * default.
590 */
591 so->so_state |= SS_NOFDREF;
592 so->so_flags |= SOF_MP_SUBFLOW;
593 }
594 #endif /* MULTIPATH */
595
596 TAILQ_INIT(&so->so_incomp);
597 TAILQ_INIT(&so->so_comp);
598 so->so_type = type;
599 so->last_upid = proc_uniqueid(p);
600 so->last_pid = proc_pid(p);
601 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
602 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
603
604 if (ep != PROC_NULL && ep != p) {
605 so->e_upid = proc_uniqueid(ep);
606 so->e_pid = proc_pid(ep);
607 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
608 so->so_flags |= SOF_DELEGATED;
609 }
610
611 so->so_cred = kauth_cred_proc_ref(p);
612 if (!suser(kauth_cred_get(), NULL))
613 so->so_state |= SS_PRIV;
614
615 so->so_proto = prp;
616 so->so_rcv.sb_flags |= SB_RECV;
617 so->so_rcv.sb_so = so->so_snd.sb_so = so;
618 so->next_lock_lr = 0;
619 so->next_unlock_lr = 0;
620
621 #if CONFIG_MACF_SOCKET
622 mac_socket_label_associate(kauth_cred_get(), so);
623 #endif /* MAC_SOCKET */
624
625 /*
626 * Attachment will create the per pcb lock if necessary and
627 * increase the refcount for creation; make sure it's done before
628 * the socket is inserted in lists.
629 */
630 so->so_usecount++;
631
632 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
633 if (error != 0) {
634 /*
635 * Warning:
636 * If so_pcb is not zero, the socket will be leaked,
637 * so the protocol attachment handler must be coded carefully
638 */
639 so->so_state |= SS_NOFDREF;
640 so->so_usecount--;
641 sofreelastref(so, 1); /* will deallocate the socket */
642 return (error);
643 }
644
645 atomic_add_32(&prp->pr_domain->dom_refs, 1);
646 TAILQ_INIT(&so->so_evlist);
647
648 /* Attach socket filters for this protocol */
649 sflt_initsock(so);
650 #if TCPDEBUG
651 if (tcpconsdebug == 2)
652 so->so_options |= SO_DEBUG;
653 #endif
654 so_set_default_traffic_class(so);
655
656 /*
657 * If this thread or task is marked to create backgrounded sockets,
658 * mark the socket as background.
659 */
660 if (proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
661 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
662 so->so_background_thread = current_thread();
663 }
664
665 switch (dom) {
666 /*
667 * Don't mark Unix domain, system or multipath sockets as
668 * eligible for defunct by default.
669 */
670 case PF_LOCAL:
671 case PF_SYSTEM:
672 case PF_MULTIPATH:
673 so->so_flags |= SOF_NODEFUNCT;
674 break;
675 default:
676 break;
677 }
678
679 /*
680 * Entitlements can't be checked at socket creation time except if the
681 * application requested a feature guarded by a privilege (cf. socket
682 * delegation).
683 * The priv(9) and the Sandboxing APIs are designed with the idea that
684 * a privilege check should only be triggered by a userland request.
685 * A privilege check at socket creation time is time consuming and
686 * could trigger many authorisation error messages from the security
687 * APIs.
688 */
689
690 *aso = so;
691
692 return (0);
693 }
694
695 /*
696 * Returns: 0 Success
697 * EAFNOSUPPORT
698 * EPROTOTYPE
699 * EPROTONOSUPPORT
700 * ENOBUFS
701 * <pru_attach>:ENOBUFS[AF_UNIX]
702 * <pru_attach>:ENOBUFS[TCP]
703 * <pru_attach>:ENOMEM[TCP]
704 * <pru_attach>:??? [other protocol families, IPSEC]
705 */
706 int
707 socreate(int dom, struct socket **aso, int type, int proto)
708 {
709 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
710 PROC_NULL));
711 }
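
/*
 * Hypothetical usage sketch (illustration only; not in the original file,
 * compiled out): a kernel subsystem that wants its own UDP socket would
 * typically pair socreate() with soclose(), along these lines.
 */
#if 0
static int
example_kernel_udp_socket(struct socket **sop)
{
	int error;

	/* PF_INET + SOCK_DGRAM + IPPROTO_UDP selects the UDP protocol switch */
	error = socreate(PF_INET, sop, SOCK_DGRAM, IPPROTO_UDP);
	if (error != 0)
		return (error);

	/* ... use the socket: sobindlock(), sosend(), soreceive() ... */

	(void) soclose(*sop);
	*sop = NULL;
	return (0);
}
#endif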
712
713 int
714 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
715 {
716 int error = 0;
717 struct proc *ep = PROC_NULL;
718
719 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
720 error = ESRCH;
721 goto done;
722 }
723
724 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
725
726 /*
727 * It might not be wise to hold the proc reference when calling
728 * socreate_internal since it calls soalloc with M_WAITOK
729 */
730 done:
731 if (ep != PROC_NULL)
732 proc_rele(ep);
733
734 return (error);
735 }
736
737 /*
738 * Returns: 0 Success
739 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
740 * <pru_bind>:EAFNOSUPPORT Address family not supported
741 * <pru_bind>:EADDRNOTAVAIL Address not available.
742 * <pru_bind>:EINVAL Invalid argument
743 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
744 * <pru_bind>:EACCES Permission denied
745 * <pru_bind>:EADDRINUSE Address in use
746 * <pru_bind>:EAGAIN Resource unavailable, try again
747 * <pru_bind>:EPERM Operation not permitted
748 * <pru_bind>:???
749 * <sf_bind>:???
750 *
751 * Notes: It's not possible to fully enumerate the return codes above,
752 * since socket filter authors and protocol family authors may
753 * not choose to limit their error returns to those listed, even
754 * though this may result in some software operating incorrectly.
755 *
756 * The error codes which are enumerated above are those known to
757 * be returned by the tcp_usr_bind function supplied.
758 */
759 int
760 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
761 {
762 struct proc *p = current_proc();
763 int error = 0;
764
765 if (dolock)
766 socket_lock(so, 1);
767 VERIFY(so->so_usecount > 1);
768
769 so_update_last_owner_locked(so, p);
770 so_update_policy(so);
771
772 #if NECP
773 so_update_necp_policy(so, nam, NULL);
774 #endif /* NECP */
775
776 /*
777 * If this is a bind request on a socket that has been marked
778 * as inactive, reject it now before we go any further.
779 */
780 if (so->so_flags & SOF_DEFUNCT) {
781 error = EINVAL;
782 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
783 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
784 SOCK_DOM(so), SOCK_TYPE(so), error));
785 goto out;
786 }
787
788 /* Socket filter */
789 error = sflt_bind(so, nam);
790
791 if (error == 0)
792 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
793 out:
794 if (dolock)
795 socket_unlock(so, 1);
796
797 if (error == EJUSTRETURN)
798 error = 0;
799
800 return (error);
801 }
802
803 void
804 sodealloc(struct socket *so)
805 {
806 kauth_cred_unref(&so->so_cred);
807
808 /* Remove any filters */
809 sflt_termsock(so);
810
811 #if CONTENT_FILTER
812 cfil_sock_detach(so);
813 #endif /* CONTENT_FILTER */
814
815 /* Delete the state allocated for msg queues on a socket */
816 if (so->so_flags & SOF_ENABLE_MSGS) {
817 FREE(so->so_msg_state, M_TEMP);
818 so->so_msg_state = NULL;
819 }
820 VERIFY(so->so_msg_state == NULL);
821
822 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
823
824 #if CONFIG_MACF_SOCKET
825 mac_socket_label_destroy(so);
826 #endif /* MAC_SOCKET */
827
828 if (so->cached_in_sock_layer) {
829 cached_sock_free(so);
830 } else {
831 FREE_ZONE(so, sizeof (*so), so->so_zone);
832 }
833 }
834
835 /*
836 * Returns: 0 Success
837 * EINVAL
838 * EOPNOTSUPP
839 * <pru_listen>:EINVAL[AF_UNIX]
840 * <pru_listen>:EINVAL[TCP]
841 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
842 * <pru_listen>:EINVAL[TCP] Invalid argument
843 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
844 * <pru_listen>:EACCES[TCP] Permission denied
845 * <pru_listen>:EADDRINUSE[TCP] Address in use
846 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
847 * <pru_listen>:EPERM[TCP] Operation not permitted
848 * <sf_listen>:???
849 *
850 * Notes: Other <pru_listen> returns depend on the protocol family; all
851 * <sf_listen> returns depend on what the filter author causes
852 * their filter to return.
853 */
854 int
855 solisten(struct socket *so, int backlog)
856 {
857 struct proc *p = current_proc();
858 int error = 0;
859
860 socket_lock(so, 1);
861
862 so_update_last_owner_locked(so, p);
863 so_update_policy(so);
864
865 #if NECP
866 so_update_necp_policy(so, NULL, NULL);
867 #endif /* NECP */
868
869 if (so->so_proto == NULL) {
870 error = EINVAL;
871 goto out;
872 }
873 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
874 error = EOPNOTSUPP;
875 goto out;
876 }
877
878 /*
879 * If the listen request is made on a socket that is not fully
880 * disconnected, or on a socket that has been marked as inactive,
881 * reject the request now.
882 */
883 if ((so->so_state &
884 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
885 (so->so_flags & SOF_DEFUNCT)) {
886 error = EINVAL;
887 if (so->so_flags & SOF_DEFUNCT) {
888 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
889 "(%d)\n", __func__, proc_pid(p),
890 (uint64_t)VM_KERNEL_ADDRPERM(so),
891 SOCK_DOM(so), SOCK_TYPE(so), error));
892 }
893 goto out;
894 }
895
896 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
897 error = EPERM;
898 goto out;
899 }
900
901 error = sflt_listen(so);
902 if (error == 0)
903 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
904
905 if (error) {
906 if (error == EJUSTRETURN)
907 error = 0;
908 goto out;
909 }
910
911 if (TAILQ_EMPTY(&so->so_comp))
912 so->so_options |= SO_ACCEPTCONN;
913 /*
914 * POSIX: The implementation may have an upper limit on the length of
915 * the listen queue, either global or per accepting socket. If backlog
916 * exceeds this limit, the length of the listen queue is set to the
917 * limit.
918 *
919 * If listen() is called with a backlog argument value that is less
920 * than 0, the function behaves as if it had been called with a backlog
921 * argument value of 0.
922 *
923 * A backlog argument of 0 may allow the socket to accept connections,
924 * in which case the length of the listen queue may be set to an
925 * implementation-defined minimum value.
926 */
927 if (backlog <= 0 || backlog > somaxconn)
928 backlog = somaxconn;
929
930 so->so_qlimit = backlog;
931 out:
932 socket_unlock(so, 1);
933 return (error);
934 }
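
/*
 * Hypothetical sketch (illustration only; not in the original file, compiled
 * out): given a TCP socket obtained from socreate(), a kernel caller would
 * bind and listen roughly as follows.
 */
#if 0
static int
example_kernel_listener(struct socket *so, u_short port)
{
	struct sockaddr_in sin;
	int error;

	bzero(&sin, sizeof (sin));
	sin.sin_len = sizeof (sin);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	sin.sin_addr.s_addr = htonl(INADDR_ANY);

	/* sobindlock() with dolock=1 takes and drops the socket lock */
	error = sobindlock(so, (struct sockaddr *)&sin, 1);
	if (error == 0)
		error = solisten(so, SOMAXCONN);
	return (error);
}
#endif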
935
936 void
937 sofreelastref(struct socket *so, int dealloc)
938 {
939 struct socket *head = so->so_head;
940
941 /* Assume socket is locked */
942
943 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
944 selthreadclear(&so->so_snd.sb_sel);
945 selthreadclear(&so->so_rcv.sb_sel);
946 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
947 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
948 so->so_event = sonullevent;
949 return;
950 }
951 if (head != NULL) {
952 socket_lock(head, 1);
953 if (so->so_state & SS_INCOMP) {
954 TAILQ_REMOVE(&head->so_incomp, so, so_list);
955 head->so_incqlen--;
956 } else if (so->so_state & SS_COMP) {
957 /*
958 * We must not decommission a socket that's
959 * on the accept(2) queue. If we do, then
960 * accept(2) may hang after select(2) indicated
961 * that the listening socket was ready.
962 */
963 selthreadclear(&so->so_snd.sb_sel);
964 selthreadclear(&so->so_rcv.sb_sel);
965 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
966 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
967 so->so_event = sonullevent;
968 socket_unlock(head, 1);
969 return;
970 } else {
971 panic("sofree: not queued");
972 }
973 head->so_qlen--;
974 so->so_state &= ~SS_INCOMP;
975 so->so_head = NULL;
976 socket_unlock(head, 1);
977 }
978 sowflush(so);
979 sorflush(so);
980
981 #if FLOW_DIVERT
982 if (so->so_flags & SOF_FLOW_DIVERT) {
983 flow_divert_detach(so);
984 }
985 #endif /* FLOW_DIVERT */
986
987 /* 3932268: disable upcall */
988 so->so_rcv.sb_flags &= ~SB_UPCALL;
989 so->so_snd.sb_flags &= ~SB_UPCALL;
990 so->so_event = sonullevent;
991
992 if (dealloc)
993 sodealloc(so);
994 }
995
996 void
997 soclose_wait_locked(struct socket *so)
998 {
999 lck_mtx_t *mutex_held;
1000
1001 if (so->so_proto->pr_getlock != NULL)
1002 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1003 else
1004 mutex_held = so->so_proto->pr_domain->dom_mtx;
1005 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1006
1007 /*
1008 * Double check here and return if there's no outstanding upcall;
1009 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1010 */
1011 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1012 return;
1013 so->so_rcv.sb_flags &= ~SB_UPCALL;
1014 so->so_snd.sb_flags &= ~SB_UPCALL;
1015 so->so_flags |= SOF_CLOSEWAIT;
1016 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1017 "soclose_wait_locked", NULL);
1018 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1019 so->so_flags &= ~SOF_CLOSEWAIT;
1020 }
1021
1022 /*
1023 * Close a socket on last file table reference removal.
1024 * Initiate disconnect if connected.
1025 * Free socket when disconnect complete.
1026 */
1027 int
1028 soclose_locked(struct socket *so)
1029 {
1030 int error = 0;
1031 lck_mtx_t *mutex_held;
1032 struct timespec ts;
1033
1034 if (so->so_usecount == 0) {
1035 panic("soclose: so=%p refcount=0\n", so);
1036 /* NOTREACHED */
1037 }
1038
1039 sflt_notify(so, sock_evt_closing, NULL);
1040
1041 if (so->so_upcallusecount)
1042 soclose_wait_locked(so);
1043
1044 #if CONTENT_FILTER
1045 /*
1046 * We have to wait until the content filters are done
1047 */
1048 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1049 cfil_sock_close_wait(so);
1050 cfil_sock_is_closed(so);
1051 cfil_sock_detach(so);
1052 }
1053 #endif /* CONTENT_FILTER */
1054
1055 if ((so->so_options & SO_ACCEPTCONN)) {
1056 struct socket *sp, *sonext;
1057 int socklock = 0;
1058
1059 /*
1060 * We do not want new connections to be added
1061 * to the connection queues
1062 */
1063 so->so_options &= ~SO_ACCEPTCONN;
1064
1065 for (sp = TAILQ_FIRST(&so->so_incomp);
1066 sp != NULL; sp = sonext) {
1067 sonext = TAILQ_NEXT(sp, so_list);
1068
1069 /*
1070 * Radar 5350314
1071 * skip sockets thrown away by tcp_dropdropablreq();
1072 * they will get cleaned up by garbage collection.
1073 * Otherwise, remove the incomp socket from the queue
1074 * and let soabort trigger the appropriate cleanup.
1075 */
1076 if (sp->so_flags & SOF_OVERFLOW)
1077 continue;
1078
1079 if (so->so_proto->pr_getlock != NULL) {
1080 /*
1081 * For lock ordering consistency with the
1082 * rest of the stack, we lock the socket
1083 * first and then grab the head.
1084 */
1085 socket_unlock(so, 0);
1086 socket_lock(sp, 1);
1087 socket_lock(so, 0);
1088 socklock = 1;
1089 }
1090
1091 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1092 so->so_incqlen--;
1093
1094 if (sp->so_state & SS_INCOMP) {
1095 sp->so_state &= ~SS_INCOMP;
1096 sp->so_head = NULL;
1097
1098 (void) soabort(sp);
1099 }
1100
1101 if (socklock)
1102 socket_unlock(sp, 1);
1103 }
1104
1105 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1106 /* Dequeue from so_comp since sofree() won't do it */
1107 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1108 so->so_qlen--;
1109
1110 if (so->so_proto->pr_getlock != NULL) {
1111 socket_unlock(so, 0);
1112 socket_lock(sp, 1);
1113 }
1114
1115 if (sp->so_state & SS_COMP) {
1116 sp->so_state &= ~SS_COMP;
1117 sp->so_head = NULL;
1118
1119 (void) soabort(sp);
1120 }
1121
1122 if (so->so_proto->pr_getlock != NULL) {
1123 socket_unlock(sp, 1);
1124 socket_lock(so, 0);
1125 }
1126 }
1127 }
1128 if (so->so_pcb == NULL) {
1129 /* 3915887: mark the socket as ready for dealloc */
1130 so->so_flags |= SOF_PCBCLEARING;
1131 goto discard;
1132 }
1133 if (so->so_state & SS_ISCONNECTED) {
1134 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1135 error = sodisconnectlocked(so);
1136 if (error)
1137 goto drop;
1138 }
1139 if (so->so_options & SO_LINGER) {
1140 if ((so->so_state & SS_ISDISCONNECTING) &&
1141 (so->so_state & SS_NBIO))
1142 goto drop;
1143 if (so->so_proto->pr_getlock != NULL)
1144 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1145 else
1146 mutex_held = so->so_proto->pr_domain->dom_mtx;
1147 while (so->so_state & SS_ISCONNECTED) {
1148 ts.tv_sec = (so->so_linger/100);
1149 ts.tv_nsec = (so->so_linger % 100) *
1150 NSEC_PER_USEC * 1000 * 10;
1151 error = msleep((caddr_t)&so->so_timeo,
1152 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1153 if (error) {
1154 /*
1155 * It's OK when the timer fires,
1156 * don't report an error
1157 */
1158 if (error == EWOULDBLOCK)
1159 error = 0;
1160 break;
1161 }
1162 }
1163 }
1164 }
1165 drop:
1166 if (so->so_usecount == 0) {
1167 panic("soclose: usecount is zero so=%p\n", so);
1168 /* NOTREACHED */
1169 }
1170 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1171 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1172 if (error == 0)
1173 error = error2;
1174 }
1175 if (so->so_usecount <= 0) {
1176 panic("soclose: usecount is zero so=%p\n", so);
1177 /* NOTREACHED */
1178 }
1179 discard:
1180 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1181 (so->so_state & SS_NOFDREF)) {
1182 panic("soclose: NOFDREF");
1183 /* NOTREACHED */
1184 }
1185 so->so_state |= SS_NOFDREF;
1186
1187 if (so->so_flags & SOF_MP_SUBFLOW)
1188 so->so_flags &= ~SOF_MP_SUBFLOW;
1189
1190 if ((so->so_flags & SOF_KNOTE) != 0)
1191 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1192
1193 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1194 evsofree(so);
1195
1196 so->so_usecount--;
1197 sofree(so);
1198 return (error);
1199 }
1200
1201 int
1202 soclose(struct socket *so)
1203 {
1204 int error = 0;
1205 socket_lock(so, 1);
1206
1207 if (so->so_retaincnt == 0) {
1208 error = soclose_locked(so);
1209 } else {
1210 /*
1211 * If the FD is going away but the socket is
1212 * retained in the kernel, remove its reference.
1213 */
1214 so->so_usecount--;
1215 if (so->so_usecount < 2)
1216 panic("soclose: retaincnt non null and so=%p "
1217 "usecount=%d\n", so, so->so_usecount);
1218 }
1219 socket_unlock(so, 1);
1220 return (error);
1221 }
1222
1223 /*
1224 * Must be called at splnet...
1225 */
1226 /* Should already be locked */
1227 int
1228 soabort(struct socket *so)
1229 {
1230 int error;
1231
1232 #ifdef MORE_LOCKING_DEBUG
1233 lck_mtx_t *mutex_held;
1234
1235 if (so->so_proto->pr_getlock != NULL)
1236 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1237 else
1238 mutex_held = so->so_proto->pr_domain->dom_mtx;
1239 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1240 #endif
1241
1242 if ((so->so_flags & SOF_ABORTED) == 0) {
1243 so->so_flags |= SOF_ABORTED;
1244 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1245 if (error) {
1246 sofree(so);
1247 return (error);
1248 }
1249 }
1250 return (0);
1251 }
1252
1253 int
1254 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1255 {
1256 int error;
1257
1258 if (dolock)
1259 socket_lock(so, 1);
1260
1261 so_update_last_owner_locked(so, PROC_NULL);
1262 so_update_policy(so);
1263 #if NECP
1264 so_update_necp_policy(so, NULL, NULL);
1265 #endif /* NECP */
1266
1267 if ((so->so_state & SS_NOFDREF) == 0)
1268 panic("soaccept: !NOFDREF");
1269 so->so_state &= ~SS_NOFDREF;
1270 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1271
1272 if (dolock)
1273 socket_unlock(so, 1);
1274 return (error);
1275 }
1276
1277 int
1278 soaccept(struct socket *so, struct sockaddr **nam)
1279 {
1280 return (soacceptlock(so, nam, 1));
1281 }
1282
1283 int
1284 soacceptfilter(struct socket *so)
1285 {
1286 struct sockaddr *local = NULL, *remote = NULL;
1287 int error = 0;
1288 struct socket *head = so->so_head;
1289
1290 /*
1291 * Hold the lock even if this socket has not been made visible
1292 * to the filter(s). For sockets with global locks, this protects
1293 * against the head or peer going away
1294 */
1295 socket_lock(so, 1);
1296 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1297 sogetaddr_locked(so, &local, 0) != 0) {
1298 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1299 so->so_head = NULL;
1300 socket_unlock(so, 1);
1301 soclose(so);
1302 /* Out of resources; try it again next time */
1303 error = ECONNABORTED;
1304 goto done;
1305 }
1306
1307 error = sflt_accept(head, so, local, remote);
1308
1309 /*
1310 * If we get EJUSTRETURN from one of the filters, mark this socket
1311 * as inactive and return it anyway. This newly accepted socket
1312 * will be disconnected later before we hand it off to the caller.
1313 */
1314 if (error == EJUSTRETURN) {
1315 error = 0;
1316 (void) sosetdefunct(current_proc(), so,
1317 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1318 }
1319
1320 if (error != 0) {
1321 /*
1322 * This may seem like a duplication to the above error
1323 * handling part when we return ECONNABORTED, except
1324 * the following is done while holding the lock since
1325 * the socket has been exposed to the filter(s) earlier.
1326 */
1327 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1328 so->so_head = NULL;
1329 socket_unlock(so, 1);
1330 soclose(so);
1331 /* Propagate socket filter's error code to the caller */
1332 } else {
1333 socket_unlock(so, 1);
1334 }
1335 done:
1336 /* Callee checks for NULL pointer */
1337 sock_freeaddr(remote);
1338 sock_freeaddr(local);
1339 return (error);
1340 }
1341
1342 /*
1343 * Returns: 0 Success
1344 * EOPNOTSUPP Operation not supported on socket
1345 * EISCONN Socket is connected
1346 * <pru_connect>:EADDRNOTAVAIL Address not available.
1347 * <pru_connect>:EINVAL Invalid argument
1348 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1349 * <pru_connect>:EACCES Permission denied
1350 * <pru_connect>:EADDRINUSE Address in use
1351 * <pru_connect>:EAGAIN Resource unavailable, try again
1352 * <pru_connect>:EPERM Operation not permitted
1353 * <sf_connect_out>:??? [anything a filter writer might set]
1354 */
1355 int
1356 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1357 {
1358 int error;
1359 struct proc *p = current_proc();
1360
1361 if (dolock)
1362 socket_lock(so, 1);
1363
1364 so_update_last_owner_locked(so, p);
1365 so_update_policy(so);
1366
1367 #if NECP
1368 so_update_necp_policy(so, NULL, nam);
1369 #endif /* NECP */
1370
1371 /*
1372 * If this is a listening socket or if this is a previously-accepted
1373 * socket that has been marked as inactive, reject the connect request.
1374 */
1375 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1376 error = EOPNOTSUPP;
1377 if (so->so_flags & SOF_DEFUNCT) {
1378 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1379 "(%d)\n", __func__, proc_pid(p),
1380 (uint64_t)VM_KERNEL_ADDRPERM(so),
1381 SOCK_DOM(so), SOCK_TYPE(so), error));
1382 }
1383 if (dolock)
1384 socket_unlock(so, 1);
1385 return (error);
1386 }
1387
1388 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1389 if (dolock)
1390 socket_unlock(so, 1);
1391 return (EPERM);
1392 }
1393
1394 /*
1395 * If protocol is connection-based, can only connect once.
1396 * Otherwise, if connected, try to disconnect first.
1397 * This allows user to disconnect by connecting to, e.g.,
1398 * a null address.
1399 */
1400 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1401 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1402 (error = sodisconnectlocked(so)))) {
1403 error = EISCONN;
1404 } else {
1405 /*
1406 * Run connect filter before calling protocol:
1407 * - non-blocking connect returns before completion;
1408 */
1409 error = sflt_connectout(so, nam);
1410 if (error != 0) {
1411 if (error == EJUSTRETURN)
1412 error = 0;
1413 } else {
1414 error = (*so->so_proto->pr_usrreqs->pru_connect)
1415 (so, nam, p);
1416 }
1417 }
1418 if (dolock)
1419 socket_unlock(so, 1);
1420 return (error);
1421 }
1422
1423 int
1424 soconnect(struct socket *so, struct sockaddr *nam)
1425 {
1426 return (soconnectlock(so, nam, 1));
1427 }
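
/*
 * Illustrative note (not part of the original file): a kernel caller would
 * typically reach the protocol's connect handler through this wrapper with
 * a filled-in sockaddr, e.g.:
 *
 *	struct sockaddr_in sin;
 *	... fill in sin with the peer address and port ...
 *	error = soconnect(so, (struct sockaddr *)&sin);
 *
 * soconnect() only initiates the connection; the connect(2) path waits
 * separately for SS_ISCONNECTING to clear.
 */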
1428
1429 /*
1430 * Returns: 0 Success
1431 * <pru_connect2>:EINVAL[AF_UNIX]
1432 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1433 * <pru_connect2>:??? [other protocol families]
1434 *
1435 * Notes: <pru_connect2> is not supported by [TCP].
1436 */
1437 int
1438 soconnect2(struct socket *so1, struct socket *so2)
1439 {
1440 int error;
1441
1442 socket_lock(so1, 1);
1443 if (so2->so_proto->pr_lock)
1444 socket_lock(so2, 1);
1445
1446 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1447
1448 socket_unlock(so1, 1);
1449 if (so2->so_proto->pr_lock)
1450 socket_unlock(so2, 1);
1451 return (error);
1452 }
1453
1454 int
1455 soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1456 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1457 associd_t aid, connid_t *pcid, uint32_t flags, void *arg,
1458 uint32_t arglen)
1459 {
1460 int error;
1461
1462 so_update_last_owner_locked(so, p);
1463 so_update_policy(so);
1464
1465 /*
1466 * If this is a listening socket or if this is a previously-accepted
1467 * socket that has been marked as inactive, reject the connect request.
1468 */
1469 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1470 error = EOPNOTSUPP;
1471 if (so->so_flags & SOF_DEFUNCT) {
1472 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1473 "(%d)\n", __func__, proc_pid(p),
1474 (uint64_t)VM_KERNEL_ADDRPERM(so),
1475 SOCK_DOM(so), SOCK_TYPE(so), error));
1476 }
1477 return (error);
1478 }
1479
1480 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1481 return (EPERM);
1482
1483 /*
1484 * If protocol is connection-based, can only connect once
1485 * unless PR_MULTICONN is set. Otherwise, if connected,
1486 * try to disconnect first. This allows user to disconnect
1487 * by connecting to, e.g., a null address.
1488 */
1489 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1490 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1491 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1492 (error = sodisconnectlocked(so)) != 0)) {
1493 error = EISCONN;
1494 } else {
1495 /*
1496 * Run connect filter before calling protocol:
1497 * - non-blocking connect returns before completion;
1498 */
1499 error = sflt_connectxout(so, dst_sl);
1500 if (error != 0) {
1501 if (error == EJUSTRETURN)
1502 error = 0;
1503 } else {
1504 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1505 (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1506 flags, arg, arglen);
1507 }
1508 }
1509
1510 return (error);
1511 }
1512
1513 int
1514 sodisconnectlocked(struct socket *so)
1515 {
1516 int error;
1517
1518 if ((so->so_state & SS_ISCONNECTED) == 0) {
1519 error = ENOTCONN;
1520 goto bad;
1521 }
1522 if (so->so_state & SS_ISDISCONNECTING) {
1523 error = EALREADY;
1524 goto bad;
1525 }
1526
1527 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1528 if (error == 0)
1529 sflt_notify(so, sock_evt_disconnected, NULL);
1530
1531 bad:
1532 return (error);
1533 }
1534
1535 /* Locking version */
1536 int
1537 sodisconnect(struct socket *so)
1538 {
1539 int error;
1540
1541 socket_lock(so, 1);
1542 error = sodisconnectlocked(so);
1543 socket_unlock(so, 1);
1544 return (error);
1545 }
1546
1547 int
1548 sodisconnectxlocked(struct socket *so, associd_t aid, connid_t cid)
1549 {
1550 int error;
1551
1552 /*
1553 * Call the protocol disconnectx handler; let it handle all
1554 * matters related to the connection state of this session.
1555 */
1556 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1557 if (error == 0) {
1558 /*
1559 * The event applies only for the session, not for
1560 * the disconnection of individual subflows.
1561 */
1562 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1563 sflt_notify(so, sock_evt_disconnected, NULL);
1564 }
1565 return (error);
1566 }
1567
1568 int
1569 sodisconnectx(struct socket *so, associd_t aid, connid_t cid)
1570 {
1571 int error;
1572
1573 socket_lock(so, 1);
1574 error = sodisconnectxlocked(so, aid, cid);
1575 socket_unlock(so, 1);
1576 return (error);
1577 }
1578
1579 int
1580 sopeelofflocked(struct socket *so, associd_t aid, struct socket **psop)
1581 {
1582 return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1583 }
1584
1585 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1586
1587 /*
1588 * sosendcheck will lock the socket buffer if it isn't locked and
1589 * verify that there is space for the data being inserted.
1590 *
1591 * Returns: 0 Success
1592 * EPIPE
1593 * sblock:EWOULDBLOCK
1594 * sblock:EINTR
1595 * sbwait:EBADF
1596 * sbwait:EINTR
1597 * [so_error]:???
1598 */
1599 int
1600 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1601 int32_t clen, int32_t atomic, int flags, int *sblocked,
1602 struct mbuf *control)
1603 {
1604 int error = 0;
1605 int32_t space;
1606 int assumelock = 0;
1607
1608 restart:
1609 if (*sblocked == 0) {
1610 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1611 so->so_send_filt_thread != 0 &&
1612 so->so_send_filt_thread == current_thread()) {
1613 /*
1614 * We're being called recursively from a filter,
1615 * allow this to continue. Radar 4150520.
1616 * Don't set sblocked because we don't want
1617 * to perform an unlock later.
1618 */
1619 assumelock = 1;
1620 } else {
1621 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1622 if (error) {
1623 if (so->so_flags & SOF_DEFUNCT)
1624 goto defunct;
1625 return (error);
1626 }
1627 *sblocked = 1;
1628 }
1629 }
1630
1631 /*
1632 * If a send attempt is made on a socket that has been marked
1633 * as inactive (disconnected), reject the request.
1634 */
1635 if (so->so_flags & SOF_DEFUNCT) {
1636 defunct:
1637 error = EPIPE;
1638 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1639 __func__, proc_selfpid(), (uint64_t)VM_KERNEL_ADDRPERM(so),
1640 SOCK_DOM(so), SOCK_TYPE(so), error));
1641 return (error);
1642 }
1643
1644 if (so->so_state & SS_CANTSENDMORE) {
1645 #if CONTENT_FILTER
1646 /*
1647 * Can re-inject data of half closed connections
1648 */
1649 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1650 so->so_snd.sb_cfil_thread == current_thread() &&
1651 cfil_sock_data_pending(&so->so_snd) != 0)
1652 CFIL_LOG(LOG_INFO,
1653 "so %llx ignore SS_CANTSENDMORE",
1654 (uint64_t)VM_KERNEL_ADDRPERM(so));
1655 else
1656 #endif /* CONTENT_FILTER */
1657 return (EPIPE);
1658 }
1659 if (so->so_error) {
1660 error = so->so_error;
1661 so->so_error = 0;
1662 return (error);
1663 }
1664
1665 if ((so->so_state & SS_ISCONNECTED) == 0) {
1666 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1667 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1668 (resid != 0 || clen == 0)) {
1669 #if MPTCP
1670 /*
1671 * MPTCP Fast Join sends data before the
1672 * socket is truly connected.
1673 */
1674 if ((so->so_flags & (SOF_MP_SUBFLOW |
1675 SOF_MPTCP_FASTJOIN)) !=
1676 (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1677 #endif /* MPTCP */
1678 return (ENOTCONN);
1679 }
1680 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1681 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1682 ENOTCONN : EDESTADDRREQ);
1683 }
1684 }
1685 if (so->so_flags & SOF_ENABLE_MSGS)
1686 space = msgq_sbspace(so, control);
1687 else
1688 space = sbspace(&so->so_snd);
1689
1690 if (flags & MSG_OOB)
1691 space += 1024;
1692 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1693 clen > so->so_snd.sb_hiwat)
1694 return (EMSGSIZE);
1695
1696 if ((space < resid + clen &&
1697 (atomic || space < (int32_t)so->so_snd.sb_lowat || space < clen)) ||
1698 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1699 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1700 assumelock) {
1701 return (EWOULDBLOCK);
1702 }
1703 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1704 *sblocked = 0;
1705 error = sbwait(&so->so_snd);
1706 if (error) {
1707 if (so->so_flags & SOF_DEFUNCT)
1708 goto defunct;
1709 return (error);
1710 }
1711 goto restart;
1712 }
1713 return (0);
1714 }
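
/*
 * Hypothetical sketch (illustration only; not in the original file, compiled
 * out): handing a pre-built mbuf chain to sosend(), whose contract is
 * documented in the block comment below: when "top" carries the data,
 * "uio" must be NULL, and both data and control mbufs are freed on return.
 */
#if 0
static int
example_send_mbuf_chain(struct socket *so, struct mbuf *top)
{
	/* top must be a packet header mbuf with m_pkthdr.len filled in */
	return (sosend(so, NULL, NULL, top, NULL, 0));
}
#endif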
1715
1716 /*
1717 * Send on a socket.
1718 * If send must go all at once and message is larger than
1719 * send buffering, then hard error.
1720 * Lock against other senders.
1721 * If must go all at once and not enough room now, then
1722 * inform user that this would block and do nothing.
1723 * Otherwise, if nonblocking, send as much as possible.
1724 * The data to be sent is described by "uio" if nonzero,
1725 * otherwise by the mbuf chain "top" (which must be null
1726 * if uio is not). Data provided in mbuf chain must be small
1727 * enough to send all at once.
1728 *
1729 * Returns nonzero on error, timeout or signal; callers
1730 * must check for short counts if EINTR/ERESTART are returned.
1731 * Data and control buffers are freed on return.
1732 * Experiment:
1733 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1734 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1735 * point at the mbuf chain being constructed and go from there.
1736 *
1737 * Returns: 0 Success
1738 * EOPNOTSUPP
1739 * EINVAL
1740 * ENOBUFS
1741 * uiomove:EFAULT
1742 * sosendcheck:EPIPE
1743 * sosendcheck:EWOULDBLOCK
1744 * sosendcheck:EINTR
1745 * sosendcheck:EBADF
1746 * sosendcheck:EINTR
1747 * sosendcheck:??? [value from so_error]
1748 * <pru_send>:ECONNRESET[TCP]
1749 * <pru_send>:EINVAL[TCP]
1750 * <pru_send>:ENOBUFS[TCP]
1751 * <pru_send>:EADDRINUSE[TCP]
1752 * <pru_send>:EADDRNOTAVAIL[TCP]
1753 * <pru_send>:EAFNOSUPPORT[TCP]
1754 * <pru_send>:EACCES[TCP]
1755 * <pru_send>:EAGAIN[TCP]
1756 * <pru_send>:EPERM[TCP]
1757 * <pru_send>:EMSGSIZE[TCP]
1758 * <pru_send>:EHOSTUNREACH[TCP]
1759 * <pru_send>:ENETUNREACH[TCP]
1760 * <pru_send>:ENETDOWN[TCP]
1761 * <pru_send>:ENOMEM[TCP]
1762 * <pru_send>:ENOBUFS[TCP]
1763 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1764 * <pru_send>:EINVAL[AF_UNIX]
1765 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1766 * <pru_send>:EPIPE[AF_UNIX]
1767 * <pru_send>:ENOTCONN[AF_UNIX]
1768 * <pru_send>:EISCONN[AF_UNIX]
1769 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1770 * <sf_data_out>:??? [whatever a filter author chooses]
1771 *
1772 * Notes: Other <pru_send> returns depend on the protocol family; all
1773 * <sf_data_out> returns depend on what the filter author causes
1774 * their filter to return.
1775 */
1776 int
1777 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1778 struct mbuf *top, struct mbuf *control, int flags)
1779 {
1780 struct mbuf **mp;
1781 struct mbuf *m, *freelist = NULL;
1782 user_ssize_t space, len, resid;
1783 int clen = 0, error, dontroute, mlen, sendflags;
1784 int atomic = sosendallatonce(so) || top;
1785 int sblocked = 0;
1786 struct proc *p = current_proc();
1787 struct mbuf *control_copy = NULL;
1788
1789 if (uio != NULL)
1790 resid = uio_resid(uio);
1791 else
1792 resid = top->m_pkthdr.len;
1793
1794 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1795 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1796
1797 socket_lock(so, 1);
1798
1799 /*
1800 * Re-injection should not affect process accounting
1801 */
1802 if ((flags & MSG_SKIPCFIL) == 0) {
1803 so_update_last_owner_locked(so, p);
1804 so_update_policy(so);
1805
1806 #if NECP
1807 so_update_necp_policy(so, NULL, addr);
1808 #endif /* NECP */
1809 }
1810
1811 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1812 error = EOPNOTSUPP;
1813 socket_unlock(so, 1);
1814 goto out;
1815 }
1816
1817 /*
1818 * In theory resid should be unsigned.
1819 * However, space must be signed, as it might be less than 0
1820 * if we over-committed, and we must use a signed comparison
1821 * of space and resid. On the other hand, a negative resid
1822 * causes us to loop sending 0-length segments to the protocol.
1823 *
1824 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1825 * But it will be used by sockets doing message delivery.
1826 *
1827 * Note: We limit resid to be a positive int value as we use
1828 * imin() to set bytes_to_copy -- radr://14558484
1829 */
1830 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
1831 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1832 error = EINVAL;
1833 socket_unlock(so, 1);
1834 goto out;
1835 }
1836
1837 dontroute = (flags & MSG_DONTROUTE) &&
1838 (so->so_options & SO_DONTROUTE) == 0 &&
1839 (so->so_proto->pr_flags & PR_ATOMIC);
1840 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1841
1842 if (control != NULL)
1843 clen = control->m_len;
1844
1845 do {
1846 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1847 &sblocked, control);
1848 if (error)
1849 goto release;
1850
1851 mp = &top;
1852 if (so->so_flags & SOF_ENABLE_MSGS)
1853 space = msgq_sbspace(so, control);
1854 else
1855 space = sbspace(&so->so_snd) - clen;
1856 space += ((flags & MSG_OOB) ? 1024 : 0);
1857
1858 do {
1859 if (uio == NULL) {
1860 /*
1861 * Data is prepackaged in "top".
1862 */
1863 resid = 0;
1864 if (flags & MSG_EOR)
1865 top->m_flags |= M_EOR;
1866 } else {
1867 int chainlength;
1868 int bytes_to_copy;
1869 boolean_t jumbocl;
1870 boolean_t bigcl;
1871
1872 bytes_to_copy = imin(resid, space);
1873
1874 if (sosendminchain > 0)
1875 chainlength = 0;
1876 else
1877 chainlength = sosendmaxchain;
1878
1879 /*
1880 * Use big 4 KB clusters only when the outgoing
1881 * interface does not want 2 KB clusters
1882 */
1883 bigcl =
1884 !(so->so_flags1 & SOF1_IF_2KCL) ||
1885 sosendbigcl_ignore_capab;
1886
1887 /*
1888 * Attempt to use larger than system page-size
1889 * clusters for large writes only if there is
1890 * a jumbo cluster pool and if the socket is
1891 * marked accordingly.
1892 */
1893 jumbocl = sosendjcl && njcl > 0 &&
1894 ((so->so_flags & SOF_MULTIPAGES) ||
1895 sosendjcl_ignore_capab) &&
1896 bigcl;
1897
1898 socket_unlock(so, 0);
1899
1900 do {
1901 int num_needed;
1902 int hdrs_needed = (top == NULL) ? 1 : 0;
1903
1904 /*
1905 * Try to maintain a local cache of mbuf
1906 * clusters needed to complete this
1907 * write.  The list is further limited to
1908 * the number that are currently needed
1909 * to fill the socket.  This mechanism
1910 * allows a large number of mbufs/
1911 * clusters to be grabbed under a single
1912 * mbuf lock.  If we can't get any
1913 * clusters, then fall back to trying
1914 * for mbufs.  If we fail early (or
1915 * miscalculate the number needed), make
1916 * sure to release any clusters we
1917 * haven't yet consumed.
1918 */
1919 if (freelist == NULL &&
1920 bytes_to_copy > MBIGCLBYTES &&
1921 jumbocl) {
1922 num_needed =
1923 bytes_to_copy / M16KCLBYTES;
1924
1925 if ((bytes_to_copy -
1926 (num_needed * M16KCLBYTES))
1927 >= MINCLSIZE)
1928 num_needed++;
1929
1930 freelist =
1931 m_getpackets_internal(
1932 (unsigned int *)&num_needed,
1933 hdrs_needed, M_WAIT, 0,
1934 M16KCLBYTES);
1935 /*
1936 * Fall back to 4K cluster size
1937 * if allocation failed
1938 */
1939 }
1940
1941 if (freelist == NULL &&
1942 bytes_to_copy > MCLBYTES &&
1943 bigcl) {
1944 num_needed =
1945 bytes_to_copy / MBIGCLBYTES;
1946
1947 if ((bytes_to_copy -
1948 (num_needed * MBIGCLBYTES)) >=
1949 MINCLSIZE)
1950 num_needed++;
1951
1952 freelist =
1953 m_getpackets_internal(
1954 (unsigned int *)&num_needed,
1955 hdrs_needed, M_WAIT, 0,
1956 MBIGCLBYTES);
1957 /*
1958 * Fall back to cluster size
1959 * if allocation failed
1960 */
1961 }
1962
1963 if (freelist == NULL &&
1964 bytes_to_copy > MINCLSIZE) {
1965 num_needed =
1966 bytes_to_copy / MCLBYTES;
1967
1968 if ((bytes_to_copy -
1969 (num_needed * MCLBYTES)) >=
1970 MINCLSIZE)
1971 num_needed++;
1972
1973 freelist =
1974 m_getpackets_internal(
1975 (unsigned int *)&num_needed,
1976 hdrs_needed, M_WAIT, 0,
1977 MCLBYTES);
1978 /*
1979 * Fall back to a single mbuf
1980 * if allocation failed
1981 */
1982 }
1983
1984 if (freelist == NULL) {
1985 if (top == NULL)
1986 MGETHDR(freelist,
1987 M_WAIT, MT_DATA);
1988 else
1989 MGET(freelist,
1990 M_WAIT, MT_DATA);
1991
1992 if (freelist == NULL) {
1993 error = ENOBUFS;
1994 socket_lock(so, 0);
1995 goto release;
1996 }
1997 /*
1998 * For datagram protocols,
1999 * leave room for protocol
2000 * headers in first mbuf.
2001 */
2002 if (atomic && top == NULL &&
2003 bytes_to_copy < MHLEN) {
2004 MH_ALIGN(freelist,
2005 bytes_to_copy);
2006 }
2007 }
2008 m = freelist;
2009 freelist = m->m_next;
2010 m->m_next = NULL;
2011
2012 if ((m->m_flags & M_EXT))
2013 mlen = m->m_ext.ext_size;
2014 else if ((m->m_flags & M_PKTHDR))
2015 mlen =
2016 MHLEN - m_leadingspace(m);
2017 else
2018 mlen = MLEN;
2019 len = imin(mlen, bytes_to_copy);
2020
2021 chainlength += len;
2022
2023 space -= len;
2024
2025 error = uiomove(mtod(m, caddr_t),
2026 len, uio);
2027
2028 resid = uio_resid(uio);
2029
2030 m->m_len = len;
2031 *mp = m;
2032 top->m_pkthdr.len += len;
2033 if (error)
2034 break;
2035 mp = &m->m_next;
2036 if (resid <= 0) {
2037 if (flags & MSG_EOR)
2038 top->m_flags |= M_EOR;
2039 break;
2040 }
2041 bytes_to_copy = min(resid, space);
2042
2043 } while (space > 0 &&
2044 (chainlength < sosendmaxchain || atomic ||
2045 resid < MINCLSIZE));
2046
2047 socket_lock(so, 0);
2048
2049 if (error)
2050 goto release;
2051 }
2052
2053 if (flags & (MSG_HOLD|MSG_SEND)) {
2054 /* Enqueue for later, go away if HOLD */
2055 struct mbuf *mb1;
2056 if (so->so_temp && (flags & MSG_FLUSH)) {
2057 m_freem(so->so_temp);
2058 so->so_temp = NULL;
2059 }
2060 if (so->so_temp)
2061 so->so_tail->m_next = top;
2062 else
2063 so->so_temp = top;
2064 mb1 = top;
2065 while (mb1->m_next)
2066 mb1 = mb1->m_next;
2067 so->so_tail = mb1;
2068 if (flags & MSG_HOLD) {
2069 top = NULL;
2070 goto release;
2071 }
2072 top = so->so_temp;
2073 }
2074 if (dontroute)
2075 so->so_options |= SO_DONTROUTE;
2076
2077 /* Compute flags here, for pru_send and NKEs */
2078 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2079 /*
2080 * If the user set MSG_EOF, the protocol
2081 * understands this flag, and there is nothing left
2082 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2083 */
2084 ((flags & MSG_EOF) &&
2085 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2086 (resid <= 0)) ? PRUS_EOF :
2087 /* If there is more to send set PRUS_MORETOCOME */
2088 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2089
2090 if ((flags & MSG_SKIPCFIL) == 0) {
2091 /*
2092 * Socket filter processing
2093 */
2094 error = sflt_data_out(so, addr, &top,
2095 &control, (sendflags & MSG_OOB) ?
2096 sock_data_filt_flag_oob : 0);
2097 if (error) {
2098 if (error == EJUSTRETURN) {
2099 error = 0;
2100 clen = 0;
2101 control = NULL;
2102 top = NULL;
2103 }
2104 goto release;
2105 }
2106 #if CONTENT_FILTER
2107 /*
2108 * Content filter processing
2109 */
2110 error = cfil_sock_data_out(so, addr, top,
2111 control, (sendflags & MSG_OOB) ?
2112 sock_data_filt_flag_oob : 0);
2113 if (error) {
2114 if (error == EJUSTRETURN) {
2115 error = 0;
2116 clen = 0;
2117 control = NULL;
2118 top = NULL;
2119 }
2120 goto release;
2121 }
2122 #endif /* CONTENT_FILTER */
2123 }
2124 if (so->so_flags & SOF_ENABLE_MSGS) {
2125 /*
2126 * Make a copy of control mbuf,
2127 * so that msg priority can be
2128 * passed to subsequent mbufs.
2129 */
2130 control_copy = m_dup(control, M_NOWAIT);
2131 }
2132 error = (*so->so_proto->pr_usrreqs->pru_send)
2133 (so, sendflags, top, addr, control, p);
2134
2135 if (flags & MSG_SEND)
2136 so->so_temp = NULL;
2137
2138 if (dontroute)
2139 so->so_options &= ~SO_DONTROUTE;
2140
2141 clen = 0;
2142 control = control_copy;
2143 control_copy = NULL;
2144 top = NULL;
2145 mp = &top;
2146 if (error)
2147 goto release;
2148 } while (resid && space > 0);
2149 } while (resid);
2150
2151 release:
2152 if (sblocked)
2153 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2154 else
2155 socket_unlock(so, 1);
2156 out:
2157 if (top != NULL)
2158 m_freem(top);
2159 if (control != NULL)
2160 m_freem(control);
2161 if (freelist != NULL)
2162 m_freem_list(freelist);
2163 if (control_copy != NULL)
2164 m_freem(control_copy);
2165
2166 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, so->so_snd.sb_cc,
2167 space, error);
2168
2169 return (error);
2170 }
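
/*
 * Editor's illustrative sketch -- not part of this file.  A minimal
 * user-space view of the flag checks sosend() performs above: MSG_OOB
 * on a non-stream socket fails with EOPNOTSUPP, and MSG_EOR on a plain
 * SOCK_STREAM socket (no message delivery) fails with EINVAL.  The
 * descriptors tcp_fd/udp_fd are assumed to be connected sockets.
 */
#if 0
#include <sys/socket.h>
#include <errno.h>
#include <stdio.h>
#include <string.h>

static void
sosend_flag_demo(int tcp_fd, int udp_fd)
{
	const char buf[] = "hello";

	/* MSG_EOR on SOCK_STREAM without message delivery -> EINVAL */
	if (send(tcp_fd, buf, sizeof (buf), MSG_EOR) == -1)
		printf("MSG_EOR on TCP: %s\n", strerror(errno));

	/* MSG_OOB on a non-stream socket -> EOPNOTSUPP */
	if (send(udp_fd, buf, sizeof (buf), MSG_OOB) == -1)
		printf("MSG_OOB on UDP: %s\n", strerror(errno));

	/* MSG_OOB on TCP sends a byte of urgent (out-of-band) data */
	(void) send(tcp_fd, "!", 1, MSG_OOB);
}
#endif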
2171
2172 int
2173 sosend_list(struct socket *so, struct sockaddr *addr, struct uio **uioarray,
2174 u_int uiocnt, struct mbuf *top, struct mbuf *control, int flags)
2175 {
2176 struct mbuf *m, *freelist = NULL;
2177 user_ssize_t len, resid;
2178 int clen = 0, error, dontroute, mlen;
2179 int atomic = sosendallatonce(so) || top;
2180 int sblocked = 0;
2181 struct proc *p = current_proc();
2182 u_int uiofirst = 0;
2183 u_int uiolast = 0;
2184
2185 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2186 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2187
2188 if (so->so_type != SOCK_DGRAM) {
2189 error = EINVAL;
2190 goto out;
2191 }
2192 if (atomic == 0) {
2193 error = EINVAL;
2194 goto out;
2195 }
2196 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2197 error = EPROTONOSUPPORT;
2198 goto out;
2199 }
2200 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2201 error = EINVAL;
2202 goto out;
2203 }
2204 if (uioarray != NULL)
2205 resid = uio_array_resid(uioarray, uiocnt);
2206 else
2207 resid = mbuf_pkt_list_len(top);
2208
2209 /*
2210 * In theory resid should be unsigned.
2211 * However, space must be signed, as it might be less than 0
2212 * if we over-committed, and we must use a signed comparison
2213 * of space and resid. On the other hand, a negative resid
2214 * causes us to loop sending 0-length segments to the protocol.
2215 *
2216 * Note: We limit resid to be a positive int value as we use
2217 * imin() to set bytes_to_copy -- radr://14558484
2218 */
2219 if (resid < 0 || resid > INT_MAX) {
2220 error = EINVAL;
2221 goto out;
2222 }
2223 /*
2224 * Disallow functionality not currently supported
2225 * Note: Will need to treat arrays of addresses and controls
2226 */
2227 if (addr != NULL) {
2228 printf("%s addr not supported\n", __func__);
2229 error = EOPNOTSUPP;
2230 goto out;
2231 }
2232 if (control != NULL) {
2233 printf("%s control not supported\n", __func__);
2234 error = EOPNOTSUPP;
2235 goto out;
2236 }
2237
2238 socket_lock(so, 1);
2239 so_update_last_owner_locked(so, p);
2240 so_update_policy(so);
2241
2242 #if NECP
2243 so_update_necp_policy(so, NULL, addr);
2244 #endif /* NECP */
2245
2246 dontroute = (flags & MSG_DONTROUTE) &&
2247 (so->so_options & SO_DONTROUTE) == 0 &&
2248 (so->so_proto->pr_flags & PR_ATOMIC);
2249 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2250
2251 if (control != NULL)
2252 clen = control->m_len;
2253
2254 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2255 &sblocked, control);
2256 if (error)
2257 goto release;
2258
2259 do {
2260 int i;
2261
2262 if (uioarray == NULL) {
2263 /*
2264 * Data is prepackaged in "top".
2265 */
2266 resid = 0;
2267 } else {
2268 int num_needed = 0;
2269 int chainlength;
2270 size_t maxpktlen = 0;
2271
2272 if (sosendminchain > 0)
2273 chainlength = 0;
2274 else
2275 chainlength = sosendmaxchain;
2276
2277 socket_unlock(so, 0);
2278
2279 /*
2280 * Find a set of uio that fit in a reasonable number
2281 * of mbuf packets
2282 */
2283 for (i = uiofirst; i < uiocnt; i++) {
2284 struct uio *auio = uioarray[i];
2285
2286 len = uio_resid(auio);
2287
2288 /* Do nothing for empty messages */
2289 if (len == 0)
2290 continue;
2291
2292 num_needed += 1;
2293 uiolast += 1;
2294
2295 if (len > maxpktlen)
2296 maxpktlen = len;
2297
2298 chainlength += len;
2299 if (chainlength > sosendmaxchain)
2300 break;
2301 }
2302 /*
2303 * Nothing left to send
2304 */
2305 if (num_needed == 0) {
2306 socket_lock(so, 0);
2307 break;
2308 }
2309 /*
2310 * Allocate the mbuf packets at once
2311 */
2312 freelist = m_allocpacket_internal(
2313 (unsigned int *)&num_needed,
2314 maxpktlen, NULL, M_WAIT, 1, 0);
2315
2316 if (freelist == NULL) {
2317 socket_lock(so, 0);
2318 error = ENOMEM;
2319 goto release;
2320 }
2321 /*
2322 * Copy each uio of the set into its own mbuf packet
2323 */
2324 for (i = uiofirst, m = freelist;
2325 i < uiolast && m != NULL;
2326 i++) {
2327 int bytes_to_copy;
2328 struct mbuf *n;
2329 struct uio *auio = uioarray[i];
2330
2331 bytes_to_copy = uio_resid(auio);
2332
2333 /* Do nothing for empty messages */
2334 if (bytes_to_copy == 0)
2335 continue;
2336
2337 for (n = m; n != NULL; n = n->m_next) {
2338 mlen = mbuf_maxlen(n);
2339
2340 len = imin(mlen, bytes_to_copy);
2341
2342 /*
2343 * Note: uiomove() decrements the iovec
2344 * length
2345 */
2346 error = uiomove(mtod(n, caddr_t),
2347 len, auio);
2348 if (error != 0)
2349 break;
2350 n->m_len = len;
2351 m->m_pkthdr.len += len;
2352
2353 VERIFY(m->m_pkthdr.len <= maxpktlen);
2354
2355 bytes_to_copy -= len;
2356 resid -= len;
2357 }
2358 if (m->m_pkthdr.len == 0) {
2359 printf("%s so %llx pkt %llx len null\n",
2360 __func__,
2361 (uint64_t)VM_KERNEL_ADDRPERM(so),
2362 (uint64_t)VM_KERNEL_ADDRPERM(m));
2363 }
2364 if (error != 0)
2365 break;
2366 m = m->m_nextpkt;
2367 }
2368
2369 socket_lock(so, 0);
2370
2371 if (error)
2372 goto release;
2373 top = freelist;
2374 freelist = NULL;
2375 }
2376
2377 if (dontroute)
2378 so->so_options |= SO_DONTROUTE;
2379
2380 if ((flags & MSG_SKIPCFIL) == 0) {
2381 struct mbuf **prevnextp = NULL;
2382
2383 for (i = uiofirst, m = top;
2384 i < uiolast && m != NULL;
2385 i++) {
2386 struct mbuf *nextpkt = m->m_nextpkt;
2387
2388 /*
2389 * Socket filter processing
2390 */
2391 error = sflt_data_out(so, addr, &m,
2392 &control, 0);
2393 if (error != 0 && error != EJUSTRETURN)
2394 goto release;
2395
2396 #if CONTENT_FILTER
2397 if (error == 0) {
2398 /*
2399 * Content filter processing
2400 */
2401 error = cfil_sock_data_out(so, addr, m,
2402 control, 0);
2403 if (error != 0 && error != EJUSTRETURN)
2404 goto release;
2405 }
2406 #endif /* CONTENT_FILTER */
2407 /*
2408 * Remove packet from the list when
2409 * swallowed by a filter
2410 */
2411 if (error == EJUSTRETURN) {
2412 error = 0;
2413 if (prevnextp != NULL)
2414 *prevnextp = nextpkt;
2415 else
2416 top = nextpkt;
2417 }
2418
2419 m = nextpkt;
2420 if (m != NULL)
2421 prevnextp = &m->m_nextpkt;
2422 }
2423 }
2424 if (top != NULL)
2425 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2426 (so, 0, top, addr, control, p);
2427
2428 if (dontroute)
2429 so->so_options &= ~SO_DONTROUTE;
2430
2431 clen = 0;
2432 top = NULL;
2433 uiofirst = uiolast;
2434 } while (resid > 0 && error == 0);
2435 release:
2436 if (sblocked)
2437 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2438 else
2439 socket_unlock(so, 1);
2440 out:
2441 if (top != NULL)
2442 m_freem(top);
2443 if (control != NULL)
2444 m_freem(control);
2445 if (freelist != NULL)
2446 m_freem_list(freelist);
2447
2448 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2449 so->so_snd.sb_cc, 0, error);
2450
2451 return (error);
2452 }
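
/*
 * Editor's illustrative sketch -- not part of this file.  The batching
 * step of sosend_list() above, reduced to plain integers: given the
 * residual lengths of the pending uios, pick how many fit into one
 * allocation pass bounded by "maxchain" (sosendmaxchain in the real
 * code) and report the largest payload so every packet can be sized to
 * hold it.  All names here are hypothetical.
 */
#if 0
#include <stddef.h>

struct batch_pick {
	size_t	count;		/* number of non-empty uios selected */
	size_t	maxpktlen;	/* largest single payload in the batch */
};

static struct batch_pick
pick_datagram_batch(const size_t *lens, size_t nlens, size_t maxchain)
{
	struct batch_pick pick = { 0, 0 };
	size_t chainlength = 0;
	size_t i;

	for (i = 0; i < nlens; i++) {
		/* Empty messages consume no buffer space */
		if (lens[i] == 0)
			continue;

		pick.count++;
		if (lens[i] > pick.maxpktlen)
			pick.maxpktlen = lens[i];

		chainlength += lens[i];
		if (chainlength > maxchain)
			break;	/* enough for one allocation pass */
	}
	return (pick);
}
#endif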
2453
2454 /*
2455 * Implement receive operations on a socket.
2456 * We depend on the way that records are added to the sockbuf
2457 * by sbappend*. In particular, each record (mbufs linked through m_next)
2458 * must begin with an address if the protocol so specifies,
2459 * followed by an optional mbuf or mbufs containing ancillary data,
2460 * and then zero or more mbufs of data.
2461 * In order to avoid blocking network interrupts for the entire time here,
2462 * we splx() while doing the actual copy to user space.
2463 * Although the sockbuf is locked, new data may still be appended,
2464 * and thus we must maintain consistency of the sockbuf during that time.
2465 *
2466 * The caller may receive the data as a single mbuf chain by supplying
2467 * an mbuf **mp0 for use in returning the chain. The uio is then used
2468 * only for the count in uio_resid.
2469 *
2470 * Returns: 0 Success
2471 * ENOBUFS
2472 * ENOTCONN
2473 * EWOULDBLOCK
2474 * uiomove:EFAULT
2475 * sblock:EWOULDBLOCK
2476 * sblock:EINTR
2477 * sbwait:EBADF
2478 * sbwait:EINTR
2479 * sodelayed_copy:EFAULT
2480 * <pru_rcvoob>:EINVAL[TCP]
2481 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2482 * <pru_rcvoob>:???
2483 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2484 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2485 * <pr_domain->dom_externalize>:???
2486 *
2487 * Notes: Additional return values from calls through <pru_rcvoob> and
2488 * <pr_domain->dom_externalize> depend on protocols other than
2489 * TCP or AF_UNIX, which are documented above.
2490 */
2491 int
2492 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2493 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2494 {
2495 struct mbuf *m, **mp, *ml = NULL;
2496 struct mbuf *nextrecord, *free_list;
2497 int flags, error, offset;
2498 user_ssize_t len;
2499 struct protosw *pr = so->so_proto;
2500 int moff, type = 0;
2501 user_ssize_t orig_resid = uio_resid(uio);
2502 user_ssize_t delayed_copy_len;
2503 int can_delay;
2504 int need_event;
2505 struct proc *p = current_proc();
2506
2507 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, uio_resid(uio),
2508 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
2509
2510 /*
2511 * Sanity check on the length passed by caller as we are making 'int'
2512 * comparisons
2513 */
2514 if (orig_resid < 0 || orig_resid > INT_MAX)
2515 return (EINVAL);
2516
2517 socket_lock(so, 1);
2518 so_update_last_owner_locked(so, p);
2519 so_update_policy(so);
2520
2521 #ifdef MORE_LOCKING_DEBUG
2522 if (so->so_usecount == 1) {
2523 panic("%s: so=%x no other reference on socket\n", __func__, so);
2524 /* NOTREACHED */
2525 }
2526 #endif
2527 mp = mp0;
2528 if (psa != NULL)
2529 *psa = NULL;
2530 if (controlp != NULL)
2531 *controlp = NULL;
2532 if (flagsp != NULL)
2533 flags = *flagsp &~ MSG_EOR;
2534 else
2535 flags = 0;
2536
2537 /*
2538 * If a recv attempt is made on a previously-accepted socket
2539 * that has been marked as inactive (disconnected), reject
2540 * the request.
2541 */
2542 if (so->so_flags & SOF_DEFUNCT) {
2543 struct sockbuf *sb = &so->so_rcv;
2544
2545 error = ENOTCONN;
2546 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2547 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
2548 SOCK_DOM(so), SOCK_TYPE(so), error));
2549 /*
2550 * This socket should have been disconnected and flushed
2551 * prior to being returned from sodefunct(); there should
2552 * be no data on its receive list, so panic otherwise.
2553 */
2554 if (so->so_state & SS_DEFUNCT)
2555 sb_empty_assert(sb, __func__);
2556 socket_unlock(so, 1);
2557 return (error);
2558 }
2559
2560 /*
2561 * When SO_WANTOOBFLAG is set we try to get out-of-band data
2562 * regardless of the flags argument. Here is the case where
2563 * out-of-band data is not inline.
2564 */
2565 if ((flags & MSG_OOB) ||
2566 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
2567 (so->so_options & SO_OOBINLINE) == 0 &&
2568 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
2569 m = m_get(M_WAIT, MT_DATA);
2570 if (m == NULL) {
2571 socket_unlock(so, 1);
2572 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
2573 ENOBUFS, 0, 0, 0, 0);
2574 return (ENOBUFS);
2575 }
2576 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
2577 if (error)
2578 goto bad;
2579 socket_unlock(so, 0);
2580 do {
2581 error = uiomove(mtod(m, caddr_t),
2582 imin(uio_resid(uio), m->m_len), uio);
2583 m = m_free(m);
2584 } while (uio_resid(uio) && error == 0 && m != NULL);
2585 socket_lock(so, 0);
2586 bad:
2587 if (m != NULL)
2588 m_freem(m);
2589
2590 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
2591 if (error == EWOULDBLOCK || error == EINVAL) {
2592 /*
2593 * Let's try to get normal data:
2594 * EWOULDBLOCK: out-of-band data not
2595 * received yet. EINVAL: out-of-band data
2596 * already read.
2597 */
2598 error = 0;
2599 goto nooob;
2600 } else if (error == 0 && flagsp != NULL) {
2601 *flagsp |= MSG_OOB;
2602 }
2603 }
2604 socket_unlock(so, 1);
2605 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2606 0, 0, 0, 0);
2607
2608 return (error);
2609 }
2610 nooob:
2611 if (mp != NULL)
2612 *mp = NULL;
2613
2614 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
2615 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
2616 }
2617
2618 free_list = NULL;
2619 delayed_copy_len = 0;
2620 restart:
2621 #ifdef MORE_LOCKING_DEBUG
2622 if (so->so_usecount <= 1)
2623 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
2624 (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_usecount);
2625 #endif
2626 /*
2627 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2628 * and if so just return to the caller. This could happen when
2629 * soreceive() is called by a socket upcall function during the
2630 * time the socket is freed. The socket buffer would have been
2631 * locked across the upcall, therefore we cannot put this thread
2632 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2633 * we may livelock), because the lock on the socket buffer will
2634 * only be released when the upcall routine returns to its caller.
2635 * Because the socket has been officially closed, there can be
2636 * no further read on it.
2637 *
2638 * A multipath subflow socket would have its SS_NOFDREF set by
2639 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2640 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2641 */
2642 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2643 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2644 socket_unlock(so, 1);
2645 return (0);
2646 }
2647
2648 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
2649 if (error) {
2650 socket_unlock(so, 1);
2651 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2652 0, 0, 0, 0);
2653 return (error);
2654 }
2655
2656 m = so->so_rcv.sb_mb;
2657 /*
2658 * If we have less data than requested, block awaiting more
2659 * (subject to any timeout) if:
2660 * 1. the current count is less than the low water mark, or
2661 * 2. MSG_WAITALL is set, and it is possible to do the entire
2662 * receive operation at once if we block (resid <= hiwat), and
2663 * 3. MSG_DONTWAIT is not set.
2664 * If MSG_WAITALL is set but resid is larger than the receive buffer,
2665 * we have to do the receive in sections, and thus risk returning
2666 * a short count if a timeout or signal occurs after we start.
2667 */
2668 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
2669 so->so_rcv.sb_cc < uio_resid(uio)) &&
2670 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
2671 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
2672 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
2673 /*
2674 * Panic if we notice inconsistencies in the socket's
2675 * receive list; both sb_mb and sb_cc should correctly
2676 * reflect the contents of the list, otherwise we may
2677 * end up with false positives during select() or poll()
2678 * which could put the application in a bad state.
2679 */
2680 SB_MB_CHECK(&so->so_rcv);
2681
2682 if (so->so_error) {
2683 if (m != NULL)
2684 goto dontblock;
2685 error = so->so_error;
2686 if ((flags & MSG_PEEK) == 0)
2687 so->so_error = 0;
2688 goto release;
2689 }
2690 if (so->so_state & SS_CANTRCVMORE) {
2691 #if CONTENT_FILTER
2692 /*
2693 * Deal with half closed connections
2694 */
2695 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2696 cfil_sock_data_pending(&so->so_rcv) != 0)
2697 CFIL_LOG(LOG_INFO,
2698 "so %llx ignore SS_CANTRCVMORE",
2699 (uint64_t)VM_KERNEL_ADDRPERM(so));
2700 else
2701 #endif /* CONTENT_FILTER */
2702 if (m != NULL)
2703 goto dontblock;
2704 else
2705 goto release;
2706 }
2707 for (; m != NULL; m = m->m_next)
2708 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
2709 m = so->so_rcv.sb_mb;
2710 goto dontblock;
2711 }
2712 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
2713 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
2714 error = ENOTCONN;
2715 goto release;
2716 }
2717 if (uio_resid(uio) == 0)
2718 goto release;
2719 if ((so->so_state & SS_NBIO) ||
2720 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
2721 error = EWOULDBLOCK;
2722 goto release;
2723 }
2724 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
2725 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
2726 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
2727 #if EVEN_MORE_LOCKING_DEBUG
2728 if (socket_debug)
2729 printf("Waiting for socket data\n");
2730 #endif
2731
2732 error = sbwait(&so->so_rcv);
2733 #if EVEN_MORE_LOCKING_DEBUG
2734 if (socket_debug)
2735 printf("SORECEIVE - sbwait returned %d\n", error);
2736 #endif
2737 if (so->so_usecount < 1) {
2738 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
2739 __func__, so, so->so_usecount);
2740 /* NOTREACHED */
2741 }
2742 if (error) {
2743 socket_unlock(so, 1);
2744 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
2745 0, 0, 0, 0);
2746 return (error);
2747 }
2748 goto restart;
2749 }
2750 dontblock:
2751 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
2752 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
2753 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
2754 nextrecord = m->m_nextpkt;
2755 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
2756 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2757 #if CONFIG_MACF_SOCKET_SUBSET
2758 /*
2759 * Call the MAC framework for policy checking if we're in
2760 * the user process context and the socket isn't connected.
2761 */
2762 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2763 struct mbuf *m0 = m;
2764 /*
2765 * Dequeue this record (temporarily) from the receive
2766 * list since we're about to drop the socket's lock
2767 * where a new record may arrive and be appended to
2768 * the list. Upon MAC policy failure, the record
2769 * will be freed. Otherwise, we'll add it back to
2770 * the head of the list. We cannot rely on SB_LOCK
2771 * because the append operation uses the socket's lock.
2772 */
2773 do {
2774 m->m_nextpkt = NULL;
2775 sbfree(&so->so_rcv, m);
2776 m = m->m_next;
2777 } while (m != NULL);
2778 m = m0;
2779 so->so_rcv.sb_mb = nextrecord;
2780 SB_EMPTY_FIXUP(&so->so_rcv);
2781 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2782 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2783 socket_unlock(so, 0);
2784
2785 if (mac_socket_check_received(proc_ucred(p), so,
2786 mtod(m, struct sockaddr *)) != 0) {
2787 /*
2788 * MAC policy failure; free this record and
2789 * process the next record (or block until
2790 * one is available). We have adjusted sb_cc
2791 * and sb_mbcnt above so there is no need to
2792 * call sbfree() again.
2793 */
2794 do {
2795 m = m_free(m);
2796 } while (m != NULL);
2797 /*
2798 * Clear SB_LOCK but don't unlock the socket.
2799 * Process the next record or wait for one.
2800 */
2801 socket_lock(so, 0);
2802 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2803 goto restart;
2804 }
2805 socket_lock(so, 0);
2806 /*
2807 * If the socket has been defunct'd, drop it.
2808 */
2809 if (so->so_flags & SOF_DEFUNCT) {
2810 m_freem(m);
2811 error = ENOTCONN;
2812 goto release;
2813 }
2814 /*
2815 * Re-adjust the socket receive list and re-enqueue
2816 * the record in front of any packets which may have
2817 * been appended while we dropped the lock.
2818 */
2819 for (m = m0; m->m_next != NULL; m = m->m_next)
2820 sballoc(&so->so_rcv, m);
2821 sballoc(&so->so_rcv, m);
2822 if (so->so_rcv.sb_mb == NULL) {
2823 so->so_rcv.sb_lastrecord = m0;
2824 so->so_rcv.sb_mbtail = m;
2825 }
2826 m = m0;
2827 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2828 so->so_rcv.sb_mb = m;
2829 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2830 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2831 }
2832 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2833 orig_resid = 0;
2834 if (psa != NULL) {
2835 *psa = dup_sockaddr(mtod(m, struct sockaddr *),
2836 mp0 == NULL);
2837 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2838 error = EWOULDBLOCK;
2839 goto release;
2840 }
2841 }
2842 if (flags & MSG_PEEK) {
2843 m = m->m_next;
2844 } else {
2845 sbfree(&so->so_rcv, m);
2846 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2847 panic("%s: about to create invalid socketbuf",
2848 __func__);
2849 /* NOTREACHED */
2850 }
2851 MFREE(m, so->so_rcv.sb_mb);
2852 m = so->so_rcv.sb_mb;
2853 if (m != NULL) {
2854 m->m_nextpkt = nextrecord;
2855 } else {
2856 so->so_rcv.sb_mb = nextrecord;
2857 SB_EMPTY_FIXUP(&so->so_rcv);
2858 }
2859 }
2860 }
2861
2862 /*
2863 * Process one or more MT_CONTROL mbufs present before any data mbufs
2864 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2865 * just copy the data; if !MSG_PEEK, we call into the protocol to
2866 * perform externalization.
2867 */
2868 if (m != NULL && m->m_type == MT_CONTROL) {
2869 struct mbuf *cm = NULL, *cmn;
2870 struct mbuf **cme = &cm;
2871 struct sockbuf *sb_rcv = &so->so_rcv;
2872 struct mbuf **msgpcm = NULL;
2873
2874 /*
2875 * Externalizing the control messages would require us to
2876 * drop the socket's lock below. Once we re-acquire the
2877 * lock, the mbuf chain might change. In order to preserve
2878 * consistency, we unlink all control messages from the
2879 * first mbuf chain in one shot and link them separately
2880 * onto a different chain.
2881 */
2882 do {
2883 if (flags & MSG_PEEK) {
2884 if (controlp != NULL) {
2885 if (*controlp == NULL) {
2886 msgpcm = controlp;
2887 }
2888 *controlp = m_copy(m, 0, m->m_len);
2889
2890 /*
2891 * If we failed to allocate an mbuf,
2892 * release any previously allocated
2893 * mbufs for control data. Return
2894 * an error. Keep the mbufs in the
2895 * socket, as this is using the
2896 * MSG_PEEK flag.
2897 */
2898 if (*controlp == NULL) {
2899 m_freem(*msgpcm);
2900 error = ENOBUFS;
2901 goto release;
2902 }
2903 controlp = &(*controlp)->m_next;
2904 }
2905 m = m->m_next;
2906 } else {
2907 m->m_nextpkt = NULL;
2908 sbfree(sb_rcv, m);
2909 sb_rcv->sb_mb = m->m_next;
2910 m->m_next = NULL;
2911 *cme = m;
2912 cme = &(*cme)->m_next;
2913 m = sb_rcv->sb_mb;
2914 }
2915 } while (m != NULL && m->m_type == MT_CONTROL);
2916
2917 if (!(flags & MSG_PEEK)) {
2918 if (sb_rcv->sb_mb != NULL) {
2919 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2920 } else {
2921 sb_rcv->sb_mb = nextrecord;
2922 SB_EMPTY_FIXUP(sb_rcv);
2923 }
2924 if (nextrecord == NULL)
2925 sb_rcv->sb_lastrecord = m;
2926 }
2927
2928 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2929 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2930
2931 while (cm != NULL) {
2932 int cmsg_type;
2933
2934 cmn = cm->m_next;
2935 cm->m_next = NULL;
2936 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2937
2938 /*
2939 * Call the protocol to externalize SCM_RIGHTS message
2940 * and return the modified message to the caller upon
2941 * success. Otherwise, all other control messages are
2942 * returned unmodified to the caller. Note that we
2943 * only get into this loop if MSG_PEEK is not set.
2944 */
2945 if (pr->pr_domain->dom_externalize != NULL &&
2946 cmsg_type == SCM_RIGHTS) {
2947 /*
2948 * Release socket lock: see 3903171. This
2949 * would also allow more records to be appended
2950 * to the socket buffer. We still have SB_LOCK
2951 * set on it, so we can be sure that the head
2952 * of the mbuf chain won't change.
2953 */
2954 socket_unlock(so, 0);
2955 error = (*pr->pr_domain->dom_externalize)(cm);
2956 socket_lock(so, 0);
2957 } else {
2958 error = 0;
2959 }
2960
2961 if (controlp != NULL && error == 0) {
2962 *controlp = cm;
2963 controlp = &(*controlp)->m_next;
2964 orig_resid = 0;
2965 } else {
2966 (void) m_free(cm);
2967 }
2968 cm = cmn;
2969 }
2970 /*
2971 * Update the value of nextrecord in case we received new
2972 * records when the socket was unlocked above for
2973 * externalizing SCM_RIGHTS.
2974 */
2975 if (m != NULL)
2976 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2977 else
2978 nextrecord = sb_rcv->sb_mb;
2979 orig_resid = 0;
2980 }
2981
2982 /*
2983 * If the socket is a TCP socket with message delivery
2984 * enabled, then create a control msg to deliver the
2985 * relative TCP sequence number for this data. Waiting
2986 * until this point will protect against failures to
2987 * allocate an mbuf for control msgs.
2988 */
2989 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
2990 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
2991 struct mbuf *seq_cm;
2992
2993 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
2994 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
2995 if (seq_cm == NULL) {
2996 /* unable to allocate a control mbuf */
2997 error = ENOBUFS;
2998 goto release;
2999 }
3000 *controlp = seq_cm;
3001 controlp = &seq_cm->m_next;
3002 }
3003
3004 if (m != NULL) {
3005 if (!(flags & MSG_PEEK)) {
3006 /*
3007 * We get here because m points to an mbuf following
3008 * any MT_SONAME or MT_CONTROL mbufs which have been
3009 * processed above. In any case, m should be pointing
3010 * to the head of the mbuf chain, and the nextrecord
3011 * should be either NULL or equal to m->m_nextpkt.
3012 * See comments above about SB_LOCK.
3013 */
3014 if (m != so->so_rcv.sb_mb ||
3015 m->m_nextpkt != nextrecord) {
3016 panic("%s: post-control !sync so=%p m=%p "
3017 "nextrecord=%p\n", __func__, so, m,
3018 nextrecord);
3019 /* NOTREACHED */
3020 }
3021 if (nextrecord == NULL)
3022 so->so_rcv.sb_lastrecord = m;
3023 }
3024 type = m->m_type;
3025 if (type == MT_OOBDATA)
3026 flags |= MSG_OOB;
3027 } else {
3028 if (!(flags & MSG_PEEK)) {
3029 SB_EMPTY_FIXUP(&so->so_rcv);
3030 }
3031 }
3032 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3033 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3034
3035 moff = 0;
3036 offset = 0;
3037
3038 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3039 can_delay = 1;
3040 else
3041 can_delay = 0;
3042
3043 need_event = 0;
3044
3045 while (m != NULL &&
3046 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3047 if (m->m_type == MT_OOBDATA) {
3048 if (type != MT_OOBDATA)
3049 break;
3050 } else if (type == MT_OOBDATA) {
3051 break;
3052 }
3053 /*
3054 * Make sure to always set MSG_OOB when getting
3055 * out of band data inline.
3056 */
3057 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3058 (so->so_options & SO_OOBINLINE) != 0 &&
3059 (so->so_state & SS_RCVATMARK) != 0) {
3060 flags |= MSG_OOB;
3061 }
3062 so->so_state &= ~SS_RCVATMARK;
3063 len = uio_resid(uio) - delayed_copy_len;
3064 if (so->so_oobmark && len > so->so_oobmark - offset)
3065 len = so->so_oobmark - offset;
3066 if (len > m->m_len - moff)
3067 len = m->m_len - moff;
3068 /*
3069 * If mp is set, just pass back the mbufs.
3070 * Otherwise copy them out via the uio, then free.
3071 * Sockbuf must be consistent here (points to current mbuf,
3072 * it points to next record) when we drop priority;
3073 * we must note any additions to the sockbuf when we
3074 * block interrupts again.
3075 */
3076 if (mp == NULL) {
3077 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3078 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3079 if (can_delay && len == m->m_len) {
3080 /*
3081 * Only delay the copy if we're consuming the
3082 * mbuf, we're NOT in MSG_PEEK mode, and we
3083 * have enough data to make it worthwhile to
3084 * drop and retake the lock.  can_delay
3085 * reflects the state of the two latter
3086 * constraints; moff should always be zero
3087 * in these cases.
3088 */
3089 delayed_copy_len += len;
3090 } else {
3091 if (delayed_copy_len) {
3092 error = sodelayed_copy(so, uio,
3093 &free_list, &delayed_copy_len);
3094
3095 if (error) {
3096 goto release;
3097 }
3098 /*
3099 * We can only get here if MSG_PEEK is not
3100 * set; therefore, m should point at the
3101 * head of the rcv queue.  If it doesn't,
3102 * something changed drastically while we
3103 * were out from behind the lock in
3104 * sodelayed_copy(), perhaps a RST on the
3105 * stream.  In any event, the stream has
3106 * been interrupted; it's probably best
3107 * just to return whatever data we have
3108 * already moved and let the caller sort
3109 * it out...
3110 */
3111 if (m != so->so_rcv.sb_mb) {
3112 break;
3113 }
3114 }
3115 socket_unlock(so, 0);
3116 error = uiomove(mtod(m, caddr_t) + moff,
3117 (int)len, uio);
3118 socket_lock(so, 0);
3119
3120 if (error)
3121 goto release;
3122 }
3123 } else {
3124 uio_setresid(uio, (uio_resid(uio) - len));
3125 }
3126 if (len == m->m_len - moff) {
3127 if (m->m_flags & M_EOR)
3128 flags |= MSG_EOR;
3129 if (flags & MSG_PEEK) {
3130 m = m->m_next;
3131 moff = 0;
3132 } else {
3133 nextrecord = m->m_nextpkt;
3134 sbfree(&so->so_rcv, m);
3135 m->m_nextpkt = NULL;
3136
3137 /*
3138 * If this packet is an unordered packet
3139 * (indicated by M_UNORDERED_DATA flag), remove
3140 * the additional bytes added to the
3141 * receive socket buffer size.
3142 */
3143 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3144 m->m_len &&
3145 (m->m_flags & M_UNORDERED_DATA) &&
3146 sbreserve(&so->so_rcv,
3147 so->so_rcv.sb_hiwat - m->m_len)) {
3148 if (so->so_msg_state->msg_uno_bytes >
3149 m->m_len) {
3150 so->so_msg_state->
3151 msg_uno_bytes -= m->m_len;
3152 } else {
3153 so->so_msg_state->
3154 msg_uno_bytes = 0;
3155 }
3156 m->m_flags &= ~M_UNORDERED_DATA;
3157 }
3158
3159 if (mp != NULL) {
3160 *mp = m;
3161 mp = &m->m_next;
3162 so->so_rcv.sb_mb = m = m->m_next;
3163 *mp = NULL;
3164 } else {
3165 if (free_list == NULL)
3166 free_list = m;
3167 else
3168 ml->m_next = m;
3169 ml = m;
3170 so->so_rcv.sb_mb = m = m->m_next;
3171 ml->m_next = NULL;
3172 }
3173 if (m != NULL) {
3174 m->m_nextpkt = nextrecord;
3175 if (nextrecord == NULL)
3176 so->so_rcv.sb_lastrecord = m;
3177 } else {
3178 so->so_rcv.sb_mb = nextrecord;
3179 SB_EMPTY_FIXUP(&so->so_rcv);
3180 }
3181 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3182 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3183 }
3184 } else {
3185 if (flags & MSG_PEEK) {
3186 moff += len;
3187 } else {
3188 if (mp != NULL) {
3189 int copy_flag;
3190
3191 if (flags & MSG_DONTWAIT)
3192 copy_flag = M_DONTWAIT;
3193 else
3194 copy_flag = M_WAIT;
3195 *mp = m_copym(m, 0, len, copy_flag);
3196 /*
3197 * Failed to allocate an mbuf?
3198 * Adjust uio_resid back, it was
3199 * adjusted down by len bytes which
3200 * we didn't copy over.
3201 */
3202 if (*mp == NULL) {
3203 uio_setresid(uio,
3204 (uio_resid(uio) + len));
3205 break;
3206 }
3207 }
3208 m->m_data += len;
3209 m->m_len -= len;
3210 so->so_rcv.sb_cc -= len;
3211 }
3212 }
3213 if (so->so_oobmark) {
3214 if ((flags & MSG_PEEK) == 0) {
3215 so->so_oobmark -= len;
3216 if (so->so_oobmark == 0) {
3217 so->so_state |= SS_RCVATMARK;
3218 /*
3219 * delay posting the actual event until
3220 * after any delayed copy processing
3221 * has finished
3222 */
3223 need_event = 1;
3224 break;
3225 }
3226 } else {
3227 offset += len;
3228 if (offset == so->so_oobmark)
3229 break;
3230 }
3231 }
3232 if (flags & MSG_EOR)
3233 break;
3234 /*
3235 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3236 * (for non-atomic socket), we must not quit until
3237 * "uio->uio_resid == 0" or an error termination.
3238 * If a signal/timeout occurs, return with a short
3239 * count but without error. Keep sockbuf locked
3240 * against other readers.
3241 */
3242 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3243 (uio_resid(uio) - delayed_copy_len) > 0 &&
3244 !sosendallatonce(so) && !nextrecord) {
3245 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3246 #if CONTENT_FILTER
3247 && cfil_sock_data_pending(&so->so_rcv) == 0
3248 #endif /* CONTENT_FILTER */
3249 ))
3250 goto release;
3251
3252 /*
3253 * Depending on the protocol (e.g. TCP), the following
3254 * might cause the socket lock to be dropped and later
3255 * be reacquired, and more data could have arrived and
3256 * have been appended to the receive socket buffer by
3257 * the time it returns. Therefore, we only sleep in
3258 * sbwait() below if and only if the socket buffer is
3259 * empty, in order to avoid a false sleep.
3260 */
3261 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3262 (((struct inpcb *)so->so_pcb)->inp_state !=
3263 INPCB_STATE_DEAD))
3264 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3265
3266 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3267 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3268
3269 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3270 error = 0;
3271 goto release;
3272 }
3273 /*
3274 * We have to wait until after we get back from the
3275 * sbwait() to do the copy, because we will drop the
3276 * lock if we have enough data that has been delayed.
3277 * By dropping the lock we open up a window allowing
3278 * the netisr thread to process the incoming packets
3279 * and to change the state of this socket.  We issue
3280 * the sbwait() because the socket is empty and we're
3281 * expecting the netisr thread to wake us up when more
3282 * packets arrive; if we allowed that processing to
3283 * happen and then called sbwait(), we could stall
3284 * forever with packets sitting in the socket if no
3285 * further packets arrive from the remote side.
3286 *
3287 * We want to copy before we've collected all the data
3288 * to satisfy this request, to allow the copy to overlap
3289 * the incoming packet processing on an MP system.
3290 */
3291 if (delayed_copy_len > sorecvmincopy &&
3292 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3293 error = sodelayed_copy(so, uio,
3294 &free_list, &delayed_copy_len);
3295
3296 if (error)
3297 goto release;
3298 }
3299 m = so->so_rcv.sb_mb;
3300 if (m != NULL) {
3301 nextrecord = m->m_nextpkt;
3302 }
3303 SB_MB_CHECK(&so->so_rcv);
3304 }
3305 }
3306 #ifdef MORE_LOCKING_DEBUG
3307 if (so->so_usecount <= 1) {
3308 panic("%s: after big while so=%p ref=%d on socket\n",
3309 __func__, so, so->so_usecount);
3310 /* NOTREACHED */
3311 }
3312 #endif
3313
3314 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3315 if (so->so_options & SO_DONTTRUNC) {
3316 flags |= MSG_RCVMORE;
3317 } else {
3318 flags |= MSG_TRUNC;
3319 if ((flags & MSG_PEEK) == 0)
3320 (void) sbdroprecord(&so->so_rcv);
3321 }
3322 }
3323
3324 /*
3325 * pru_rcvd below (for TCP) may cause more data to be received
3326 * if the socket lock is dropped prior to sending the ACK; some
3327 * legacy OpenTransport applications don't handle this well
3328 * (if it receives less data than requested while MSG_HAVEMORE
3329 * is set), and so we set the flag now based on what we know
3330 * prior to calling pru_rcvd.
3331 */
3332 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3333 flags |= MSG_HAVEMORE;
3334
3335 if ((flags & MSG_PEEK) == 0) {
3336 if (m == NULL) {
3337 so->so_rcv.sb_mb = nextrecord;
3338 /*
3339 * First part is an inline SB_EMPTY_FIXUP(). Second
3340 * part makes sure sb_lastrecord is up-to-date if
3341 * there is still data in the socket buffer.
3342 */
3343 if (so->so_rcv.sb_mb == NULL) {
3344 so->so_rcv.sb_mbtail = NULL;
3345 so->so_rcv.sb_lastrecord = NULL;
3346 } else if (nextrecord->m_nextpkt == NULL) {
3347 so->so_rcv.sb_lastrecord = nextrecord;
3348 }
3349 SB_MB_CHECK(&so->so_rcv);
3350 }
3351 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3352 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3353 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3354 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3355 }
3356
3357 if (delayed_copy_len) {
3358 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3359 if (error)
3360 goto release;
3361 }
3362 if (free_list != NULL) {
3363 m_freem_list(free_list);
3364 free_list = NULL;
3365 }
3366 if (need_event)
3367 postevent(so, 0, EV_OOB);
3368
3369 if (orig_resid == uio_resid(uio) && orig_resid &&
3370 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3371 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3372 goto restart;
3373 }
3374
3375 if (flagsp != NULL)
3376 *flagsp |= flags;
3377 release:
3378 #ifdef MORE_LOCKING_DEBUG
3379 if (so->so_usecount <= 1) {
3380 panic("%s: release so=%p ref=%d on socket\n", __func__,
3381 so, so->so_usecount);
3382 /* NOTREACHED */
3383 }
3384 #endif
3385 if (delayed_copy_len)
3386 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3387
3388 if (free_list != NULL)
3389 m_freem_list(free_list);
3390
3391 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3392
3393 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3394 so->so_rcv.sb_cc, 0, error);
3395
3396 return (error);
3397 }
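
/*
 * Editor's illustrative sketch -- not part of this file.  The record
 * layout soreceive() walks above (optional address, optional MT_CONTROL
 * mbufs, then data) is what recvmsg() hands back to user space as
 * msg_name, ancillary data and the iovec payload; SCM_RIGHTS is the
 * control type that dom_externalize turns into usable descriptors.
 * Buffer sizes here are arbitrary.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <stdio.h>
#include <string.h>

static void
recv_one_record(int fd)
{
	char data[2048];
	char cbuf[256];
	struct sockaddr_storage from;
	struct iovec iov = { .iov_base = data, .iov_len = sizeof (data) };
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof (msg));
	msg.msg_name = &from;
	msg.msg_namelen = sizeof (from);
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof (cbuf);

	if (recvmsg(fd, &msg, 0) == -1)
		return;

	/* Walk the control messages the kernel externalized for us */
	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS)
			printf("received passed file descriptors\n");
	}

	/* MSG_TRUNC mirrors the PR_ATOMIC truncation handling above */
	if (msg.msg_flags & MSG_TRUNC)
		printf("datagram was truncated\n");
}
#endif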
3398
3399 /*
3400 * Returns: 0 Success
3401 * uiomove:EFAULT
3402 */
3403 static int
3404 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3405 user_ssize_t *resid)
3406 {
3407 int error = 0;
3408 struct mbuf *m;
3409
3410 m = *free_list;
3411
3412 socket_unlock(so, 0);
3413
3414 while (m != NULL && error == 0) {
3415 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3416 m = m->m_next;
3417 }
3418 m_freem_list(*free_list);
3419
3420 *free_list = NULL;
3421 *resid = 0;
3422
3423 socket_lock(so, 0);
3424
3425 return (error);
3426 }
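
/*
 * Editor's illustrative sketch -- not part of this file.  The pattern
 * sodelayed_copy() implements: consumed buffers are collected on a
 * private list while the lock is held, and the expensive copy-out and
 * free happen with the lock dropped, after which the lock is retaken.
 * The types and names below are hypothetical stand-ins for the mbuf
 * free list and the socket lock.
 */
#if 0
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct buf {
	struct buf	*next;
	size_t		 len;
	char		*data;
};

static size_t
delayed_copy(pthread_mutex_t *lock, struct buf **list, char *dst,
    size_t dstlen)
{
	struct buf *b = *list;
	size_t copied = 0;

	*list = NULL;

	/* Copy and free outside the lock, as sodelayed_copy() does */
	pthread_mutex_unlock(lock);
	while (b != NULL) {
		struct buf *next = b->next;

		if (copied + b->len <= dstlen) {
			memcpy(dst + copied, b->data, b->len);
			copied += b->len;
		}
		free(b->data);
		free(b);
		b = next;
	}
	pthread_mutex_lock(lock);

	return (copied);
}
#endif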
3427
3428 int
3429 soreceive_list(struct socket *so, struct sockaddr **psa, struct uio **uioarray,
3430 u_int uiocnt, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3431 {
3432 struct mbuf *m, **mp;
3433 struct mbuf *nextrecord;
3434 struct mbuf *ml = NULL, *free_list = NULL;
3435 int flags, error, offset;
3436 user_ssize_t len;
3437 struct protosw *pr = so->so_proto;
3438 user_ssize_t orig_resid, resid;
3439 struct proc *p = current_proc();
3440 struct uio *auio = NULL;
3441 int i = 0;
3442 int sblocked = 0;
3443
3444 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3445 so, uiocnt,
3446 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3447
3448 mp = mp0;
3449 if (psa != NULL)
3450 *psa = NULL;
3451 if (controlp != NULL)
3452 *controlp = NULL;
3453 if (flagsp != NULL)
3454 flags = *flagsp &~ MSG_EOR;
3455 else
3456 flags = 0;
3457 /*
3458 * Disallow functionality not currently supported
3459 */
3460 if (mp0 != NULL) {
3461 printf("%s mp0 not supported\n", __func__);
3462 error = EOPNOTSUPP;
3463 goto out;
3464 }
3465 if (psa != NULL) {
3466 printf("%s sockaddr not supported\n", __func__);
3467 error = EOPNOTSUPP;
3468 goto out;
3469 }
3470 if (controlp != NULL) {
3471 printf("%s control not supported\n", __func__);
3472 error = EOPNOTSUPP;
3473 goto out;
3474 }
3475
3476 /*
3477 * Sanity checks:
3478 * - Only supports don't-wait flags
3479 * - Only supports datagram sockets (could be extended to raw)
3480 * - Must be atomic
3481 * - Protocol must support packet chains
3482 * - The uio array must not be NULL (should we panic?)
3483 */
3484 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
3485 printf("%s flags not supported\n", __func__);
3486 error = EOPNOTSUPP;
3487 goto out;
3488 }
3489 if (so->so_type != SOCK_DGRAM) {
3490 error = EINVAL;
3491 goto out;
3492 }
3493 if (sosendallatonce(so) == 0) {
3494 error = EINVAL;
3495 goto out;
3496 }
3497 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3498 error = EPROTONOSUPPORT;
3499 goto out;
3500 }
3501 if (uioarray == NULL) {
3502 printf("%s uioarray is NULL\n", __func__);
3503 error = EINVAL;
3504 goto out;
3505 }
3506 if (uiocnt == 0) {
3507 printf("%s uiocnt is 0\n", __func__);
3508 error = EINVAL;
3509 goto out;
3510 }
3511 /*
3512 * Sanity check on the length passed by caller as we are making 'int'
3513 * comparisons
3514 */
3515 resid = orig_resid = uio_array_resid(uioarray, uiocnt);
3516 if (orig_resid < 0 || orig_resid > INT_MAX) {
3517 error = EINVAL;
3518 goto out;
3519 }
3520
3521 socket_lock(so, 1);
3522 so_update_last_owner_locked(so, p);
3523 so_update_policy(so);
3524
3525 #if NECP
3526 so_update_necp_policy(so, NULL, NULL);
3527 #endif /* NECP */
3528
3529 /*
3530 * If a recv attempt is made on a previously-accepted socket
3531 * that has been marked as inactive (disconnected), reject
3532 * the request.
3533 */
3534 if (so->so_flags & SOF_DEFUNCT) {
3535 struct sockbuf *sb = &so->so_rcv;
3536
3537 error = ENOTCONN;
3538 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
3539 __func__, proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so),
3540 SOCK_DOM(so), SOCK_TYPE(so), error));
3541 /*
3542 * This socket should have been disconnected and flushed
3543 * prior to being returned from sodefunct(); there should
3544 * be no data on its receive list, so panic otherwise.
3545 */
3546 if (so->so_state & SS_DEFUNCT)
3547 sb_empty_assert(sb, __func__);
3548 goto release;
3549 }
3550 if (mp != NULL)
3551 *mp = NULL;
3552 restart:
3553 /*
3554 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3555 * and if so just return to the caller. This could happen when
3556 * soreceive() is called by a socket upcall function during the
3557 * time the socket is freed. The socket buffer would have been
3558 * locked across the upcall, therefore we cannot put this thread
3559 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3560 * we may livelock), because the lock on the socket buffer will
3561 * only be released when the upcall routine returns to its caller.
3562 * Because the socket has been officially closed, there can be
3563 * no further read on it.
3564 */
3565 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3566 (SS_NOFDREF | SS_CANTRCVMORE)) {
3567 error = 0;
3568 goto release;
3569 }
3570
3571 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3572 if (error) {
3573 goto release;
3574 }
3575 sblocked = 1;
3576
3577 /*
3578 * Skip empty uio
3579 */
3580 auio = uioarray[i];
3581 while (uio_resid(auio) == 0) {
3582 i++;
3583 if (i >= uiocnt) {
3584 error = 0;
3585 goto release;
3586 }
3587 }
3588
3589 m = so->so_rcv.sb_mb;
3590 /*
3591 * Block awaiting more datagram if needed
3592 */
3593 if (m == NULL) {
3594 /*
3595 * Panic if we notice inconsistencies in the socket's
3596 * receive list; both sb_mb and sb_cc should correctly
3597 * reflect the contents of the list, otherwise we may
3598 * end up with false positives during select() or poll()
3599 * which could put the application in a bad state.
3600 */
3601 SB_MB_CHECK(&so->so_rcv);
3602
3603 if (so->so_error) {
3604 error = so->so_error;
3605 goto release;
3606 }
3607 if (so->so_state & SS_CANTRCVMORE) {
3608 goto release;
3609 }
3610 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3611 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3612 error = ENOTCONN;
3613 goto release;
3614 }
3615 if ((so->so_state & SS_NBIO) ||
3616 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3617 error = EWOULDBLOCK;
3618 goto release;
3619 }
3620 /*
3621 * Do not block if we got some data
3622 * Note: We could use MSG_WAITALL to wait
3623 */
3624 resid = uio_array_resid(uioarray, uiocnt);
3625 if (resid != orig_resid) {
3626 error = 0;
3627 goto release;
3628 }
3629
3630 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3631 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3632
3633 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3634 sblocked = 0;
3635
3636 error = sbwait(&so->so_rcv);
3637 if (error) {
3638 goto release;
3639 }
3640 goto restart;
3641 }
3642
3643 if (m->m_pkthdr.len == 0) {
3644 printf("%s so %llx pkt %llx len is null\n",
3645 __func__,
3646 (uint64_t)VM_KERNEL_ADDRPERM(so),
3647 (uint64_t)VM_KERNEL_ADDRPERM(m));
3648 goto restart;
3649 }
3650 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3651 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3652 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3653
3654 /*
3655 * Consume the current uio index as we have a datagram
3656 */
3657 i += 1;
3658 nextrecord = m->m_nextpkt;
3659
3660 #if SO_RECEIVE_LIST_SOCKADDR_NOT_YET
3661 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3662 /*
3663 * to be adapted from soreceive()
3664 */
3665 }
3666 #endif /* SO_RECEIVE_LIST_SOCKADDR_NOT_YET */
3667
3668 #if SO_RECEIVE_LIST_CONTROL_NOT_YET
3669 /*
3670 * Process one or more MT_CONTROL mbufs present before any data mbufs
3671 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3672 * just copy the data; if !MSG_PEEK, we call into the protocol to
3673 * perform externalization.
3674 */
3675 if (m != NULL && m->m_type == MT_CONTROL) {
3676 /*
3677 * to be adapted from soreceive()
3678 */
3679 }
3680 #endif /* SO_RECEIVE_LIST_CONTROL_NOT_YET */
3681
3682 offset = 0;
3683
3684 /*
3685 * Loop to copy out the mbufs of the current record
3686 */
3687 while (m != NULL && uio_resid(auio) > 0 && error == 0) {
3688 len = uio_resid(auio);
3689
3690 if (m->m_len == 0)
3691 printf("%s: so %llx m %llx m_len is 0\n",
3692 __func__,
3693 (uint64_t)VM_KERNEL_ADDRPERM(so),
3694 (uint64_t)VM_KERNEL_ADDRPERM(m));
3695
3696 /*
3697 * Clip to the residual length
3698 */
3699 if (len > m->m_len)
3700 len = m->m_len;
3701 /*
3702 * If mp is set, just pass back the mbufs.
3703 * Otherwise copy them out via the uio, then free.
3704 * Sockbuf must be consistent here (points to current mbuf,
3705 * it points to next record) when we drop priority;
3706 * we must note any additions to the sockbuf when we
3707 * block interrupts again.
3708 */
3709 if (mp != NULL) {
3710 uio_setresid(auio, (uio_resid(auio) - len));
3711 } else {
3712 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3713 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3714
3715 socket_unlock(so, 0);
3716 error = uiomove(mtod(m, caddr_t), (int)len, auio);
3717 socket_lock(so, 0);
3718
3719 if (error)
3720 goto release;
3721 }
3722 if (len == m->m_len) {
3723 /*
3724 * m was entirely copied
3725 */
3726 nextrecord = m->m_nextpkt;
3727 sbfree(&so->so_rcv, m);
3728 m->m_nextpkt = NULL;
3729
3730 /*
3731 * Move to m_next
3732 */
3733 if (mp != NULL) {
3734 *mp = m;
3735 mp = &m->m_next;
3736 so->so_rcv.sb_mb = m = m->m_next;
3737 *mp = NULL;
3738 } else {
3739 if (free_list == NULL)
3740 free_list = m;
3741 else
3742 ml->m_next = m;
3743 ml = m;
3744 so->so_rcv.sb_mb = m = m->m_next;
3745 ml->m_next = NULL;
3746 ml->m_nextpkt = NULL;
3747 }
3748 if (m != NULL) {
3749 m->m_nextpkt = nextrecord;
3750 if (nextrecord == NULL)
3751 so->so_rcv.sb_lastrecord = m;
3752 } else {
3753 so->so_rcv.sb_mb = nextrecord;
3754 SB_EMPTY_FIXUP(&so->so_rcv);
3755 }
3756 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3757 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3758 } else {
3759 /*
3760 * Stop the loop on partial copy
3761 */
3762 if (mp != NULL) {
3763 int copy_flag;
3764
3765 if (flags & MSG_DONTWAIT)
3766 copy_flag = M_DONTWAIT;
3767 else
3768 copy_flag = M_WAIT;
3769 *mp = m_copym(m, 0, len, copy_flag);
3770 /*
3771 * Failed to allocate an mbuf?
3772 * Adjust uio_resid back, it was
3773 * adjusted down by len bytes which
3774 * we didn't copy over.
3775 */
3776 if (*mp == NULL) {
3777 uio_setresid(auio,
3778 (uio_resid(auio) + len));
3779 error = ENOMEM;
3780 break;
3781 }
3782 }
3783 break;
3784 }
3785 }
3786 #ifdef MORE_LOCKING_DEBUG
3787 if (so->so_usecount <= 1) {
3788 panic("%s: after big while so=%llx ref=%d on socket\n",
3789 __func__,
3790 (uint64_t)VM_KERNEL_ADDRPERM(so), so->so_usecount);
3791 /* NOTREACHED */
3792 }
3793 #endif
3794 /*
3795 * Tell the caller we made a partial copy
3796 */
3797 if (m != NULL) {
3798 if (so->so_options & SO_DONTTRUNC) {
3799 m->m_data += len;
3800 m->m_len -= len;
3801 so->so_rcv.sb_cc -= len;
3802 flags |= MSG_RCVMORE;
3803 } else {
3804 (void) sbdroprecord(&so->so_rcv);
3805 nextrecord = so->so_rcv.sb_mb;
3806 m = NULL;
3807 flags |= MSG_TRUNC;
3808 }
3809 }
3810
3811 if (m == NULL) {
3812 so->so_rcv.sb_mb = nextrecord;
3813 /*
3814 * First part is an inline SB_EMPTY_FIXUP(). Second
3815 * part makes sure sb_lastrecord is up-to-date if
3816 * there is still data in the socket buffer.
3817 */
3818 if (so->so_rcv.sb_mb == NULL) {
3819 so->so_rcv.sb_mbtail = NULL;
3820 so->so_rcv.sb_lastrecord = NULL;
3821 } else if (nextrecord->m_nextpkt == NULL) {
3822 so->so_rcv.sb_lastrecord = nextrecord;
3823 }
3824 SB_MB_CHECK(&so->so_rcv);
3825 }
3826 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3827 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3828
3829 /*
3830 * We can continue to the next packet as long as:
3831 * - We haven't exhausted the uio array
3832 * - There was no error
3833 * - A packet was not truncated
3834 * - We can still receive more data
3835 */
3836 if (i < uiocnt && error == 0 &&
3837 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0
3838 && (so->so_state & SS_CANTRCVMORE) == 0) {
3839 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3840 sblocked = 0;
3841
3842 goto restart;
3843 }
3844
3845 release:
3846 /*
3847 * pru_rcvd may cause more data to be received if the socket lock
3848 * is dropped so we set MSG_HAVEMORE now based on what we know.
3849 * That way the caller won't be surprised if it receives less data than requested.
3850 */
3851 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3852 flags |= MSG_HAVEMORE;
3853
3854 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3855 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3856
3857 if (flagsp != NULL)
3858 *flagsp |= flags;
3859 if (sblocked)
3860 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3861 else
3862 socket_unlock(so, 1);
3863 out:
3864 /*
3865 * Amortize the cost of freeing the mbufs
3866 */
3867 if (free_list != NULL)
3868 m_freem_list(free_list);
3869
3870 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
3871 0, 0, 0, 0);
3872 return (error);
3873 }
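
/*
 * Editor's illustrative sketch -- not part of this file.  The
 * observable behaviour of the batched receive path above, approximated
 * with portable calls: one datagram per buffer, stopping when every
 * buffer is used or the socket would block.  Names and sizes are
 * arbitrary.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>

#define NBUFS	8
#define BUFSZ	2048

static int
recv_datagram_batch(int fd, char bufs[NBUFS][BUFSZ], ssize_t lens[NBUFS])
{
	int i;

	for (i = 0; i < NBUFS; i++) {
		lens[i] = recv(fd, bufs[i], BUFSZ, MSG_DONTWAIT);
		if (lens[i] == -1) {
			if (errno == EWOULDBLOCK || errno == EAGAIN)
				break;	/* no more datagrams queued */
			return (-1);	/* real error */
		}
	}
	return (i);	/* number of datagrams received */
}
#endif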
3874
3875 /*
3876 * Returns: 0 Success
3877 * EINVAL
3878 * ENOTCONN
3879 * <pru_shutdown>:EINVAL
3880 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
3881 * <pru_shutdown>:ENOBUFS[TCP]
3882 * <pru_shutdown>:EMSGSIZE[TCP]
3883 * <pru_shutdown>:EHOSTUNREACH[TCP]
3884 * <pru_shutdown>:ENETUNREACH[TCP]
3885 * <pru_shutdown>:ENETDOWN[TCP]
3886 * <pru_shutdown>:ENOMEM[TCP]
3887 * <pru_shutdown>:EACCES[TCP]
3888 * <pru_shutdown>:EMSGSIZE[TCP]
3889 * <pru_shutdown>:ENOBUFS[TCP]
3890 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
3891 * <pru_shutdown>:??? [other protocol families]
3892 */
3893 int
3894 soshutdown(struct socket *so, int how)
3895 {
3896 int error;
3897
3898 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
3899
3900 switch (how) {
3901 case SHUT_RD:
3902 case SHUT_WR:
3903 case SHUT_RDWR:
3904 socket_lock(so, 1);
3905 if ((so->so_state &
3906 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
3907 error = ENOTCONN;
3908 } else {
3909 error = soshutdownlock(so, how);
3910 }
3911 socket_unlock(so, 1);
3912 break;
3913 default:
3914 error = EINVAL;
3915 break;
3916 }
3917
3918 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
3919
3920 return (error);
3921 }
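
/*
 * Editor's illustrative sketch -- not part of this file.  The usual
 * half-close sequence that ends up in soshutdown() above: SHUT_WR sends
 * a FIN on TCP but leaves the read side open so the peer's remaining
 * data can still be drained.  As checked above, shutdown() on a socket
 * that is not connected fails with ENOTCONN.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

static void
half_close_and_drain(int fd)
{
	char buf[4096];
	ssize_t n;

	/* We are done writing; let the peer see EOF */
	(void) shutdown(fd, SHUT_WR);

	/* The read side stays open until the peer closes its end */
	while ((n = read(fd, buf, sizeof (buf))) > 0)
		;	/* consume (or process) the remaining data */
}
#endif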
3922
3923 int
3924 soshutdownlock_final(struct socket *so, int how)
3925 {
3926 struct protosw *pr = so->so_proto;
3927 int error = 0;
3928
3929 sflt_notify(so, sock_evt_shutdown, &how);
3930
3931 if (how != SHUT_WR) {
3932 if ((so->so_state & SS_CANTRCVMORE) != 0) {
3933 /* read already shut down */
3934 error = ENOTCONN;
3935 goto done;
3936 }
3937 sorflush(so);
3938 postevent(so, 0, EV_RCLOSED);
3939 }
3940 if (how != SHUT_RD) {
3941 if ((so->so_state & SS_CANTSENDMORE) != 0) {
3942 /* write already shut down */
3943 error = ENOTCONN;
3944 goto done;
3945 }
3946 error = (*pr->pr_usrreqs->pru_shutdown)(so);
3947 postevent(so, 0, EV_WCLOSED);
3948 }
3949 done:
3950 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
3951 return (error);
3952 }
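
/*
 * Editor's illustrative sketch -- not part of this file.  How
 * soshutdownlock_final() decomposes "how" into the two directions:
 * anything other than SHUT_WR closes the read side, anything other
 * than SHUT_RD closes the write side, so SHUT_RDWR does both.  The
 * helper below is hypothetical and exists only to make that mapping
 * explicit.
 */
#if 0
#include <sys/socket.h>
#include <stdbool.h>

static void
shutdown_directions(int how, bool *shut_read, bool *shut_write)
{
	*shut_read = (how != SHUT_WR);		/* SHUT_RD or SHUT_RDWR */
	*shut_write = (how != SHUT_RD);		/* SHUT_WR or SHUT_RDWR */
}
#endif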
3953
3954 int
3955 soshutdownlock(struct socket *so, int how)
3956 {
3957 int error = 0;
3958
3959 #if CONTENT_FILTER
3960 /*
3961 * A content filter may delay the actual shutdown until it
3962 * has processed the pending data
3963 */
3964 if (so->so_flags & SOF_CONTENT_FILTER) {
3965 error = cfil_sock_shutdown(so, &how);
3966 if (error == EJUSTRETURN) {
3967 error = 0;
3968 goto done;
3969 } else if (error != 0) {
3970 goto done;
3971 }
3972 }
3973 #endif /* CONTENT_FILTER */
3974
3975 error = soshutdownlock_final(so, how);
3976
3977 done:
3978 return (error);
3979 }
3980
3981 void
3982 sowflush(struct socket *so)
3983 {
3984 struct sockbuf *sb = &so->so_snd;
3985 #ifdef notyet
3986 lck_mtx_t *mutex_held;
3987 /*
3988 * XXX: This code is currently commented out, because we may get here
3989 * as part of sofreelastref(), and at that time, pr_getlock() may no
3990 * longer be able to return us the lock; this will be fixed in future.
3991 */
3992 if (so->so_proto->pr_getlock != NULL)
3993 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
3994 else
3995 mutex_held = so->so_proto->pr_domain->dom_mtx;
3996
3997 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
3998 #endif /* notyet */
3999
4000 /*
4001 * Obtain lock on the socket buffer (SB_LOCK). This is required
4002 * to prevent the socket buffer from being unexpectedly altered
4003 * while it is used by another thread in socket send/receive.
4004 *
4005 * sblock() must not fail here, hence the assertion.
4006 */
4007 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4008 VERIFY(sb->sb_flags & SB_LOCK);
4009
4010 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4011 sb->sb_flags |= SB_DROP;
4012 sb->sb_upcall = NULL;
4013 sb->sb_upcallarg = NULL;
4014
4015 sbunlock(sb, TRUE); /* keep socket locked */
4016
4017 selthreadclear(&sb->sb_sel);
4018 sbrelease(sb);
4019 }
4020
4021 void
4022 sorflush(struct socket *so)
4023 {
4024 struct sockbuf *sb = &so->so_rcv;
4025 struct protosw *pr = so->so_proto;
4026 struct sockbuf asb;
4027 #ifdef notyet
4028 lck_mtx_t *mutex_held;
4029 /*
4030 * XXX: This code is currently commented out, because we may get here
4031 * as part of sofreelastref(), and at that time, pr_getlock() may no
4032 * longer be able to return us the lock; this will be fixed in future.
4033 */
4034 if (so->so_proto->pr_getlock != NULL)
4035 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4036 else
4037 mutex_held = so->so_proto->pr_domain->dom_mtx;
4038
4039 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4040 #endif /* notyet */
4041
4042 sflt_notify(so, sock_evt_flush_read, NULL);
4043
4044 socantrcvmore(so);
4045
4046 /*
4047 * Obtain lock on the socket buffer (SB_LOCK). This is required
4048 * to prevent the socket buffer from being unexpectedly altered
4049 * while it is used by another thread in socket send/receive.
4050 *
4051 * sblock() must not fail here, hence the assertion.
4052 */
4053 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4054 VERIFY(sb->sb_flags & SB_LOCK);
4055
4056 /*
4057 * Copy only the relevant fields from "sb" to "asb" which we
4058 * need for sbrelease() to function. In particular, skip
4059 * sb_sel as it contains the wait queue linkage, which would
4060 * wreak havoc if we were to issue selthreadclear() on "asb".
4061 * Make sure to not carry over SB_LOCK in "asb", as we need
4062 * to acquire it later as part of sbrelease().
4063 */
4064 bzero(&asb, sizeof (asb));
4065 asb.sb_cc = sb->sb_cc;
4066 asb.sb_hiwat = sb->sb_hiwat;
4067 asb.sb_mbcnt = sb->sb_mbcnt;
4068 asb.sb_mbmax = sb->sb_mbmax;
4069 asb.sb_ctl = sb->sb_ctl;
4070 asb.sb_lowat = sb->sb_lowat;
4071 asb.sb_mb = sb->sb_mb;
4072 asb.sb_mbtail = sb->sb_mbtail;
4073 asb.sb_lastrecord = sb->sb_lastrecord;
4074 asb.sb_so = sb->sb_so;
4075 asb.sb_flags = sb->sb_flags;
4076 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4077 asb.sb_flags |= SB_DROP;
4078
4079 /*
4080 * Ideally we'd bzero() these and preserve the ones we need;
4081 * but to do that we'd need to shuffle things around in the
4082 * sockbuf, and we can't do it now because there are KEXTS
4083 * that are directly referring to the socket structure.
4084 *
4085 * Setting SB_DROP acts as a barrier to prevent further appends.
4086 * Clearing SB_SEL is done for selthreadclear() below.
4087 */
4088 sb->sb_cc = 0;
4089 sb->sb_hiwat = 0;
4090 sb->sb_mbcnt = 0;
4091 sb->sb_mbmax = 0;
4092 sb->sb_ctl = 0;
4093 sb->sb_lowat = 0;
4094 sb->sb_mb = NULL;
4095 sb->sb_mbtail = NULL;
4096 sb->sb_lastrecord = NULL;
4097 sb->sb_timeo.tv_sec = 0;
4098 sb->sb_timeo.tv_usec = 0;
4099 sb->sb_upcall = NULL;
4100 sb->sb_upcallarg = NULL;
4101 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4102 sb->sb_flags |= SB_DROP;
4103
4104 sbunlock(sb, TRUE); /* keep socket locked */
4105
4106 /*
4107 * Note that selthreadclear() is called on the original "sb" and
4108 * not the local "asb" because of the way wait queue linkage is
4109 * implemented. Given that selwakeup() may be triggered, SB_SEL
4110 * should no longer be set (cleared above.)
4111 */
4112 selthreadclear(&sb->sb_sel);
4113
4114 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4115 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4116
4117 sbrelease(&asb);
4118 }
4119
4120 /*
4121 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4122 * an additional variant to handle the case where the option value needs
4123 * to be some kind of integer, but not a specific size.
4124 * In addition to their use here, these functions are also called by the
4125 * protocol-level pr_ctloutput() routines.
4126 *
4127 * Returns: 0 Success
4128 * EINVAL
4129 * copyin:EFAULT
4130 */
4131 int
4132 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4133 {
4134 size_t valsize;
4135
4136 /*
4137 * If the user gives us more than we wanted, we ignore it,
4138 * but if we don't get the minimum length the caller
4139 * wants, we return EINVAL. On success, sopt->sopt_valsize
4140 * is set to however much we actually retrieved.
4141 */
4142 if ((valsize = sopt->sopt_valsize) < minlen)
4143 return (EINVAL);
4144 if (valsize > len)
4145 sopt->sopt_valsize = valsize = len;
4146
4147 if (sopt->sopt_p != kernproc)
4148 return (copyin(sopt->sopt_val, buf, valsize));
4149
4150 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4151 return (0);
4152 }
4153
4154 /*
4155 * sooptcopyin_timeval
4156 * Copy in a timeval value into tv_p, and take into account whether
4157 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4158 * code here so that we can verify the 64-bit tv_sec value before we lose
4159 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4160 */
4161 static int
4162 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4163 {
4164 int error;
4165
4166 if (proc_is64bit(sopt->sopt_p)) {
4167 struct user64_timeval tv64;
4168
4169 if (sopt->sopt_valsize < sizeof (tv64))
4170 return (EINVAL);
4171
4172 sopt->sopt_valsize = sizeof (tv64);
4173 if (sopt->sopt_p != kernproc) {
4174 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4175 if (error != 0)
4176 return (error);
4177 } else {
4178 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4179 sizeof (tv64));
4180 }
4181 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4182 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4183 return (EDOM);
4184
4185 tv_p->tv_sec = tv64.tv_sec;
4186 tv_p->tv_usec = tv64.tv_usec;
4187 } else {
4188 struct user32_timeval tv32;
4189
4190 if (sopt->sopt_valsize < sizeof (tv32))
4191 return (EINVAL);
4192
4193 sopt->sopt_valsize = sizeof (tv32);
4194 if (sopt->sopt_p != kernproc) {
4195 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4196 if (error != 0) {
4197 return (error);
4198 }
4199 } else {
4200 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4201 sizeof (tv32));
4202 }
4203 #ifndef __LP64__
4204 /*
4205 * K64todo "comparison is always false due to
4206 * limited range of data type"
4207 */
4208 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4209 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4210 return (EDOM);
4211 #endif
4212 tv_p->tv_sec = tv32.tv_sec;
4213 tv_p->tv_usec = tv32.tv_usec;
4214 }
4215 return (0);
4216 }
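
/*
 * Illustrative user-space sketch (not part of this file): the timeval
 * copied in above typically originates from a setsockopt(2) call such as
 * the one below; the 32-/64-bit handling is invisible to the caller.
 *
 *        // hypothetical example; 's' is any socket descriptor
 *        struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *        if (setsockopt(s, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)) == -1)
 *                perror("setsockopt");  // EDOM if the timeval is out of range
 */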
4217
4218 /*
4219 * Returns: 0 Success
4220 * EINVAL
4221 * ENOPROTOOPT
4222 * ENOBUFS
4223 * EDOM
4224 * sooptcopyin:EINVAL
4225 * sooptcopyin:EFAULT
4226 * sooptcopyin_timeval:EINVAL
4227 * sooptcopyin_timeval:EFAULT
4228 * sooptcopyin_timeval:EDOM
4229 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4230 * <pr_ctloutput>:???
4231 * sflt_attach_private:??? [whatever a filter author chooses]
4232 * <sf_setoption>:??? [whatever a filter author chooses]
4233 *
4234 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4235 * <sf_setoption> returns depend on what the filter author causes
4236 * their filter to return.
4237 */
4238 int
4239 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4240 {
4241 int error, optval;
4242 struct linger l;
4243 struct timeval tv;
4244 #if CONFIG_MACF_SOCKET
4245 struct mac extmac;
4246 #endif /* MAC_SOCKET */
4247
4248 if (sopt->sopt_dir != SOPT_SET)
4249 sopt->sopt_dir = SOPT_SET;
4250
4251 if (dolock)
4252 socket_lock(so, 1);
4253
4254 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4255 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4256 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4257 /* the socket has been shutdown, no more sockopt's */
4258 error = EINVAL;
4259 goto out;
4260 }
4261
4262 error = sflt_setsockopt(so, sopt);
4263 if (error != 0) {
4264 if (error == EJUSTRETURN)
4265 error = 0;
4266 goto out;
4267 }
4268
4269 if (sopt->sopt_level != SOL_SOCKET) {
4270 if (so->so_proto != NULL &&
4271 so->so_proto->pr_ctloutput != NULL) {
4272 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4273 goto out;
4274 }
4275 error = ENOPROTOOPT;
4276 } else {
4277 /*
4278 * Allow socket-level (SOL_SOCKET) options to be filtered by
4279 * the protocol layer, if needed. A zero value returned from
4280 * the handler means use default socket-level processing as
4281 * done by the rest of this routine. Otherwise, any other
4282 * return value indicates that the option is unsupported.
4283 */
4284 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4285 pru_socheckopt(so, sopt)) != 0)
4286 goto out;
4287
4288 error = 0;
4289 switch (sopt->sopt_name) {
4290 case SO_LINGER:
4291 case SO_LINGER_SEC:
4292 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4293 if (error != 0)
4294 goto out;
4295
4296 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4297 l.l_linger : l.l_linger * hz;
4298 if (l.l_onoff != 0)
4299 so->so_options |= SO_LINGER;
4300 else
4301 so->so_options &= ~SO_LINGER;
4302 break;
4303
4304 case SO_DEBUG:
4305 case SO_KEEPALIVE:
4306 case SO_DONTROUTE:
4307 case SO_USELOOPBACK:
4308 case SO_BROADCAST:
4309 case SO_REUSEADDR:
4310 case SO_REUSEPORT:
4311 case SO_OOBINLINE:
4312 case SO_TIMESTAMP:
4313 case SO_TIMESTAMP_MONOTONIC:
4314 case SO_DONTTRUNC:
4315 case SO_WANTMORE:
4316 case SO_WANTOOBFLAG:
4317 case SO_NOWAKEFROMSLEEP:
4318 error = sooptcopyin(sopt, &optval, sizeof (optval),
4319 sizeof (optval));
4320 if (error != 0)
4321 goto out;
4322 if (optval)
4323 so->so_options |= sopt->sopt_name;
4324 else
4325 so->so_options &= ~sopt->sopt_name;
4326 break;
4327
4328 case SO_SNDBUF:
4329 case SO_RCVBUF:
4330 case SO_SNDLOWAT:
4331 case SO_RCVLOWAT:
4332 error = sooptcopyin(sopt, &optval, sizeof (optval),
4333 sizeof (optval));
4334 if (error != 0)
4335 goto out;
4336
4337 /*
4338 * Values < 1 make no sense for any of these
4339 * options, so disallow them.
4340 */
4341 if (optval < 1) {
4342 error = EINVAL;
4343 goto out;
4344 }
4345
4346 switch (sopt->sopt_name) {
4347 case SO_SNDBUF:
4348 case SO_RCVBUF: {
4349 struct sockbuf *sb =
4350 (sopt->sopt_name == SO_SNDBUF) ?
4351 &so->so_snd : &so->so_rcv;
4352 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4353 error = ENOBUFS;
4354 goto out;
4355 }
4356 sb->sb_flags |= SB_USRSIZE;
4357 sb->sb_flags &= ~SB_AUTOSIZE;
4358 sb->sb_idealsize = (u_int32_t)optval;
4359 break;
4360 }
4361 /*
4362 * Make sure the low-water is never greater than
4363 * the high-water.
4364 */
4365 case SO_SNDLOWAT: {
4366 int space = sbspace(&so->so_snd);
4367 u_int32_t hiwat = so->so_snd.sb_hiwat;
4368
4369 if (so->so_snd.sb_flags & SB_UNIX) {
4370 struct unpcb *unp =
4371 (struct unpcb *)(so->so_pcb);
4372 if (unp != NULL && unp->unp_conn != NULL) {
4373 hiwat += unp->unp_conn->unp_cc;
4374 }
4375 }
4376
4377 so->so_snd.sb_lowat =
4378 (optval > hiwat) ?
4379 hiwat : optval;
4380
4381 if (space >= so->so_snd.sb_lowat) {
4382 sowwakeup(so);
4383 }
4384 break;
4385 }
4386 case SO_RCVLOWAT: {
4387 int64_t data_len;
4388 so->so_rcv.sb_lowat =
4389 (optval > so->so_rcv.sb_hiwat) ?
4390 so->so_rcv.sb_hiwat : optval;
4391 data_len = so->so_rcv.sb_cc
4392 - so->so_rcv.sb_ctl;
4393 if (data_len >= so->so_rcv.sb_lowat)
4394 sorwakeup(so);
4395 break;
4396 }
4397 }
4398 break;
4399
4400 case SO_SNDTIMEO:
4401 case SO_RCVTIMEO:
4402 error = sooptcopyin_timeval(sopt, &tv);
4403 if (error != 0)
4404 goto out;
4405
4406 switch (sopt->sopt_name) {
4407 case SO_SNDTIMEO:
4408 so->so_snd.sb_timeo = tv;
4409 break;
4410 case SO_RCVTIMEO:
4411 so->so_rcv.sb_timeo = tv;
4412 break;
4413 }
4414 break;
4415
4416 case SO_NKE: {
4417 struct so_nke nke;
4418
4419 error = sooptcopyin(sopt, &nke, sizeof (nke),
4420 sizeof (nke));
4421 if (error != 0)
4422 goto out;
4423
4424 error = sflt_attach_internal(so, nke.nke_handle);
4425 break;
4426 }
4427
4428 case SO_NOSIGPIPE:
4429 error = sooptcopyin(sopt, &optval, sizeof (optval),
4430 sizeof (optval));
4431 if (error != 0)
4432 goto out;
4433 if (optval != 0)
4434 so->so_flags |= SOF_NOSIGPIPE;
4435 else
4436 so->so_flags &= ~SOF_NOSIGPIPE;
4437 break;
4438
4439 case SO_NOADDRERR:
4440 error = sooptcopyin(sopt, &optval, sizeof (optval),
4441 sizeof (optval));
4442 if (error != 0)
4443 goto out;
4444 if (optval != 0)
4445 so->so_flags |= SOF_NOADDRAVAIL;
4446 else
4447 so->so_flags &= ~SOF_NOADDRAVAIL;
4448 break;
4449
4450 case SO_REUSESHAREUID:
4451 error = sooptcopyin(sopt, &optval, sizeof (optval),
4452 sizeof (optval));
4453 if (error != 0)
4454 goto out;
4455 if (optval != 0)
4456 so->so_flags |= SOF_REUSESHAREUID;
4457 else
4458 so->so_flags &= ~SOF_REUSESHAREUID;
4459 break;
4460
4461 case SO_NOTIFYCONFLICT:
4462 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4463 error = EPERM;
4464 goto out;
4465 }
4466 error = sooptcopyin(sopt, &optval, sizeof (optval),
4467 sizeof (optval));
4468 if (error != 0)
4469 goto out;
4470 if (optval != 0)
4471 so->so_flags |= SOF_NOTIFYCONFLICT;
4472 else
4473 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4474 break;
4475
4476 case SO_RESTRICTIONS:
4477 error = sooptcopyin(sopt, &optval, sizeof (optval),
4478 sizeof (optval));
4479 if (error != 0)
4480 goto out;
4481
4482 error = so_set_restrictions(so, optval);
4483 break;
4484
4485 case SO_AWDL_UNRESTRICTED:
4486 if (SOCK_DOM(so) != PF_INET &&
4487 SOCK_DOM(so) != PF_INET6) {
4488 error = EOPNOTSUPP;
4489 goto out;
4490 }
4491 error = sooptcopyin(sopt, &optval, sizeof(optval),
4492 sizeof(optval));
4493 if (error != 0)
4494 goto out;
4495 if (optval != 0) {
4496 kauth_cred_t cred = NULL;
4497 proc_t ep = PROC_NULL;
4498
4499 if (so->so_flags & SOF_DELEGATED) {
4500 ep = proc_find(so->e_pid);
4501 if (ep)
4502 cred = kauth_cred_proc_ref(ep);
4503 }
4504 error = priv_check_cred(
4505 cred ? cred : so->so_cred,
4506 PRIV_NET_RESTRICTED_AWDL, 0);
4507 if (error == 0)
4508 inp_set_awdl_unrestricted(
4509 sotoinpcb(so));
4510 if (cred)
4511 kauth_cred_unref(&cred);
4512 if (ep != PROC_NULL)
4513 proc_rele(ep);
4514 } else
4515 inp_clear_awdl_unrestricted(sotoinpcb(so));
4516 break;
4517
4518 case SO_LABEL:
4519 #if CONFIG_MACF_SOCKET
4520 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4521 sizeof (extmac))) != 0)
4522 goto out;
4523
4524 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
4525 so, &extmac);
4526 #else
4527 error = EOPNOTSUPP;
4528 #endif /* MAC_SOCKET */
4529 break;
4530
4531 case SO_UPCALLCLOSEWAIT:
4532 error = sooptcopyin(sopt, &optval, sizeof (optval),
4533 sizeof (optval));
4534 if (error != 0)
4535 goto out;
4536 if (optval != 0)
4537 so->so_flags |= SOF_UPCALLCLOSEWAIT;
4538 else
4539 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
4540 break;
4541
4542 case SO_RANDOMPORT:
4543 error = sooptcopyin(sopt, &optval, sizeof (optval),
4544 sizeof (optval));
4545 if (error != 0)
4546 goto out;
4547 if (optval != 0)
4548 so->so_flags |= SOF_BINDRANDOMPORT;
4549 else
4550 so->so_flags &= ~SOF_BINDRANDOMPORT;
4551 break;
4552
4553 case SO_NP_EXTENSIONS: {
4554 struct so_np_extensions sonpx;
4555
4556 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
4557 sizeof (sonpx));
4558 if (error != 0)
4559 goto out;
4560 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
4561 error = EINVAL;
4562 goto out;
4563 }
4564 /*
4565 * Only one bit defined for now
4566 */
4567 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
4568 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
4569 so->so_flags |= SOF_NPX_SETOPTSHUT;
4570 else
4571 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
4572 }
4573 break;
4574 }
4575
4576 case SO_TRAFFIC_CLASS: {
4577 error = sooptcopyin(sopt, &optval, sizeof (optval),
4578 sizeof (optval));
4579 if (error != 0)
4580 goto out;
4581 error = so_set_traffic_class(so, optval);
4582 if (error != 0)
4583 goto out;
4584 break;
4585 }
4586
4587 case SO_RECV_TRAFFIC_CLASS: {
4588 error = sooptcopyin(sopt, &optval, sizeof (optval),
4589 sizeof (optval));
4590 if (error != 0)
4591 goto out;
4592 if (optval == 0)
4593 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
4594 else
4595 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
4596 break;
4597 }
4598
4599 case SO_TRAFFIC_CLASS_DBG: {
4600 struct so_tcdbg so_tcdbg;
4601
4602 error = sooptcopyin(sopt, &so_tcdbg,
4603 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
4604 if (error != 0)
4605 goto out;
4606 error = so_set_tcdbg(so, &so_tcdbg);
4607 if (error != 0)
4608 goto out;
4609 break;
4610 }
4611
4612 case SO_PRIVILEGED_TRAFFIC_CLASS:
4613 error = priv_check_cred(kauth_cred_get(),
4614 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
4615 if (error != 0)
4616 goto out;
4617 error = sooptcopyin(sopt, &optval, sizeof (optval),
4618 sizeof (optval));
4619 if (error != 0)
4620 goto out;
4621 if (optval == 0)
4622 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
4623 else
4624 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
4625 break;
4626
4627 case SO_DEFUNCTOK:
4628 error = sooptcopyin(sopt, &optval, sizeof (optval),
4629 sizeof (optval));
4630 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
4631 if (error == 0)
4632 error = EBADF;
4633 goto out;
4634 }
4635 /*
4636 * Any process can set SO_DEFUNCTOK (clear
4637 * SOF_NODEFUNCT), but only root can clear
4638 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
4639 */
4640 if (optval == 0 &&
4641 kauth_cred_issuser(kauth_cred_get()) == 0) {
4642 error = EPERM;
4643 goto out;
4644 }
4645 if (optval)
4646 so->so_flags &= ~SOF_NODEFUNCT;
4647 else
4648 so->so_flags |= SOF_NODEFUNCT;
4649
4650 if (SOCK_DOM(so) == PF_INET ||
4651 SOCK_DOM(so) == PF_INET6) {
4652 char s[MAX_IPv6_STR_LEN];
4653 char d[MAX_IPv6_STR_LEN];
4654 struct inpcb *inp = sotoinpcb(so);
4655
4656 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
4657 "%s:%d] is now marked as %seligible for "
4658 "defunct\n", __func__, proc_selfpid(),
4659 (uint64_t)VM_KERNEL_ADDRPERM(so),
4660 (SOCK_TYPE(so) == SOCK_STREAM) ?
4661 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
4662 ((SOCK_DOM(so) == PF_INET) ?
4663 (void *)&inp->inp_laddr.s_addr :
4664 (void *)&inp->in6p_laddr), s, sizeof (s)),
4665 ntohs(inp->in6p_lport),
4666 inet_ntop(SOCK_DOM(so),
4667 (SOCK_DOM(so) == PF_INET) ?
4668 (void *)&inp->inp_faddr.s_addr :
4669 (void *)&inp->in6p_faddr, d, sizeof (d)),
4670 ntohs(inp->in6p_fport),
4671 (so->so_flags & SOF_NODEFUNCT) ?
4672 "not " : ""));
4673 } else {
4674 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
4675 "now marked as %seligible for defunct\n",
4676 __func__, proc_selfpid(),
4677 (uint64_t)VM_KERNEL_ADDRPERM(so),
4678 SOCK_DOM(so), SOCK_TYPE(so),
4679 (so->so_flags & SOF_NODEFUNCT) ?
4680 "not " : ""));
4681 }
4682 break;
4683
4684 case SO_ISDEFUNCT:
4685 /* This option is not settable */
4686 error = EINVAL;
4687 break;
4688
4689 case SO_OPPORTUNISTIC:
4690 error = sooptcopyin(sopt, &optval, sizeof (optval),
4691 sizeof (optval));
4692 if (error == 0)
4693 error = so_set_opportunistic(so, optval);
4694 break;
4695
4696 case SO_FLUSH:
4697 /* This option is handled by lower layer(s) */
4698 error = 0;
4699 break;
4700
4701 case SO_RECV_ANYIF:
4702 error = sooptcopyin(sopt, &optval, sizeof (optval),
4703 sizeof (optval));
4704 if (error == 0)
4705 error = so_set_recv_anyif(so, optval);
4706 break;
4707
4708 case SO_TRAFFIC_MGT_BACKGROUND: {
4709 /* This option is handled by lower layer(s) */
4710 error = 0;
4711 break;
4712 }
4713
4714 #if FLOW_DIVERT
4715 case SO_FLOW_DIVERT_TOKEN:
4716 error = flow_divert_token_set(so, sopt);
4717 break;
4718 #endif /* FLOW_DIVERT */
4719
4720
4721 case SO_DELEGATED:
4722 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
4723 sizeof (optval))) != 0)
4724 break;
4725
4726 error = so_set_effective_pid(so, optval, sopt->sopt_p);
4727 break;
4728
4729 case SO_DELEGATED_UUID: {
4730 uuid_t euuid;
4731
4732 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
4733 sizeof (euuid))) != 0)
4734 break;
4735
4736 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
4737 break;
4738 }
4739
4740 #if NECP
4741 case SO_NECP_ATTRIBUTES:
4742 error = necp_set_socket_attributes(so, sopt);
4743 break;
4744 #endif /* NECP */
4745
4746 #if MPTCP
4747 case SO_MPTCP_FASTJOIN:
4748 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
4749 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
4750 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
4751 error = ENOPROTOOPT;
4752 break;
4753 }
4754
4755 error = sooptcopyin(sopt, &optval, sizeof (optval),
4756 sizeof (optval));
4757 if (error != 0)
4758 goto out;
4759 if (optval == 0)
4760 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
4761 else
4762 so->so_flags |= SOF_MPTCP_FASTJOIN;
4763 break;
4764 #endif /* MPTCP */
4765
4766 default:
4767 error = ENOPROTOOPT;
4768 break;
4769 }
4770 if (error == 0 && so->so_proto != NULL &&
4771 so->so_proto->pr_ctloutput != NULL) {
4772 (void) so->so_proto->pr_ctloutput(so, sopt);
4773 }
4774 }
4775 out:
4776 if (dolock)
4777 socket_unlock(so, 1);
4778 return (error);
4779 }
4780
4781 /* Helper routines for getsockopt */
4782 int
4783 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
4784 {
4785 int error;
4786 size_t valsize;
4787
4788 error = 0;
4789
4790 /*
4791 * Documented get behavior is that we always return a value,
4792 * possibly truncated to fit in the user's buffer.
4793 * Traditional behavior is that we always tell the user
4794 * precisely how much we copied, rather than something useful
4795 * like the total amount we had available for her.
4796 * Note that this interface is not idempotent; the entire answer must
4797 * be generated ahead of time.
4798 */
4799 valsize = min(len, sopt->sopt_valsize);
4800 sopt->sopt_valsize = valsize;
4801 if (sopt->sopt_val != USER_ADDR_NULL) {
4802 if (sopt->sopt_p != kernproc)
4803 error = copyout(buf, sopt->sopt_val, valsize);
4804 else
4805 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
4806 }
4807 return (error);
4808 }
4809
4810 static int
4811 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
4812 {
4813 int error;
4814 size_t len;
4815 struct user64_timeval tv64;
4816 struct user32_timeval tv32;
4817 const void * val;
4818 size_t valsize;
4819
4820 error = 0;
4821 if (proc_is64bit(sopt->sopt_p)) {
4822 len = sizeof (tv64);
4823 tv64.tv_sec = tv_p->tv_sec;
4824 tv64.tv_usec = tv_p->tv_usec;
4825 val = &tv64;
4826 } else {
4827 len = sizeof (tv32);
4828 tv32.tv_sec = tv_p->tv_sec;
4829 tv32.tv_usec = tv_p->tv_usec;
4830 val = &tv32;
4831 }
4832 valsize = min(len, sopt->sopt_valsize);
4833 sopt->sopt_valsize = valsize;
4834 if (sopt->sopt_val != USER_ADDR_NULL) {
4835 if (sopt->sopt_p != kernproc)
4836 error = copyout(val, sopt->sopt_val, valsize);
4837 else
4838 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
4839 }
4840 return (error);
4841 }
4842
4843 /*
4844 * Returns: 0 Success
4845 * ENOPROTOOPT
4846 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4847 * <pr_ctloutput>:???
4848 * <sf_getoption>:???
4849 */
4850 int
4851 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4852 {
4853 int error, optval;
4854 struct linger l;
4855 struct timeval tv;
4856 #if CONFIG_MACF_SOCKET
4857 struct mac extmac;
4858 #endif /* MAC_SOCKET */
4859
4860 if (sopt->sopt_dir != SOPT_GET)
4861 sopt->sopt_dir = SOPT_GET;
4862
4863 if (dolock)
4864 socket_lock(so, 1);
4865
4866 error = sflt_getsockopt(so, sopt);
4867 if (error != 0) {
4868 if (error == EJUSTRETURN)
4869 error = 0;
4870 goto out;
4871 }
4872
4873 if (sopt->sopt_level != SOL_SOCKET) {
4874 if (so->so_proto != NULL &&
4875 so->so_proto->pr_ctloutput != NULL) {
4876 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4877 goto out;
4878 }
4879 error = ENOPROTOOPT;
4880 } else {
4881 /*
4882 * Allow socket-level (SOL_SOCKET) options to be filtered by
4883 * the protocol layer, if needed. A zero value returned from
4884 * the handler means use default socket-level processing as
4885 * done by the rest of this routine. Otherwise, any other
4886 * return value indicates that the option is unsupported.
4887 */
4888 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4889 pru_socheckopt(so, sopt)) != 0)
4890 goto out;
4891
4892 error = 0;
4893 switch (sopt->sopt_name) {
4894 case SO_LINGER:
4895 case SO_LINGER_SEC:
4896 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
4897 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
4898 so->so_linger : so->so_linger / hz;
4899 error = sooptcopyout(sopt, &l, sizeof (l));
4900 break;
4901
4902 case SO_USELOOPBACK:
4903 case SO_DONTROUTE:
4904 case SO_DEBUG:
4905 case SO_KEEPALIVE:
4906 case SO_REUSEADDR:
4907 case SO_REUSEPORT:
4908 case SO_BROADCAST:
4909 case SO_OOBINLINE:
4910 case SO_TIMESTAMP:
4911 case SO_TIMESTAMP_MONOTONIC:
4912 case SO_DONTTRUNC:
4913 case SO_WANTMORE:
4914 case SO_WANTOOBFLAG:
4915 case SO_NOWAKEFROMSLEEP:
4916 optval = so->so_options & sopt->sopt_name;
4917 integer:
4918 error = sooptcopyout(sopt, &optval, sizeof (optval));
4919 break;
4920
4921 case SO_TYPE:
4922 optval = so->so_type;
4923 goto integer;
4924
4925 case SO_NREAD:
4926 if (so->so_proto->pr_flags & PR_ATOMIC) {
4927 int pkt_total;
4928 struct mbuf *m1;
4929
4930 pkt_total = 0;
4931 m1 = so->so_rcv.sb_mb;
4932 while (m1 != NULL) {
4933 if (m1->m_type == MT_DATA ||
4934 m1->m_type == MT_HEADER ||
4935 m1->m_type == MT_OOBDATA)
4936 pkt_total += m1->m_len;
4937 m1 = m1->m_next;
4938 }
4939 optval = pkt_total;
4940 } else {
4941 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
4942 }
4943 goto integer;
4944
4945 case SO_NUMRCVPKT:
4946 if (so->so_proto->pr_flags & PR_ATOMIC) {
4947 int cnt = 0;
4948 struct mbuf *m1;
4949
4950 m1 = so->so_rcv.sb_mb;
4951 while (m1 != NULL) {
4952 if (m1->m_type == MT_DATA ||
4953 m1->m_type == MT_HEADER ||
4954 m1->m_type == MT_OOBDATA)
4955 cnt += 1;
4956 m1 = m1->m_nextpkt;
4957 }
4958 optval = cnt;
4959 goto integer;
4960 } else {
4961 error = EINVAL;
4962 break;
4963 }
4964
4965 case SO_NWRITE:
4966 optval = so->so_snd.sb_cc;
4967 goto integer;
4968
4969 case SO_ERROR:
4970 optval = so->so_error;
4971 so->so_error = 0;
4972 goto integer;
4973
4974 case SO_SNDBUF: {
4975 u_int32_t hiwat = so->so_snd.sb_hiwat;
4976
4977 if (so->so_snd.sb_flags & SB_UNIX) {
4978 struct unpcb *unp =
4979 (struct unpcb *)(so->so_pcb);
4980 if (unp != NULL && unp->unp_conn != NULL) {
4981 hiwat += unp->unp_conn->unp_cc;
4982 }
4983 }
4984
4985 optval = hiwat;
4986 goto integer;
4987 }
4988 case SO_RCVBUF:
4989 optval = so->so_rcv.sb_hiwat;
4990 goto integer;
4991
4992 case SO_SNDLOWAT:
4993 optval = so->so_snd.sb_lowat;
4994 goto integer;
4995
4996 case SO_RCVLOWAT:
4997 optval = so->so_rcv.sb_lowat;
4998 goto integer;
4999
5000 case SO_SNDTIMEO:
5001 case SO_RCVTIMEO:
5002 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5003 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5004
5005 error = sooptcopyout_timeval(sopt, &tv);
5006 break;
5007
5008 case SO_NOSIGPIPE:
5009 optval = (so->so_flags & SOF_NOSIGPIPE);
5010 goto integer;
5011
5012 case SO_NOADDRERR:
5013 optval = (so->so_flags & SOF_NOADDRAVAIL);
5014 goto integer;
5015
5016 case SO_REUSESHAREUID:
5017 optval = (so->so_flags & SOF_REUSESHAREUID);
5018 goto integer;
5019
5020
5021 case SO_NOTIFYCONFLICT:
5022 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5023 goto integer;
5024
5025 case SO_RESTRICTIONS:
5026 optval = so_get_restrictions(so);
5027 goto integer;
5028
5029 case SO_AWDL_UNRESTRICTED:
5030 if (SOCK_DOM(so) == PF_INET ||
5031 SOCK_DOM(so) == PF_INET6) {
5032 optval = inp_get_awdl_unrestricted(
5033 sotoinpcb(so));
5034 goto integer;
5035 } else
5036 error = EOPNOTSUPP;
5037 break;
5038
5039 case SO_LABEL:
5040 #if CONFIG_MACF_SOCKET
5041 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5042 sizeof (extmac))) != 0 ||
5043 (error = mac_socket_label_get(proc_ucred(
5044 sopt->sopt_p), so, &extmac)) != 0)
5045 break;
5046
5047 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5048 #else
5049 error = EOPNOTSUPP;
5050 #endif /* MAC_SOCKET */
5051 break;
5052
5053 case SO_PEERLABEL:
5054 #if CONFIG_MACF_SOCKET
5055 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5056 sizeof (extmac))) != 0 ||
5057 (error = mac_socketpeer_label_get(proc_ucred(
5058 sopt->sopt_p), so, &extmac)) != 0)
5059 break;
5060
5061 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5062 #else
5063 error = EOPNOTSUPP;
5064 #endif /* MAC_SOCKET */
5065 break;
5066
5067 #ifdef __APPLE_API_PRIVATE
5068 case SO_UPCALLCLOSEWAIT:
5069 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5070 goto integer;
5071 #endif
5072 case SO_RANDOMPORT:
5073 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5074 goto integer;
5075
5076 case SO_NP_EXTENSIONS: {
5077 struct so_np_extensions sonpx;
5078
5079 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5080 SONPX_SETOPTSHUT : 0;
5081 sonpx.npx_mask = SONPX_MASK_VALID;
5082
5083 error = sooptcopyout(sopt, &sonpx,
5084 sizeof (struct so_np_extensions));
5085 break;
5086 }
5087
5088 case SO_TRAFFIC_CLASS:
5089 optval = so->so_traffic_class;
5090 goto integer;
5091
5092 case SO_RECV_TRAFFIC_CLASS:
5093 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5094 goto integer;
5095
5096 case SO_TRAFFIC_CLASS_STATS:
5097 error = sooptcopyout(sopt, &so->so_tc_stats,
5098 sizeof (so->so_tc_stats));
5099 break;
5100
5101 case SO_TRAFFIC_CLASS_DBG:
5102 error = sogetopt_tcdbg(so, sopt);
5103 break;
5104
5105 case SO_PRIVILEGED_TRAFFIC_CLASS:
5106 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5107 goto integer;
5108
5109 case SO_DEFUNCTOK:
5110 optval = !(so->so_flags & SOF_NODEFUNCT);
5111 goto integer;
5112
5113 case SO_ISDEFUNCT:
5114 optval = (so->so_flags & SOF_DEFUNCT);
5115 goto integer;
5116
5117 case SO_OPPORTUNISTIC:
5118 optval = so_get_opportunistic(so);
5119 goto integer;
5120
5121 case SO_FLUSH:
5122 /* This option is not gettable */
5123 error = EINVAL;
5124 break;
5125
5126 case SO_RECV_ANYIF:
5127 optval = so_get_recv_anyif(so);
5128 goto integer;
5129
5130 case SO_TRAFFIC_MGT_BACKGROUND:
5131 /* This option is handled by lower layer(s) */
5132 if (so->so_proto != NULL &&
5133 so->so_proto->pr_ctloutput != NULL) {
5134 (void) so->so_proto->pr_ctloutput(so, sopt);
5135 }
5136 break;
5137
5138 #if FLOW_DIVERT
5139 case SO_FLOW_DIVERT_TOKEN:
5140 error = flow_divert_token_get(so, sopt);
5141 break;
5142 #endif /* FLOW_DIVERT */
5143
5144 #if NECP
5145 case SO_NECP_ATTRIBUTES:
5146 error = necp_get_socket_attributes(so, sopt);
5147 break;
5148 #endif /* NECP */
5149
5150 #if CONTENT_FILTER
5151 case SO_CFIL_SOCK_ID: {
5152 cfil_sock_id_t sock_id;
5153
5154 sock_id = cfil_sock_id_from_socket(so);
5155
5156 error = sooptcopyout(sopt, &sock_id,
5157 sizeof(cfil_sock_id_t));
5158 break;
5159 }
5160 #endif /* CONTENT_FILTER */
5161
5162 #if MPTCP
5163 case SO_MPTCP_FASTJOIN:
5164 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5165 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5166 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5167 error = ENOPROTOOPT;
5168 break;
5169 }
5170 optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5171 break;
5172 #endif /* MPTCP */
5173
5174 default:
5175 error = ENOPROTOOPT;
5176 break;
5177 }
5178 }
5179 out:
5180 if (dolock)
5181 socket_unlock(so, 1);
5182 return (error);
5183 }
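
/*
 * Illustrative user-space sketch (not part of this file): most gettable
 * options above reduce to an int, e.g. SO_NREAD, which reports the size
 * of the next pending datagram for atomic protocols such as UDP, and
 * sb_cc minus control data for stream protocols.
 *
 *        // hypothetical example; 's' is any socket descriptor
 *        int nread = 0;
 *        socklen_t len = sizeof (nread);
 *        if (getsockopt(s, SOL_SOCKET, SO_NREAD, &nread, &len) == 0)
 *                printf("%d bytes ready\n", nread);
 */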
5184
5185 /*
5186 * The size limits on our soopt_getm are different from those on FreeBSD.
5187 * We limit the size of options to MCLBYTES. This will have to change
5188 * if we need to define options that need more space than MCLBYTES.
5189 */
5190 int
5191 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5192 {
5193 struct mbuf *m, *m_prev;
5194 int sopt_size = sopt->sopt_valsize;
5195 int how;
5196
5197 if (sopt_size <= 0 || sopt_size > MCLBYTES)
5198 return (EMSGSIZE);
5199
5200 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5201 MGET(m, how, MT_DATA);
5202 if (m == NULL)
5203 return (ENOBUFS);
5204 if (sopt_size > MLEN) {
5205 MCLGET(m, how);
5206 if ((m->m_flags & M_EXT) == 0) {
5207 m_free(m);
5208 return (ENOBUFS);
5209 }
5210 m->m_len = min(MCLBYTES, sopt_size);
5211 } else {
5212 m->m_len = min(MLEN, sopt_size);
5213 }
5214 sopt_size -= m->m_len;
5215 *mp = m;
5216 m_prev = m;
5217
5218 while (sopt_size > 0) {
5219 MGET(m, how, MT_DATA);
5220 if (m == NULL) {
5221 m_freem(*mp);
5222 return (ENOBUFS);
5223 }
5224 if (sopt_size > MLEN) {
5225 MCLGET(m, how);
5226 if ((m->m_flags & M_EXT) == 0) {
5227 m_freem(*mp);
5228 m_freem(m);
5229 return (ENOBUFS);
5230 }
5231 m->m_len = min(MCLBYTES, sopt_size);
5232 } else {
5233 m->m_len = min(MLEN, sopt_size);
5234 }
5235 sopt_size -= m->m_len;
5236 m_prev->m_next = m;
5237 m_prev = m;
5238 }
5239 return (0);
5240 }
5241
5242 /* copyin sopt data into mbuf chain */
5243 int
5244 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5245 {
5246 struct mbuf *m0 = m;
5247
5248 if (sopt->sopt_val == USER_ADDR_NULL)
5249 return (0);
5250 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5251 if (sopt->sopt_p != kernproc) {
5252 int error;
5253
5254 error = copyin(sopt->sopt_val, mtod(m, char *),
5255 m->m_len);
5256 if (error != 0) {
5257 m_freem(m0);
5258 return (error);
5259 }
5260 } else {
5261 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5262 mtod(m, char *), m->m_len);
5263 }
5264 sopt->sopt_valsize -= m->m_len;
5265 sopt->sopt_val += m->m_len;
5266 m = m->m_next;
5267 }
5268 /* enough space should have been allocated at ip6_sooptmcopyin() */
5269 if (m != NULL) {
5270 panic("soopt_mcopyin");
5271 /* NOTREACHED */
5272 }
5273 return (0);
5274 }
5275
5276 /* copyout mbuf chain data into soopt */
5277 int
5278 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5279 {
5280 struct mbuf *m0 = m;
5281 size_t valsize = 0;
5282
5283 if (sopt->sopt_val == USER_ADDR_NULL)
5284 return (0);
5285 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5286 if (sopt->sopt_p != kernproc) {
5287 int error;
5288
5289 error = copyout(mtod(m, char *), sopt->sopt_val,
5290 m->m_len);
5291 if (error != 0) {
5292 m_freem(m0);
5293 return (error);
5294 }
5295 } else {
5296 bcopy(mtod(m, char *),
5297 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5298 }
5299 sopt->sopt_valsize -= m->m_len;
5300 sopt->sopt_val += m->m_len;
5301 valsize += m->m_len;
5302 m = m->m_next;
5303 }
5304 if (m != NULL) {
5305 /* user-land should have supplied a large enough buffer */
5306 m_freem(m0);
5307 return (EINVAL);
5308 }
5309 sopt->sopt_valsize = valsize;
5310 return (0);
5311 }
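
/*
 * Illustrative sketch (not part of this file) of how a protocol-level
 * pr_ctloutput handler might stage a variable-length option through the
 * three helpers above; handle_option() is a hypothetical consumer.
 *
 *        struct mbuf *m = NULL;
 *        int error;
 *
 *        if ((error = soopt_getm(sopt, &m)) != 0)      // allocate the chain
 *                return (error);
 *        if ((error = soopt_mcopyin(sopt, m)) != 0)    // fill it from sopt
 *                return (error);                       // chain freed on error
 *        error = handle_option(m);
 *        m_freem(m);
 *        return (error);
 */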
5312
5313 void
5314 sohasoutofband(struct socket *so)
5315 {
5316 if (so->so_pgid < 0)
5317 gsignal(-so->so_pgid, SIGURG);
5318 else if (so->so_pgid > 0)
5319 proc_signal(so->so_pgid, SIGURG);
5320 selwakeup(&so->so_rcv.sb_sel);
5321 }
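
/*
 * Illustrative user-space sketch (not part of this file): the SIGURG
 * raised above is only delivered to a process (or process group) that has
 * claimed ownership of the socket, which is what sets so_pgid.
 *
 *        // hypothetical example; 's' is a connected TCP socket descriptor
 *        signal(SIGURG, on_urgent_data);         // hypothetical handler
 *        if (fcntl(s, F_SETOWN, getpid()) == -1)
 *                perror("fcntl");
 */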
5322
5323 int
5324 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5325 {
5326 #pragma unused(cred)
5327 struct proc *p = current_proc();
5328 int revents = 0;
5329
5330 socket_lock(so, 1);
5331 so_update_last_owner_locked(so, PROC_NULL);
5332 so_update_policy(so);
5333
5334 if (events & (POLLIN | POLLRDNORM))
5335 if (soreadable(so))
5336 revents |= events & (POLLIN | POLLRDNORM);
5337
5338 if (events & (POLLOUT | POLLWRNORM))
5339 if (sowriteable(so))
5340 revents |= events & (POLLOUT | POLLWRNORM);
5341
5342 if (events & (POLLPRI | POLLRDBAND))
5343 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5344 revents |= events & (POLLPRI | POLLRDBAND);
5345
5346 if (revents == 0) {
5347 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5348 /*
5349 * Darwin sets the flag first,
5350 * BSD calls selrecord first
5351 */
5352 so->so_rcv.sb_flags |= SB_SEL;
5353 selrecord(p, &so->so_rcv.sb_sel, wql);
5354 }
5355
5356 if (events & (POLLOUT | POLLWRNORM)) {
5357 /*
5358 * Darwin sets the flag first,
5359 * BSD calls selrecord first
5360 */
5361 so->so_snd.sb_flags |= SB_SEL;
5362 selrecord(p, &so->so_snd.sb_sel, wql);
5363 }
5364 }
5365
5366 socket_unlock(so, 1);
5367 return (revents);
5368 }
5369
5370 int
5371 soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5372 {
5373 #pragma unused(fp)
5374 #if !CONFIG_MACF_SOCKET
5375 #pragma unused(ctx)
5376 #endif /* MAC_SOCKET */
5377 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5378 struct klist *skl;
5379
5380 socket_lock(so, 1);
5381 so_update_last_owner_locked(so, PROC_NULL);
5382 so_update_policy(so);
5383
5384 #if CONFIG_MACF_SOCKET
5385 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5386 kn, so) != 0) {
5387 socket_unlock(so, 1);
5388 return (1);
5389 }
5390 #endif /* MAC_SOCKET */
5391
5392 switch (kn->kn_filter) {
5393 case EVFILT_READ:
5394 kn->kn_fop = &soread_filtops;
5395 skl = &so->so_rcv.sb_sel.si_note;
5396 break;
5397 case EVFILT_WRITE:
5398 kn->kn_fop = &sowrite_filtops;
5399 skl = &so->so_snd.sb_sel.si_note;
5400 break;
5401 case EVFILT_SOCK:
5402 kn->kn_fop = &sock_filtops;
5403 skl = &so->so_klist;
5404 break;
5405 default:
5406 socket_unlock(so, 1);
5407 return (1);
5408 }
5409
5410 if (KNOTE_ATTACH(skl, kn)) {
5411 switch (kn->kn_filter) {
5412 case EVFILT_READ:
5413 so->so_rcv.sb_flags |= SB_KNOTE;
5414 break;
5415 case EVFILT_WRITE:
5416 so->so_snd.sb_flags |= SB_KNOTE;
5417 break;
5418 case EVFILT_SOCK:
5419 so->so_flags |= SOF_KNOTE;
5420 break;
5421 default:
5422 socket_unlock(so, 1);
5423 return (1);
5424 }
5425 }
5426 socket_unlock(so, 1);
5427 return (0);
5428 }
5429
5430 static void
5431 filt_sordetach(struct knote *kn)
5432 {
5433 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5434
5435 socket_lock(so, 1);
5436 if (so->so_rcv.sb_flags & SB_KNOTE)
5437 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
5438 so->so_rcv.sb_flags &= ~SB_KNOTE;
5439 socket_unlock(so, 1);
5440 }
5441
5442 /*ARGSUSED*/
5443 static int
5444 filt_soread(struct knote *kn, long hint)
5445 {
5446 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5447
5448 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5449 socket_lock(so, 1);
5450
5451 if (so->so_options & SO_ACCEPTCONN) {
5452 int isempty;
5453
5454 /*
5455 * Radar 6615193: handle the listen case dynamically for the
5456 * kqueue read filter.  This allows listen() to be called after
5457 * the EVFILT_READ filter has been registered.
5458 */
5459
5460 kn->kn_data = so->so_qlen;
5461 isempty = ! TAILQ_EMPTY(&so->so_comp);
5462
5463 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5464 socket_unlock(so, 1);
5465
5466 return (isempty);
5467 }
5468
5469 /* socket isn't a listener */
5470
5471 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5472
5473 if (so->so_oobmark) {
5474 if (kn->kn_flags & EV_OOBAND) {
5475 kn->kn_data -= so->so_oobmark;
5476 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5477 socket_unlock(so, 1);
5478 return (1);
5479 }
5480 kn->kn_data = so->so_oobmark;
5481 kn->kn_flags |= EV_OOBAND;
5482 } else {
5483 if ((so->so_state & SS_CANTRCVMORE)
5484 #if CONTENT_FILTER
5485 && cfil_sock_data_pending(&so->so_rcv) == 0
5486 #endif /* CONTENT_FILTER */
5487 ) {
5488 kn->kn_flags |= EV_EOF;
5489 kn->kn_fflags = so->so_error;
5490 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5491 socket_unlock(so, 1);
5492 return (1);
5493 }
5494 }
5495
5496 if (so->so_state & SS_RCVATMARK) {
5497 if (kn->kn_flags & EV_OOBAND) {
5498 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5499 socket_unlock(so, 1);
5500 return (1);
5501 }
5502 kn->kn_flags |= EV_OOBAND;
5503 } else if (kn->kn_flags & EV_OOBAND) {
5504 kn->kn_data = 0;
5505 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5506 socket_unlock(so, 1);
5507 return (0);
5508 }
5509
5510 if (so->so_error) { /* temporary udp error */
5511 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5512 socket_unlock(so, 1);
5513 return (1);
5514 }
5515
5516 int64_t lowwat = so->so_rcv.sb_lowat;
5517 if (kn->kn_sfflags & NOTE_LOWAT) {
5518 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
5519 lowwat = so->so_rcv.sb_hiwat;
5520 else if (kn->kn_sdata > lowwat)
5521 lowwat = kn->kn_sdata;
5522 }
5523
5524 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5525 socket_unlock(so, 1);
5526
5527 return ((kn->kn_flags & EV_OOBAND) || kn->kn_data >= lowwat);
5528 }
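
/*
 * Illustrative user-space sketch (not part of this file): the NOTE_LOWAT
 * handling above lets a read filter demand a minimum amount of buffered
 * data before it fires.
 *
 *        // hypothetical example; 'kq' is a kqueue, 's' a stream socket
 *        struct kevent kev;
 *        EV_SET(&kev, s, EVFILT_READ, EV_ADD, NOTE_LOWAT, 128, NULL);
 *        (void) kevent(kq, &kev, 1, NULL, 0, NULL);
 *        // the event fires only once at least 128 bytes (capped at
 *        // sb_hiwat) are readable, or on EOF/out-of-band data as coded above
 */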
5529
5530 static void
5531 filt_sowdetach(struct knote *kn)
5532 {
5533 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5534 socket_lock(so, 1);
5535
5536 if (so->so_snd.sb_flags & SB_KNOTE)
5537 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
5538 so->so_snd.sb_flags &= ~SB_KNOTE;
5539 socket_unlock(so, 1);
5540 }
5541
5542 int
5543 so_wait_for_if_feedback(struct socket *so)
5544 {
5545 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
5546 (so->so_state & SS_ISCONNECTED)) {
5547 struct inpcb *inp = sotoinpcb(so);
5548 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
5549 return (1);
5550 }
5551 return (0);
5552 }
5553
5554 /*ARGSUSED*/
5555 static int
5556 filt_sowrite(struct knote *kn, long hint)
5557 {
5558 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5559 int ret = 0;
5560
5561 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5562 socket_lock(so, 1);
5563
5564 kn->kn_data = sbspace(&so->so_snd);
5565 if (so->so_state & SS_CANTSENDMORE) {
5566 kn->kn_flags |= EV_EOF;
5567 kn->kn_fflags = so->so_error;
5568 ret = 1;
5569 goto out;
5570 }
5571 if (so->so_error) { /* temporary udp error */
5572 ret = 1;
5573 goto out;
5574 }
5575 if (((so->so_state & SS_ISCONNECTED) == 0) &&
5576 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
5577 ret = 0;
5578 goto out;
5579 }
5580 int64_t lowwat = so->so_snd.sb_lowat;
5581 if (kn->kn_sfflags & NOTE_LOWAT) {
5582 if (kn->kn_sdata > so->so_snd.sb_hiwat)
5583 lowwat = so->so_snd.sb_hiwat;
5584 else if (kn->kn_sdata > lowwat)
5585 lowwat = kn->kn_sdata;
5586 }
5587 if (kn->kn_data >= lowwat) {
5588 if (so->so_flags & SOF_NOTSENT_LOWAT) {
5589 if ((SOCK_DOM(so) == PF_INET
5590 || SOCK_DOM(so) == PF_INET6)
5591 && so->so_type == SOCK_STREAM) {
5592 ret = tcp_notsent_lowat_check(so);
5593 }
5594 #if MPTCP
5595 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
5596 (SOCK_PROTO(so) == IPPROTO_TCP)) {
5597 ret = mptcp_notsent_lowat_check(so);
5598 }
5599 #endif
5600 else {
5601 ret = 1;  /* fall through so the socket gets unlocked at "out" */
5602 }
5603 } else {
5604 ret = 1;
5605 }
5606 }
5607 if (so_wait_for_if_feedback(so))
5608 ret = 0;
5609 out:
5610 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5611 socket_unlock(so, 1);
5612 return (ret);
5613 }
5614
5615 static void
5616 filt_sockdetach(struct knote *kn)
5617 {
5618 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5619 socket_lock(so, 1);
5620
5621 if ((so->so_flags & SOF_KNOTE) != 0)
5622 if (KNOTE_DETACH(&so->so_klist, kn))
5623 so->so_flags &= ~SOF_KNOTE;
5624 socket_unlock(so, 1);
5625 }
5626
5627 static int
5628 filt_sockev(struct knote *kn, long hint)
5629 {
5630 int ret = 0, locked = 0;
5631 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5632 long ev_hint = (hint & SO_FILT_HINT_EV);
5633
5634 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
5635 socket_lock(so, 1);
5636 locked = 1;
5637 }
5638
5639 if (ev_hint & SO_FILT_HINT_CONNRESET) {
5640 if (kn->kn_sfflags & NOTE_CONNRESET)
5641 kn->kn_fflags |= NOTE_CONNRESET;
5642 }
5643 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
5644 if (kn->kn_sfflags & NOTE_TIMEOUT)
5645 kn->kn_fflags |= NOTE_TIMEOUT;
5646 }
5647 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
5648 if (kn->kn_sfflags & NOTE_NOSRCADDR)
5649 kn->kn_fflags |= NOTE_NOSRCADDR;
5650 }
5651 if (ev_hint & SO_FILT_HINT_IFDENIED) {
5652 if ((kn->kn_sfflags & NOTE_IFDENIED))
5653 kn->kn_fflags |= NOTE_IFDENIED;
5654 }
5655 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
5656 if (kn->kn_sfflags & NOTE_KEEPALIVE)
5657 kn->kn_fflags |= NOTE_KEEPALIVE;
5658 }
5659 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
5660 if (kn->kn_sfflags & NOTE_ADAPTIVE_WTIMO)
5661 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
5662 }
5663 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
5664 if (kn->kn_sfflags & NOTE_ADAPTIVE_RTIMO)
5665 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
5666 }
5667 if (ev_hint & SO_FILT_HINT_CONNECTED) {
5668 if (kn->kn_sfflags & NOTE_CONNECTED)
5669 kn->kn_fflags |= NOTE_CONNECTED;
5670 }
5671 if (ev_hint & SO_FILT_HINT_DISCONNECTED) {
5672 if (kn->kn_sfflags & NOTE_DISCONNECTED)
5673 kn->kn_fflags |= NOTE_DISCONNECTED;
5674 }
5675 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
5676 if (so->so_proto != NULL &&
5677 (so->so_proto->pr_flags & PR_EVCONNINFO) &&
5678 (kn->kn_sfflags & NOTE_CONNINFO_UPDATED))
5679 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
5680 }
5681
5682 if ((kn->kn_sfflags & NOTE_READCLOSED) &&
5683 (so->so_state & SS_CANTRCVMORE)
5684 #if CONTENT_FILTER
5685 && cfil_sock_data_pending(&so->so_rcv) == 0
5686 #endif /* CONTENT_FILTER */
5687 )
5688 kn->kn_fflags |= NOTE_READCLOSED;
5689
5690 if ((kn->kn_sfflags & NOTE_WRITECLOSED) &&
5691 (so->so_state & SS_CANTSENDMORE))
5692 kn->kn_fflags |= NOTE_WRITECLOSED;
5693
5694 if ((kn->kn_sfflags & NOTE_SUSPEND) &&
5695 ((ev_hint & SO_FILT_HINT_SUSPEND) ||
5696 (so->so_flags & SOF_SUSPENDED))) {
5697 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
5698 kn->kn_fflags |= NOTE_SUSPEND;
5699 }
5700
5701 if ((kn->kn_sfflags & NOTE_RESUME) &&
5702 ((ev_hint & SO_FILT_HINT_RESUME) ||
5703 (so->so_flags & SOF_SUSPENDED) == 0)) {
5704 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
5705 kn->kn_fflags |= NOTE_RESUME;
5706 }
5707
5708 if (so->so_error != 0) {
5709 ret = 1;
5710 kn->kn_data = so->so_error;
5711 kn->kn_flags |= EV_EOF;
5712 } else {
5713 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
5714 }
5715
5716 if (kn->kn_fflags != 0)
5717 ret = 1;
5718
5719 if (locked)
5720 socket_unlock(so, 1);
5721
5722 return (ret);
5723 }
5724
5725 void
5726 get_sockev_state(struct socket *so, u_int32_t *statep)
5727 {
5728 u_int32_t state = *(statep);
5729
5730 if (so->so_state & SS_ISCONNECTED)
5731 state |= SOCKEV_CONNECTED;
5732 else
5733 state &= ~(SOCKEV_CONNECTED);
5734 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
5735 *(statep) = state;
5736 }
5737
5738 #define SO_LOCK_HISTORY_STR_LEN \
5739 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
5740
5741 __private_extern__ const char *
5742 solockhistory_nr(struct socket *so)
5743 {
5744 size_t n = 0;
5745 int i;
5746 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
5747
5748 bzero(lock_history_str, sizeof (lock_history_str));
5749 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
5750 n += snprintf(lock_history_str + n,
5751 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
5752 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
5753 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
5754 }
5755 return (lock_history_str);
5756 }
5757
5758 int
5759 socket_lock(struct socket *so, int refcount)
5760 {
5761 int error = 0;
5762 void *lr_saved;
5763
5764 lr_saved = __builtin_return_address(0);
5765
5766 if (so->so_proto->pr_lock) {
5767 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
5768 } else {
5769 #ifdef MORE_LOCKING_DEBUG
5770 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
5771 LCK_MTX_ASSERT_NOTOWNED);
5772 #endif
5773 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
5774 if (refcount)
5775 so->so_usecount++;
5776 so->lock_lr[so->next_lock_lr] = lr_saved;
5777 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
5778 }
5779
5780 return (error);
5781 }
5782
5783 int
5784 socket_unlock(struct socket *so, int refcount)
5785 {
5786 int error = 0;
5787 void *lr_saved;
5788 lck_mtx_t *mutex_held;
5789
5790 lr_saved = __builtin_return_address(0);
5791
5792 if (so->so_proto == NULL) {
5793 panic("%s: null so_proto so=%p\n", __func__, so);
5794 /* NOTREACHED */
5795 }
5796
5797 if (so && so->so_proto->pr_unlock) {
5798 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
5799 } else {
5800 mutex_held = so->so_proto->pr_domain->dom_mtx;
5801 #ifdef MORE_LOCKING_DEBUG
5802 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
5803 #endif
5804 so->unlock_lr[so->next_unlock_lr] = lr_saved;
5805 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
5806
5807 if (refcount) {
5808 if (so->so_usecount <= 0) {
5809 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
5810 "lrh=%s", __func__, so->so_usecount, so,
5811 SOCK_DOM(so), so->so_type,
5812 SOCK_PROTO(so), solockhistory_nr(so));
5813 /* NOTREACHED */
5814 }
5815
5816 so->so_usecount--;
5817 if (so->so_usecount == 0)
5818 sofreelastref(so, 1);
5819 }
5820 lck_mtx_unlock(mutex_held);
5821 }
5822
5823 return (error);
5824 }
5825
5826 /* Called with socket locked, will unlock socket */
5827 void
5828 sofree(struct socket *so)
5829 {
5830 lck_mtx_t *mutex_held;
5831
5832 if (so->so_proto->pr_getlock != NULL)
5833 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
5834 else
5835 mutex_held = so->so_proto->pr_domain->dom_mtx;
5836 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
5837
5838 sofreelastref(so, 0);
5839 }
5840
5841 void
5842 soreference(struct socket *so)
5843 {
5844 socket_lock(so, 1);     /* locks socket & takes one reference */
5845 socket_unlock(so, 0); /* unlock only */
5846 }
5847
5848 void
5849 sodereference(struct socket *so)
5850 {
5851 socket_lock(so, 0);
5852 socket_unlock(so, 1);
5853 }
5854
5855 /*
5856 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
5857 * possibility of using jumbo clusters. Caller must ensure to hold
5858 * the socket lock.
5859 */
5860 void
5861 somultipages(struct socket *so, boolean_t set)
5862 {
5863 if (set)
5864 so->so_flags |= SOF_MULTIPAGES;
5865 else
5866 so->so_flags &= ~SOF_MULTIPAGES;
5867 }
5868
5869 void
5870 soif2kcl(struct socket *so, boolean_t set)
5871 {
5872 if (set)
5873 so->so_flags1 |= SOF1_IF_2KCL;
5874 else
5875 so->so_flags1 &= ~SOF1_IF_2KCL;
5876 }
5877
5878 int
5879 so_isdstlocal(struct socket *so)
5880 {
5881 struct inpcb *inp = (struct inpcb *)so->so_pcb;
5882
5883 if (SOCK_DOM(so) == PF_INET)
5884 return (inaddr_local(inp->inp_faddr));
5885 else if (SOCK_DOM(so) == PF_INET6)
5886 return (in6addr_local(&inp->in6p_faddr));
5887
5888 return (0);
5889 }
5890
5891 int
5892 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
5893 {
5894 struct sockbuf *rcv, *snd;
5895 int err = 0, defunct;
5896
5897 rcv = &so->so_rcv;
5898 snd = &so->so_snd;
5899
5900 defunct = (so->so_flags & SOF_DEFUNCT);
5901 if (defunct) {
5902 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
5903 panic("%s: SB_DROP not set", __func__);
5904 /* NOTREACHED */
5905 }
5906 goto done;
5907 }
5908
5909 if (so->so_flags & SOF_NODEFUNCT) {
5910 if (noforce) {
5911 err = EOPNOTSUPP;
5912 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
5913 "so 0x%llx [%d,%d] is not eligible for defunct "
5914 "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
5915 level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5916 SOCK_DOM(so), SOCK_TYPE(so), err));
5917 return (err);
5918 }
5919 so->so_flags &= ~SOF_NODEFUNCT;
5920 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
5921 "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
5922 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5923 SOCK_DOM(so), SOCK_TYPE(so)));
5924 }
5925
5926 so->so_flags |= SOF_DEFUNCT;
5927
5928 /* Prevent further data from being appended to the socket buffers */
5929 snd->sb_flags |= SB_DROP;
5930 rcv->sb_flags |= SB_DROP;
5931
5932 /* Flush any existing data in the socket buffers */
5933 if (rcv->sb_cc != 0) {
5934 rcv->sb_flags &= ~SB_SEL;
5935 selthreadclear(&rcv->sb_sel);
5936 sbrelease(rcv);
5937 }
5938 if (snd->sb_cc != 0) {
5939 snd->sb_flags &= ~SB_SEL;
5940 selthreadclear(&snd->sb_sel);
5941 sbrelease(snd);
5942 }
5943
5944 done:
5945 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
5946 "defunct\n", __func__, proc_selfpid(), proc_pid(p), level,
5947 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
5948 defunct ? "is already" : "marked as"));
5949
5950 return (err);
5951 }
5952
5953 int
5954 sodefunct(struct proc *p, struct socket *so, int level)
5955 {
5956 struct sockbuf *rcv, *snd;
5957
5958 if (!(so->so_flags & SOF_DEFUNCT)) {
5959 panic("%s improperly called", __func__);
5960 /* NOTREACHED */
5961 }
5962 if (so->so_state & SS_DEFUNCT)
5963 goto done;
5964
5965 rcv = &so->so_rcv;
5966 snd = &so->so_snd;
5967
5968 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5969 char s[MAX_IPv6_STR_LEN];
5970 char d[MAX_IPv6_STR_LEN];
5971 struct inpcb *inp = sotoinpcb(so);
5972
5973 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
5974 "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
5975 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
5976 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5977 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
5978 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
5979 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
5980 s, sizeof (s)), ntohs(inp->in6p_lport),
5981 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
5982 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
5983 d, sizeof (d)), ntohs(inp->in6p_fport),
5984 (uint32_t)rcv->sb_sel.si_flags,
5985 (uint32_t)snd->sb_sel.si_flags,
5986 rcv->sb_flags, snd->sb_flags));
5987 } else {
5988 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
5989 "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
5990 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
5991 proc_pid(p), level, (uint64_t)VM_KERNEL_ADDRPERM(so),
5992 SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
5993 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
5994 snd->sb_flags));
5995 }
5996
5997 /*
5998 * Unwedge threads blocked on sbwait() and sb_lock().
5999 */
6000 sbwakeup(rcv);
6001 sbwakeup(snd);
6002
6003 so->so_flags1 |= SOF1_DEFUNCTINPROG;
6004 if (rcv->sb_flags & SB_LOCK)
6005 sbunlock(rcv, TRUE); /* keep socket locked */
6006 if (snd->sb_flags & SB_LOCK)
6007 sbunlock(snd, TRUE); /* keep socket locked */
6008
6009 /*
6010 * Flush the buffers and disconnect. We explicitly call shutdown
6011 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6012 * states are set for the socket. This would also flush out data
6013 * hanging off the receive list of this socket.
6014 */
6015 (void) soshutdownlock_final(so, SHUT_RD);
6016 (void) soshutdownlock_final(so, SHUT_WR);
6017 (void) sodisconnectlocked(so);
6018
6019 /*
6020 * Explicitly handle connectionless-protocol disconnection
6021 * and release any remaining data in the socket buffers.
6022 */
6023 if (!(so->so_state & SS_ISDISCONNECTED))
6024 (void) soisdisconnected(so);
6025
6026 if (so->so_error == 0)
6027 so->so_error = EBADF;
6028
6029 if (rcv->sb_cc != 0) {
6030 rcv->sb_flags &= ~SB_SEL;
6031 selthreadclear(&rcv->sb_sel);
6032 sbrelease(rcv);
6033 }
6034 if (snd->sb_cc != 0) {
6035 snd->sb_flags &= ~SB_SEL;
6036 selthreadclear(&snd->sb_sel);
6037 sbrelease(snd);
6038 }
6039 so->so_state |= SS_DEFUNCT;
6040
6041 done:
6042 return (0);
6043 }
6044
6045 __private_extern__ int
6046 so_set_recv_anyif(struct socket *so, int optval)
6047 {
6048 int ret = 0;
6049
6050 #if INET6
6051 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6052 #else
6053 if (SOCK_DOM(so) == PF_INET) {
6054 #endif /* !INET6 */
6055 if (optval)
6056 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
6057 else
6058 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
6059 }
6060
6061 return (ret);
6062 }
6063
6064 __private_extern__ int
6065 so_get_recv_anyif(struct socket *so)
6066 {
6067 int ret = 0;
6068
6069 #if INET6
6070 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6071 #else
6072 if (SOCK_DOM(so) == PF_INET) {
6073 #endif /* !INET6 */
6074 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
6075 }
6076
6077 return (ret);
6078 }
6079
6080 int
6081 so_set_restrictions(struct socket *so, uint32_t vals)
6082 {
6083 int nocell_old, nocell_new;
6084 int noexpensive_old, noexpensive_new;
6085
6086 /*
6087 * Deny-type restrictions are trapdoors; once set they cannot be
6088 * unset for the lifetime of the socket. This allows them to be
6089 * issued by a framework on behalf of the application without
6090 * having to worry that they can be undone.
6091 *
6092 * Note here that socket-level restrictions override any protocol
6093 * level restrictions. For instance, SO_RESTRICT_DENY_CELLULAR
6094 * socket restriction issued on the socket has a higher precedence
6095 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
6096 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
6097 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
6098 */
6099 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6100 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6101 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
6102 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
6103 SO_RESTRICT_DENY_EXPENSIVE));
6104 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6105 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6106
6107 /* we can only set, not clear restrictions */
6108 if ((nocell_new - nocell_old) == 0 &&
6109 (noexpensive_new - noexpensive_old) == 0)
6110 return (0);
6111 #if INET6
6112 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6113 #else
6114 if (SOCK_DOM(so) == PF_INET) {
6115 #endif /* !INET6 */
6116 if (nocell_new - nocell_old != 0) {
6117 /* if deny cellular is now set, do what's needed for INPCB */
6118 inp_set_nocellular(sotoinpcb(so));
6119 }
6120 if (noexpensive_new - noexpensive_old != 0) {
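/* likewise, if deny expensive is now set, do what's needed for INPCB */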
6121 inp_set_noexpensive(sotoinpcb(so));
6122 }
6123 }
6124
6125 return (0);
6126 }
6127
6128 uint32_t
6129 so_get_restrictions(struct socket *so)
6130 {
6131 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
6132 SO_RESTRICT_DENY_OUT |
6133 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
6134 }
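/*
 * These restriction bits are normally manipulated through the private
 * SO_RESTRICTIONS socket option. A minimal user-space sketch, assuming
 * the private SDK definitions are available to the caller; note the
 * trapdoor semantics described above, i.e. the deny bits cannot be
 * cleared once set:
 */
#if 0
uint32_t restrictions = SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE;

if (setsockopt(fd, SOL_SOCKET, SO_RESTRICTIONS,
&restrictions, sizeof (restrictions)) == -1)
perror("SO_RESTRICTIONS");
#endif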
6135
6136 struct sockaddr_entry *
6137 sockaddrentry_alloc(int how)
6138 {
6139 struct sockaddr_entry *se;
6140
6141 se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
6142 if (se != NULL)
6143 bzero(se, se_zone_size);
6144
6145 return (se);
6146 }
6147
6148 void
6149 sockaddrentry_free(struct sockaddr_entry *se)
6150 {
6151 if (se->se_addr != NULL) {
6152 FREE(se->se_addr, M_SONAME);
6153 se->se_addr = NULL;
6154 }
6155 zfree(se_zone, se);
6156 }
6157
6158 struct sockaddr_entry *
6159 sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
6160 {
6161 struct sockaddr_entry *dst_se;
6162
6163 dst_se = sockaddrentry_alloc(how);
6164 if (dst_se != NULL) {
6165 int len = src_se->se_addr->sa_len;
6166
6167 MALLOC(dst_se->se_addr, struct sockaddr *,
6168 len, M_SONAME, how | M_ZERO);
6169 if (dst_se->se_addr != NULL) {
6170 bcopy(src_se->se_addr, dst_se->se_addr, len);
6171 } else {
6172 sockaddrentry_free(dst_se);
6173 dst_se = NULL;
6174 }
6175 }
6176
6177 return (dst_se);
6178 }
6179
6180 struct sockaddr_list *
6181 sockaddrlist_alloc(int how)
6182 {
6183 struct sockaddr_list *sl;
6184
6185 sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
6186 if (sl != NULL) {
6187 bzero(sl, sl_zone_size);
6188 TAILQ_INIT(&sl->sl_head);
6189 }
6190 return (sl);
6191 }
6192
6193 void
6194 sockaddrlist_free(struct sockaddr_list *sl)
6195 {
6196 struct sockaddr_entry *se, *tse;
6197
6198 TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
6199 sockaddrlist_remove(sl, se);
6200 sockaddrentry_free(se);
6201 }
6202 VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
6203 zfree(sl_zone, sl);
6204 }
6205
6206 void
6207 sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
6208 {
6209 VERIFY(!(se->se_flags & SEF_ATTACHED));
6210 se->se_flags |= SEF_ATTACHED;
6211 TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
6212 sl->sl_cnt++;
6213 VERIFY(sl->sl_cnt != 0);
6214 }
6215
6216 void
6217 sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
6218 {
6219 VERIFY(se->se_flags & SEF_ATTACHED);
6220 se->se_flags &= ~SEF_ATTACHED;
6221 VERIFY(sl->sl_cnt != 0);
6222 sl->sl_cnt--;
6223 TAILQ_REMOVE(&sl->sl_head, se, se_link);
6224 }
6225
6226 struct sockaddr_list *
6227 sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
6228 {
6229 struct sockaddr_entry *src_se, *tse;
6230 struct sockaddr_list *dst_sl;
6231
6232 dst_sl = sockaddrlist_alloc(how);
6233 if (dst_sl == NULL)
6234 return (NULL);
6235
6236 TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
6237 struct sockaddr_entry *dst_se;
6238
6239 if (src_se->se_addr == NULL)
6240 continue;
6241
6242 dst_se = sockaddrentry_dup(src_se, how);
6243 if (dst_se == NULL) {
6244 sockaddrlist_free(dst_sl);
6245 return (NULL);
6246 }
6247
6248 sockaddrlist_insert(dst_sl, dst_se);
6249 }
6250 VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
6251
6252 return (dst_sl);
6253 }
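/*
 * In-kernel sketch of how a caller might assemble a sockaddr_list, for
 * example while gathering addresses for a connectx(2)-style request.
 * Illustrative only; error handling is elided and the entry's address
 * is left as an unspecified IPv4 placeholder:
 */
#if 0
struct sockaddr_list *sl = sockaddrlist_alloc(M_WAITOK);
struct sockaddr_entry *se = sockaddrentry_alloc(M_WAITOK);
struct sockaddr_in *sin;

MALLOC(se->se_addr, struct sockaddr *, sizeof (*sin),
M_SONAME, M_WAITOK | M_ZERO);
sin = (struct sockaddr_in *)(void *)se->se_addr;
sin->sin_len = sizeof (*sin);
sin->sin_family = AF_INET;
sockaddrlist_insert(sl, se); /* the list now owns the entry */

sockaddrlist_free(sl); /* frees every entry and its address */
#endif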
6254
6255 int
6256 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
6257 {
6258 struct proc *ep = PROC_NULL;
6259 int error = 0;
6260
6261 /* pid 0 is reserved for kernel */
6262 if (epid == 0) {
6263 error = EINVAL;
6264 goto done;
6265 }
6266
6267 /*
6268 * If this is an in-kernel socket, prevent its delegate
6269 * association from changing unless the socket option is
6270 * coming from within the kernel itself.
6271 */
6272 if (so->last_pid == 0 && p != kernproc) {
6273 error = EACCES;
6274 goto done;
6275 }
6276
6277 /*
6278 * If this is issued by a process that's recorded as the
6279 * real owner of the socket, or if the pid is the same as
6280 * the process's own pid, then proceed. Otherwise ensure
6281 * that the issuing process has the necessary privileges.
6282 */
6283 if (epid != so->last_pid || epid != proc_pid(p)) {
6284 if ((error = priv_check_cred(kauth_cred_get(),
6285 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
6286 error = EACCES;
6287 goto done;
6288 }
6289 }
6290
6291 /* Find the process that corresponds to the effective pid */
6292 if ((ep = proc_find(epid)) == PROC_NULL) {
6293 error = ESRCH;
6294 goto done;
6295 }
6296
6297 /*
6298 * If a process tries to delegate the socket to itself, then
6299 * there's really nothing to do; treat it as a way for the
6300 * delegate association to be cleared. Note that we check
6301 * the passed-in proc rather than calling proc_selfpid(),
6302 * as we need to check the process issuing the socket option
6303 * which could be kernproc. Given that we don't allow 0 for
6304 * effective pid, it means that a delegated in-kernel socket
6305 * stays delegated during its lifetime (which is probably OK.)
6306 */
6307 if (epid == proc_pid(p)) {
6308 so->so_flags &= ~SOF_DELEGATED;
6309 so->e_upid = 0;
6310 so->e_pid = 0;
6311 uuid_clear(so->e_uuid);
6312 } else {
6313 so->so_flags |= SOF_DELEGATED;
6314 so->e_upid = proc_uniqueid(ep);
6315 so->e_pid = proc_pid(ep);
6316 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
6317 }
6318 done:
6319 if (error == 0 && net_io_policy_log) {
6320 uuid_string_t buf;
6321
6322 uuid_unparse(so->e_uuid, buf);
6323 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6324 "euuid %s%s\n", __func__, proc_name_address(p),
6325 proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6326 SOCK_TYPE(so), so->e_pid, proc_name_address(ep), buf,
6327 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
6328 } else if (error != 0 && net_io_policy_log) {
6329 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6330 "ERROR (%d)\n", __func__, proc_name_address(p),
6331 proc_pid(p), (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6332 SOCK_TYPE(so), epid, (ep == PROC_NULL) ? "PROC_NULL" :
6333 proc_name_address(ep), error);
6334 }
6335
6336 /* Update this socket's policy upon success */
6337 if (error == 0) {
6338 so->so_policy_gencnt *= -1;
6339 so_update_policy(so);
6340 #if NECP
6341 so_update_necp_policy(so, NULL, NULL);
6342 #endif /* NECP */
6343 }
6344
6345 if (ep != PROC_NULL)
6346 proc_rele(ep);
6347
6348 return (error);
6349 }
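/*
 * so_set_effective_pid() is normally reached via the private SO_DELEGATED
 * socket option. A minimal user-space sketch, assuming the private SDK
 * definition of SO_DELEGATED and a caller that either delegates to itself
 * or holds PRIV_NET_PRIVILEGED_SOCKET_DELEGATE; target_pid is a
 * placeholder for the pid of the application being delegated for:
 */
#if 0
pid_t epid = target_pid;

if (setsockopt(fd, SOL_SOCKET, SO_DELEGATED, &epid, sizeof (epid)) == -1)
perror("SO_DELEGATED");
#endif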
6350
6351 int
6352 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
6353 {
6354 uuid_string_t buf;
6355 uuid_t uuid;
6356 int error = 0;
6357
6358 /* UUID must not be all-zeroes (reserved for kernel) */
6359 if (uuid_is_null(euuid)) {
6360 error = EINVAL;
6361 goto done;
6362 }
6363
6364 /*
6365 * If this is an in-kernel socket, prevent its delegate
6366 * association from changing unless the socket option is
6367 * coming from within the kernel itself.
6368 */
6369 if (so->last_pid == 0 && p != kernproc) {
6370 error = EACCES;
6371 goto done;
6372 }
6373
6374 /* Get the UUID of the issuing process */
6375 proc_getexecutableuuid(p, uuid, sizeof (uuid));
6376
6377 /*
6378 * If this is issued by a process that's recorded as the
6379 * real owner of the socket, or if the uuid is the same as
6380 * the process's own uuid, then proceed. Otherwise ensure
6381 * that the issuing process has the necessary privileges.
6382 */
6383 if (uuid_compare(euuid, so->last_uuid) != 0 ||
6384 uuid_compare(euuid, uuid) != 0) {
6385 if ((error = priv_check_cred(kauth_cred_get(),
6386 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
6387 error = EACCES;
6388 goto done;
6389 }
6390 }
6391
6392 /*
6393 * If a process tries to delegate the socket to itself, then
6394 * there's really nothing to do; treat it as a way for the
6395 * delegate association to be cleared. Note that we check
6396 * the uuid of the passed-in proc rather than that of the
6397 * current process, as we need to check the process issuing
6398 * the socket option which could be kernproc itself. Given
6399 * that we don't allow 0 for effective uuid, it means that
6400 * a delegated in-kernel socket stays delegated during its
6401 * lifetime (which is okay.)
6402 */
6403 if (uuid_compare(euuid, uuid) == 0) {
6404 so->so_flags &= ~SOF_DELEGATED;
6405 so->e_upid = 0;
6406 so->e_pid = 0;
6407 uuid_clear(so->e_uuid);
6408 } else {
6409 so->so_flags |= SOF_DELEGATED;
6410 /*
6411 * Unlike so_set_effective_pid(), we only have the UUID
6412 * here and the process ID is not known. Inherit the
6413 * real {pid,upid} of the socket.
6414 */
6415 so->e_upid = so->last_upid;
6416 so->e_pid = so->last_pid;
6417 uuid_copy(so->e_uuid, euuid);
6418 }
6419
6420 done:
6421 if (error == 0 && net_io_policy_log) {
6422 uuid_unparse(so->e_uuid, buf);
6423 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
6424 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
6425 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6426 SOCK_TYPE(so), so->e_pid, buf,
6427 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
6428 } else if (error != 0 && net_io_policy_log) {
6429 uuid_unparse(euuid, buf);
6430 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
6431 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
6432 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6433 SOCK_TYPE(so), buf, error);
6434 }
6435
6436 /* Update this socket's policy upon success */
6437 if (error == 0) {
6438 so->so_policy_gencnt *= -1;
6439 so_update_policy(so);
6440 #if NECP
6441 so_update_necp_policy(so, NULL, NULL);
6442 #endif /* NECP */
6443 }
6444
6445 return (error);
6446 }
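/*
 * The UUID flavor is reached via the private SO_DELEGATED_UUID socket
 * option. Same sketch as above, but with a uuid_t payload; euuid is a
 * placeholder for the delegated application's executable UUID:
 */
#if 0
uuid_t euuid; /* filled in by the caller */

if (setsockopt(fd, SOL_SOCKET, SO_DELEGATED_UUID, euuid, sizeof (euuid)) == -1)
perror("SO_DELEGATED_UUID");
#endif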
6447
6448 void
6449 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
6450 uint32_t ev_datalen)
6451 {
6452 struct kev_msg ev_msg;
6453
6454 /*
6455 * A netpolicy event always starts with a netpolicy_event_data
6456 * structure, but the caller can provide for a longer event
6457 * structure to post, depending on the event code.
6458 */
6459 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
6460
6461 bzero(&ev_msg, sizeof (ev_msg));
6462 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6463 ev_msg.kev_class = KEV_NETWORK_CLASS;
6464 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
6465 ev_msg.event_code = ev_code;
6466
6467 ev_msg.dv[0].data_ptr = ev_data;
6468 ev_msg.dv[0].data_length = ev_datalen;
6469
6470 kev_post_msg(&ev_msg);
6471 }
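/*
 * Sketch of a typical caller (illustrative; ev_code stands in for one of
 * the KEV_NETPOLICY_* event codes, and a real caller would usually embed
 * netpolicy_event_data at the head of a larger, event-specific structure
 * as described above):
 */
#if 0
struct netpolicy_event_data ev_data;

bzero(&ev_data, sizeof (ev_data));
/* ... fill in the fields describing the affected process/socket ... */
netpolicy_post_msg(ev_code, &ev_data, sizeof (ev_data));
#endif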
6472
6473 void
6474 socket_post_kev_msg(uint32_t ev_code,
6475 struct kev_socket_event_data *ev_data,
6476 uint32_t ev_datalen)
6477 {
6478 struct kev_msg ev_msg;
6479
6480 bzero(&ev_msg, sizeof(ev_msg));
6481 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6482 ev_msg.kev_class = KEV_NETWORK_CLASS;
6483 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
6484 ev_msg.event_code = ev_code;
6485
6486 ev_msg.dv[0].data_ptr = ev_data;
6487 ev_msg.dv[0].data_length = ev_datalen;
6488
6489 kev_post_msg(&ev_msg);
6490 }
6491
6492 void
6493 socket_post_kev_msg_closed(struct socket *so)
6494 {
6495 struct kev_socket_closed ev;
6496 struct sockaddr *socksa = NULL, *peersa = NULL;
6497 int err;
6498 bzero(&ev, sizeof(ev));
6499 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
6500 if (err == 0) {
6501 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
6502 &peersa);
6503 if (err == 0) {
6504 memcpy(&ev.ev_data.kev_sockname, socksa,
6505 min(socksa->sa_len,
6506 sizeof (ev.ev_data.kev_sockname)));
6507 memcpy(&ev.ev_data.kev_peername, peersa,
6508 min(peersa->sa_len,
6509 sizeof (ev.ev_data.kev_peername)));
6510 socket_post_kev_msg(KEV_SOCKET_CLOSED,
6511 &ev.ev_data, sizeof (ev));
6512 }
6513 }
6514 if (socksa != NULL)
6515 FREE(socksa, M_SONAME);
6516 if (peersa != NULL)
6517 FREE(peersa, M_SONAME);
6518 }