1 /*
2 * Copyright (c) 1998-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/ntstat.h>
102 #include <net/content_filter.h>
103 #include <netinet/in.h>
104 #include <netinet/in_pcb.h>
105 #include <netinet/ip6.h>
106 #include <netinet6/ip6_var.h>
107 #include <netinet/flow_divert.h>
108 #include <kern/zalloc.h>
109 #include <kern/locks.h>
110 #include <machine/limits.h>
111 #include <libkern/OSAtomic.h>
112 #include <pexpert/pexpert.h>
113 #include <kern/assert.h>
114 #include <kern/task.h>
115 #include <sys/kpi_mbuf.h>
116 #include <sys/mcache.h>
117 #include <sys/unpcb.h>
118
119 #if CONFIG_MACF
120 #include <security/mac.h>
121 #include <security/mac_framework.h>
122 #endif /* MAC */
123
124 #if MULTIPATH
125 #include <netinet/mp_pcb.h>
126 #include <netinet/mptcp_var.h>
127 #endif /* MULTIPATH */
128
129 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
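/*
 * Worked example (added for clarity): ROUNDUP(10, 8) == 16.  The mask form
 * above assumes the rounding factor (b) is a power of two.
 */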
130
131 #if DEBUG || DEVELOPMENT
132 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
133 #else
134 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
135 #endif
136
137 /* TODO: this should be in a header file somewhere */
138 extern char *proc_name_address(void *p);
139
140 static u_int32_t so_cache_hw; /* High water mark for socache */
141 static u_int32_t so_cache_timeouts; /* number of timeouts */
142 static u_int32_t so_cache_max_freed; /* max freed per timeout */
143 static u_int32_t cached_sock_count = 0;
144 STAILQ_HEAD(, socket) so_cache_head;
145 int max_cached_sock_count = MAX_CACHED_SOCKETS;
146 static u_int32_t so_cache_time;
147 static int socketinit_done;
148 static struct zone *so_cache_zone;
149
150 static lck_grp_t *so_cache_mtx_grp;
151 static lck_attr_t *so_cache_mtx_attr;
152 static lck_grp_attr_t *so_cache_mtx_grp_attr;
153 static lck_mtx_t *so_cache_mtx;
154
155 #include <machine/limits.h>
156
157 static void filt_sordetach(struct knote *kn);
158 static int filt_soread(struct knote *kn, long hint);
159 static void filt_sowdetach(struct knote *kn);
160 static int filt_sowrite(struct knote *kn, long hint);
161 static void filt_sockdetach(struct knote *kn);
162 static int filt_sockev(struct knote *kn, long hint);
163 static void filt_socktouch(struct knote *kn, struct kevent_internal_s *kev,
164 long type);
165
166 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
167 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
168
169 static struct filterops soread_filtops = {
170 .f_isfd = 1,
171 .f_detach = filt_sordetach,
172 .f_event = filt_soread,
173 };
174
175 static struct filterops sowrite_filtops = {
176 .f_isfd = 1,
177 .f_detach = filt_sowdetach,
178 .f_event = filt_sowrite,
179 };
180
181 static struct filterops sock_filtops = {
182 .f_isfd = 1,
183 .f_detach = filt_sockdetach,
184 .f_event = filt_sockev,
185 .f_touch = filt_socktouch,
186 };
187
188 SYSCTL_DECL(_kern_ipc);
189
190 #define EVEN_MORE_LOCKING_DEBUG 0
191
192 int socket_debug = 0;
193 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
194 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
195
196 static int socket_zone = M_SOCKET;
197 so_gen_t so_gencnt; /* generation count for sockets */
198
199 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
200 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
201
202 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
203 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
204 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
205 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
206 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
207 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
208 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
209 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
210 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
211
212 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
213
214 int somaxconn = SOMAXCONN;
215 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
216 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
217
218 /* Should we get a maximum also ??? */
219 static int sosendmaxchain = 65536;
220 static int sosendminchain = 16384;
221 static int sorecvmincopy = 16384;
222 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
223 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
224 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
225 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
226
227 /*
228 * Set to enable jumbo clusters (if available) for large writes when
229 * the socket is marked with SOF_MULTIPAGES; see below.
230 */
231 int sosendjcl = 1;
232 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
233 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
234
235 /*
236 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
237 * writes on the socket for all protocols on any network interfaces,
238 * depending upon sosendjcl above. Be extra careful when setting this
239 * to 1, because sending packets that cross physical pages down to
240 * broken drivers (those that falsely assume that the physical pages
241 * are contiguous) might lead to system panics or silent data corruption.
242 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
243 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
244 * capable. Set this to 1 only for testing/debugging purposes.
245 */
246 int sosendjcl_ignore_capab = 0;
247 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
248 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
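/*
 * Illustrative note (not part of the original source): because the jumbo-
 * cluster knobs above are registered under _kern_ipc with CTLFLAG_RW, user
 * space can read or set them through the sysctl interface, e.g. with
 * sysctlbyname(3); the shell form would be
 * `sysctl -w kern.ipc.sosendjcl_ignore_capab=1'.
 *
 *	int on = 1;
 *
 *	sysctlbyname("kern.ipc.sosendjcl_ignore_capab",
 *	    NULL, NULL, &on, sizeof (on));	// testing/debugging only
 */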
249
250 /*
251 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
252 * writes on the socket for all protocols on any network interfaces.
253 * Be extra careful when setting this to 1, because sending down packets with
254 * clusters larger than 2 KB might lead to system panics or data corruption.
255 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
256 * on the outgoing interface.
257 * Set this to 1 for testing/debugging purposes only.
258 */
259 int sosendbigcl_ignore_capab = 0;
260 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
261 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
262
263 int sodefunctlog = 0;
264 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
265 &sodefunctlog, 0, "");
266
267 int sothrottlelog = 0;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
269 &sothrottlelog, 0, "");
270
271 int sorestrictrecv = 1;
272 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
273 &sorestrictrecv, 0, "Enable inbound interface restrictions");
274
275 int sorestrictsend = 1;
276 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
277 &sorestrictsend, 0, "Enable outbound interface restrictions");
278
279 int soreserveheadroom = 1;
280 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
281 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
282
283 extern struct inpcbinfo tcbinfo;
284
285 /* TODO: these should be in a header file */
286 extern int get_inpcb_str_size(void);
287 extern int get_tcp_str_size(void);
288
289 static unsigned int sl_zone_size; /* size of sockaddr_list */
290 static struct zone *sl_zone; /* zone for sockaddr_list */
291
292 static unsigned int se_zone_size; /* size of sockaddr_entry */
293 static struct zone *se_zone; /* zone for sockaddr_entry */
294
295 vm_size_t so_cache_zone_element_size;
296
297 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
298 user_ssize_t *);
299 static void cached_sock_alloc(struct socket **, int);
300 static void cached_sock_free(struct socket *);
301
302 /*
303 * Maximum number of extended background idle sockets per process.
304 * Set to zero to disable further setting of the option.
305 */
306
307 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
308 #define SO_IDLE_BK_IDLE_TIME 600
309 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
310
311 struct soextbkidlestat soextbkidlestat;
312
313 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
314 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
315 "Maximum of extended background idle sockets per process");
316
317 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
318 &soextbkidlestat.so_xbkidle_time, 0,
319 "Time in seconds to keep extended background idle sockets");
320
321 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
322 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
323 "High water mark for extended background idle sockets");
324
325 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
326 &soextbkidlestat, soextbkidlestat, "");
327
328 int so_set_extended_bk_idle(struct socket *, int);
329
330 /*
331 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
332 * setting the DSCP code on the packet based on the service class; see
333 * <rdar://problem/11277343> for details.
334 */
335 __private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
336 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
337 &sotcdb, 0, "");
338
339 void
340 socketinit(void)
341 {
342 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
343 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
344
345 #ifdef __LP64__
346 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
347 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
348 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
349 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
350 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
351 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
352 #else
353 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
354 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
355 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
356 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
357 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
358 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
359 #endif
360
361 if (socketinit_done) {
362 printf("socketinit: already called...\n");
363 return;
364 }
365 socketinit_done = 1;
366
367 PE_parse_boot_argn("socket_debug", &socket_debug,
368 sizeof (socket_debug));
369
370 /*
371 * allocate lock group attribute and group for socket cache mutex
372 */
373 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
374 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
375 so_cache_mtx_grp_attr);
376
377 /*
378 * allocate the lock attribute for socket cache mutex
379 */
380 so_cache_mtx_attr = lck_attr_alloc_init();
381
382 /* cached sockets mutex */
383 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
384 if (so_cache_mtx == NULL) {
385 panic("%s: unable to allocate so_cache_mtx\n", __func__);
386 /* NOTREACHED */
387 }
388 STAILQ_INIT(&so_cache_head);
389
390 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
391 + get_inpcb_str_size() + 4 + get_tcp_str_size());
392
393 so_cache_zone = zinit(so_cache_zone_element_size,
394 (120000 * so_cache_zone_element_size), 8192, "socache zone");
395 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
396 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
397
398 sl_zone_size = sizeof (struct sockaddr_list);
399 if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
400 "sockaddr_list")) == NULL) {
401 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
402 /* NOTREACHED */
403 }
404 zone_change(sl_zone, Z_CALLERACCT, FALSE);
405 zone_change(sl_zone, Z_EXPAND, TRUE);
406
407 se_zone_size = sizeof (struct sockaddr_entry);
408 if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
409 "sockaddr_entry")) == NULL) {
410 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
411 /* NOTREACHED */
412 }
413 zone_change(se_zone, Z_CALLERACCT, FALSE);
414 zone_change(se_zone, Z_EXPAND, TRUE);
415
416 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
417 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
418 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
419 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
420
421 in_pcbinit();
422 sflt_init();
423 socket_tclass_init();
424 #if MULTIPATH
425 mp_pcbinit();
426 #endif /* MULTIPATH */
427 }
428
429 static void
430 cached_sock_alloc(struct socket **so, int waitok)
431 {
432 caddr_t temp;
433 uintptr_t offset;
434
435 lck_mtx_lock(so_cache_mtx);
436
437 if (!STAILQ_EMPTY(&so_cache_head)) {
438 VERIFY(cached_sock_count > 0);
439
440 *so = STAILQ_FIRST(&so_cache_head);
441 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
442 STAILQ_NEXT((*so), so_cache_ent) = NULL;
443
444 cached_sock_count--;
445 lck_mtx_unlock(so_cache_mtx);
446
447 temp = (*so)->so_saved_pcb;
448 bzero((caddr_t)*so, sizeof (struct socket));
449
450 (*so)->so_saved_pcb = temp;
451 } else {
452
453 lck_mtx_unlock(so_cache_mtx);
454
455 if (waitok)
456 *so = (struct socket *)zalloc(so_cache_zone);
457 else
458 *so = (struct socket *)zalloc_noblock(so_cache_zone);
459
460 if (*so == NULL)
461 return;
462
463 bzero((caddr_t)*so, sizeof (struct socket));
464
465 /*
466 * Define offsets for extra structures into our
467 * single block of memory. Align extra structures
468 * on longword boundaries.
469 */
470
471 offset = (uintptr_t)*so;
472 offset += sizeof (struct socket);
473
474 offset = ALIGN(offset);
475
476 (*so)->so_saved_pcb = (caddr_t)offset;
477 offset += get_inpcb_str_size();
478
479 offset = ALIGN(offset);
480
481 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
482 (caddr_t)offset;
483 }
484
485 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
486 }
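/*
 * Layout sketch (added for clarity, not in the original source) of a cached
 * socket element as carved out by cached_sock_alloc() above; the padding
 * comes from the ALIGN() rounding to longword boundaries:
 *
 *	+-------------------+  <- zalloc(so_cache_zone)
 *	| struct socket     |
 *	+-------------------+  <- ALIGN(): so_saved_pcb
 *	| inpcb storage     |     (get_inpcb_str_size() bytes)
 *	+-------------------+  <- ALIGN(): inp_saved_ppcb
 *	| tcpcb storage     |     (get_tcp_str_size() bytes)
 *	+-------------------+
 *
 * so_cache_zone_element_size in socketinit() adds 4 bytes per boundary to
 * leave room for this alignment.
 */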
487
488 static void
489 cached_sock_free(struct socket *so)
490 {
491
492 lck_mtx_lock(so_cache_mtx);
493
494 so_cache_time = net_uptime();
495 if (++cached_sock_count > max_cached_sock_count) {
496 --cached_sock_count;
497 lck_mtx_unlock(so_cache_mtx);
498 zfree(so_cache_zone, so);
499 } else {
500 if (so_cache_hw < cached_sock_count)
501 so_cache_hw = cached_sock_count;
502
503 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
504
505 so->cache_timestamp = so_cache_time;
506 lck_mtx_unlock(so_cache_mtx);
507 }
508 }
509
510 void
511 so_update_last_owner_locked(struct socket *so, proc_t self)
512 {
513 if (so->last_pid != 0) {
514 /*
515 * last_pid and last_upid should remain zero for sockets
516 * created using sock_socket. The check above achieves that.
517 */
518 if (self == PROC_NULL)
519 self = current_proc();
520
521 if (so->last_upid != proc_uniqueid(self) ||
522 so->last_pid != proc_pid(self)) {
523 so->last_upid = proc_uniqueid(self);
524 so->last_pid = proc_pid(self);
525 proc_getexecutableuuid(self, so->last_uuid,
526 sizeof (so->last_uuid));
527 }
528 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
529 }
530 }
531
532 void
533 so_update_policy(struct socket *so)
534 {
535 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
536 (void) inp_update_policy(sotoinpcb(so));
537 }
538
539 #if NECP
540 static void
541 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
542 struct sockaddr *override_remote_addr)
543 {
544 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
545 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
546 override_remote_addr, 0);
547 }
548 #endif /* NECP */
549
550 boolean_t
551 so_cache_timer(void)
552 {
553 struct socket *p;
554 int n_freed = 0;
555 boolean_t rc = FALSE;
556
557 lck_mtx_lock(so_cache_mtx);
558 so_cache_timeouts++;
559 so_cache_time = net_uptime();
560
561 while (!STAILQ_EMPTY(&so_cache_head)) {
562 VERIFY(cached_sock_count > 0);
563 p = STAILQ_FIRST(&so_cache_head);
564 if ((so_cache_time - p->cache_timestamp) <
565 SO_CACHE_TIME_LIMIT)
566 break;
567
568 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
569 --cached_sock_count;
570
571 zfree(so_cache_zone, p);
572
573 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
574 so_cache_max_freed++;
575 break;
576 }
577 }
578
579 /* Schedule again if there is more to cleanup */
580 if (!STAILQ_EMPTY(&so_cache_head))
581 rc = TRUE;
582
583 lck_mtx_unlock(so_cache_mtx);
584 return (rc);
585 }
586
587 /*
588 * Get a socket structure from our zone, and initialize it.
589 * We don't implement `waitok' yet (see comments in uipc_domain.c).
590 * Note that it would probably be better to allocate socket
591 * and PCB at the same time, but I'm not convinced that all
592 * the protocols can be easily modified to do this.
593 */
594 struct socket *
595 soalloc(int waitok, int dom, int type)
596 {
597 struct socket *so;
598
599 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
600 cached_sock_alloc(&so, waitok);
601 } else {
602 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
603 M_WAITOK);
604 if (so != NULL)
605 bzero(so, sizeof (*so));
606 }
607 if (so != NULL) {
608 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
609 so->so_zone = socket_zone;
610 #if CONFIG_MACF_SOCKET
611 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
612 if (mac_socket_label_init(so, !waitok) != 0) {
613 sodealloc(so);
614 return (NULL);
615 }
616 #endif /* MAC_SOCKET */
617 }
618
619 return (so);
620 }
621
622 int
623 socreate_internal(int dom, struct socket **aso, int type, int proto,
624 struct proc *p, uint32_t flags, struct proc *ep)
625 {
626 struct protosw *prp;
627 struct socket *so;
628 int error = 0;
629
630 #if TCPDEBUG
631 extern int tcpconsdebug;
632 #endif
633
634 VERIFY(aso != NULL);
635 *aso = NULL;
636
637 if (proto != 0)
638 prp = pffindproto(dom, proto, type);
639 else
640 prp = pffindtype(dom, type);
641
642 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
643 if (pffinddomain(dom) == NULL)
644 return (EAFNOSUPPORT);
645 if (proto != 0) {
646 if (pffindprotonotype(dom, proto) != NULL)
647 return (EPROTOTYPE);
648 }
649 return (EPROTONOSUPPORT);
650 }
651 if (prp->pr_type != type)
652 return (EPROTOTYPE);
653 so = soalloc(1, dom, type);
654 if (so == NULL)
655 return (ENOBUFS);
656
657 if (flags & SOCF_ASYNC)
658 so->so_state |= SS_NBIO;
659 #if MULTIPATH
660 if (flags & SOCF_MP_SUBFLOW) {
661 /*
662 * A multipath subflow socket is used internally in the kernel,
663 * therefore it does not have a file descriptor associated by
664 * default.
665 */
666 so->so_state |= SS_NOFDREF;
667 so->so_flags |= SOF_MP_SUBFLOW;
668 }
669 #endif /* MULTIPATH */
670
671 TAILQ_INIT(&so->so_incomp);
672 TAILQ_INIT(&so->so_comp);
673 so->so_type = type;
674 so->last_upid = proc_uniqueid(p);
675 so->last_pid = proc_pid(p);
676 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
677 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
678
679 if (ep != PROC_NULL && ep != p) {
680 so->e_upid = proc_uniqueid(ep);
681 so->e_pid = proc_pid(ep);
682 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
683 so->so_flags |= SOF_DELEGATED;
684 }
685
686 so->so_cred = kauth_cred_proc_ref(p);
687 if (!suser(kauth_cred_get(), NULL))
688 so->so_state |= SS_PRIV;
689
690 so->so_proto = prp;
691 so->so_rcv.sb_flags |= SB_RECV;
692 so->so_rcv.sb_so = so->so_snd.sb_so = so;
693 so->next_lock_lr = 0;
694 so->next_unlock_lr = 0;
695
696 #if CONFIG_MACF_SOCKET
697 mac_socket_label_associate(kauth_cred_get(), so);
698 #endif /* MAC_SOCKET */
699
700 /*
701 * Attachment will create the per-pcb lock if necessary and
702 * increase the refcount for creation; make sure it's done before
703 * the socket is inserted in lists.
704 */
705 so->so_usecount++;
706
707 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
708 if (error != 0) {
709 /*
710 * Warning:
711 * If so_pcb is not zero, the socket will be leaked,
712 * so the protocol attachment handler must be coded carefully.
713 */
714 so->so_state |= SS_NOFDREF;
715 so->so_usecount--;
716 sofreelastref(so, 1); /* will deallocate the socket */
717 return (error);
718 }
719
720 atomic_add_32(&prp->pr_domain->dom_refs, 1);
721 TAILQ_INIT(&so->so_evlist);
722
723 /* Attach socket filters for this protocol */
724 sflt_initsock(so);
725 #if TCPDEBUG
726 if (tcpconsdebug == 2)
727 so->so_options |= SO_DEBUG;
728 #endif
729 so_set_default_traffic_class(so);
730
731 /*
732 * If this thread or task is marked to create backgrounded sockets,
733 * mark the socket as background.
734 */
735 if (proc_get_effective_thread_policy(current_thread(),
736 TASK_POLICY_NEW_SOCKETS_BG)) {
737 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
738 so->so_background_thread = current_thread();
739 }
740
741 switch (dom) {
742 /*
743 * Don't mark Unix domain, system or multipath sockets as
744 * eligible for defunct by default.
745 */
746 case PF_LOCAL:
747 case PF_SYSTEM:
748 case PF_MULTIPATH:
749 so->so_flags |= SOF_NODEFUNCT;
750 break;
751 default:
752 break;
753 }
754
755 /*
756 * Entitlements can't be checked at socket creation time except if the
757 * application requested a feature guarded by a privilege (c.f., socket
758 * delegation).
759 * The priv(9) and the Sandboxing APIs are designed with the idea that
760 * a privilege check should only be triggered by a userland request.
761 * A privilege check at socket creation time is time consuming and
762 * could trigger many authorisation error messages from the security
763 * APIs.
764 */
765
766 *aso = so;
767
768 return (0);
769 }
770
771 /*
772 * Returns: 0 Success
773 * EAFNOSUPPORT
774 * EPROTOTYPE
775 * EPROTONOSUPPORT
776 * ENOBUFS
777 * <pru_attach>:ENOBUFS[AF_UNIX]
778 * <pru_attach>:ENOBUFS[TCP]
779 * <pru_attach>:ENOMEM[TCP]
780 * <pru_attach>:??? [other protocol families, IPSEC]
781 */
782 int
783 socreate(int dom, struct socket **aso, int type, int proto)
784 {
785 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
786 PROC_NULL));
787 }
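/*
 * Illustrative sketch (not part of the original file): an in-kernel caller
 * creating and releasing a TCP socket with the routines above.  Error
 * handling is minimal and a valid current_proc() context is assumed.
 *
 *	struct socket *so = NULL;
 *	int error;
 *
 *	error = socreate(PF_INET, &so, SOCK_STREAM, IPPROTO_TCP);
 *	if (error == 0) {
 *		(use the socket: sobindlock, solisten, soconnect, sosend)
 *		error = soclose(so);
 *	}
 */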
788
789 int
790 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
791 {
792 int error = 0;
793 struct proc *ep = PROC_NULL;
794
795 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
796 error = ESRCH;
797 goto done;
798 }
799
800 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
801
802 /*
803 * It might not be wise to hold the proc reference when calling
804 * socreate_internal since it calls soalloc with M_WAITOK
805 */
806 done:
807 if (ep != PROC_NULL)
808 proc_rele(ep);
809
810 return (error);
811 }
812
813 /*
814 * Returns: 0 Success
815 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
816 * <pru_bind>:EAFNOSUPPORT Address family not supported
817 * <pru_bind>:EADDRNOTAVAIL Address not available.
818 * <pru_bind>:EINVAL Invalid argument
819 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
820 * <pru_bind>:EACCES Permission denied
821 * <pru_bind>:EADDRINUSE Address in use
822 * <pru_bind>:EAGAIN Resource unavailable, try again
823 * <pru_bind>:EPERM Operation not permitted
824 * <pru_bind>:???
825 * <sf_bind>:???
826 *
827 * Notes: It's not possible to fully enumerate the return codes above,
828 * since socket filter authors and protocol family authors may
829 * not choose to limit their error returns to those listed, even
830 * though this may result in some software operating incorrectly.
831 *
832 * The error codes which are enumerated above are those known to
833 * be returned by the tcp_usr_bind function supplied.
834 */
835 int
836 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
837 {
838 struct proc *p = current_proc();
839 int error = 0;
840
841 if (dolock)
842 socket_lock(so, 1);
843 VERIFY(so->so_usecount > 1);
844
845 so_update_last_owner_locked(so, p);
846 so_update_policy(so);
847
848 #if NECP
849 so_update_necp_policy(so, nam, NULL);
850 #endif /* NECP */
851
852 /*
853 * If this is a bind request on a socket that has been marked
854 * as inactive, reject it now before we go any further.
855 */
856 if (so->so_flags & SOF_DEFUNCT) {
857 error = EINVAL;
858 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
859 __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
860 SOCK_DOM(so), SOCK_TYPE(so), error));
861 goto out;
862 }
863
864 /* Socket filter */
865 error = sflt_bind(so, nam);
866
867 if (error == 0)
868 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
869 out:
870 if (dolock)
871 socket_unlock(so, 1);
872
873 if (error == EJUSTRETURN)
874 error = 0;
875
876 return (error);
877 }
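/*
 * Illustrative sketch (not part of the original file): binding an in-kernel
 * IPv4 socket to a local port through sobindlock() with dolock = 1, assuming
 * `so' was created as above and `error' is an int in scope.  EJUSTRETURN from
 * a socket filter is already mapped to 0 by sobindlock() itself.
 *
 *	struct sockaddr_in sin;
 *
 *	bzero(&sin, sizeof (sin));
 *	sin.sin_len = sizeof (sin);
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(8080);
 *	sin.sin_addr.s_addr = htonl(INADDR_ANY);
 *
 *	error = sobindlock(so, (struct sockaddr *)&sin, 1);
 */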
878
879 void
880 sodealloc(struct socket *so)
881 {
882 kauth_cred_unref(&so->so_cred);
883
884 /* Remove any filters */
885 sflt_termsock(so);
886
887 #if CONTENT_FILTER
888 cfil_sock_detach(so);
889 #endif /* CONTENT_FILTER */
890
891 /* Delete the state allocated for msg queues on a socket */
892 if (so->so_flags & SOF_ENABLE_MSGS) {
893 FREE(so->so_msg_state, M_TEMP);
894 so->so_msg_state = NULL;
895 }
896 VERIFY(so->so_msg_state == NULL);
897
898 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
899
900 #if CONFIG_MACF_SOCKET
901 mac_socket_label_destroy(so);
902 #endif /* MAC_SOCKET */
903
904 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
905 cached_sock_free(so);
906 } else {
907 FREE_ZONE(so, sizeof (*so), so->so_zone);
908 }
909 }
910
911 /*
912 * Returns: 0 Success
913 * EINVAL
914 * EOPNOTSUPP
915 * <pru_listen>:EINVAL[AF_UNIX]
916 * <pru_listen>:EINVAL[TCP]
917 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
918 * <pru_listen>:EINVAL[TCP] Invalid argument
919 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
920 * <pru_listen>:EACCES[TCP] Permission denied
921 * <pru_listen>:EADDRINUSE[TCP] Address in use
922 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
923 * <pru_listen>:EPERM[TCP] Operation not permitted
924 * <sf_listen>:???
925 *
926 * Notes: Other <pru_listen> returns depend on the protocol family; all
927 * <sf_listen> returns depend on what the filter author causes
928 * their filter to return.
929 */
930 int
931 solisten(struct socket *so, int backlog)
932 {
933 struct proc *p = current_proc();
934 int error = 0;
935
936 socket_lock(so, 1);
937
938 so_update_last_owner_locked(so, p);
939 so_update_policy(so);
940
941 #if NECP
942 so_update_necp_policy(so, NULL, NULL);
943 #endif /* NECP */
944
945 if (so->so_proto == NULL) {
946 error = EINVAL;
947 goto out;
948 }
949 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
950 error = EOPNOTSUPP;
951 goto out;
952 }
953
954 /*
955 * If the listen request is made on a socket that is not fully
956 * disconnected, or on a socket that has been marked as inactive,
957 * reject the request now.
958 */
959 if ((so->so_state &
960 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
961 (so->so_flags & SOF_DEFUNCT)) {
962 error = EINVAL;
963 if (so->so_flags & SOF_DEFUNCT) {
964 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
965 "(%d)\n", __func__, proc_pid(p),
966 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
967 SOCK_DOM(so), SOCK_TYPE(so), error));
968 }
969 goto out;
970 }
971
972 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
973 error = EPERM;
974 goto out;
975 }
976
977 error = sflt_listen(so);
978 if (error == 0)
979 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
980
981 if (error) {
982 if (error == EJUSTRETURN)
983 error = 0;
984 goto out;
985 }
986
987 if (TAILQ_EMPTY(&so->so_comp))
988 so->so_options |= SO_ACCEPTCONN;
989 /*
990 * POSIX: The implementation may have an upper limit on the length of
991 * the listen queue, either global or per accepting socket. If backlog
992 * exceeds this limit, the length of the listen queue is set to the
993 * limit.
994 *
995 * If listen() is called with a backlog argument value that is less
996 * than 0, the function behaves as if it had been called with a backlog
997 * argument value of 0.
998 *
999 * A backlog argument of 0 may allow the socket to accept connections,
1000 * in which case the length of the listen queue may be set to an
1001 * implementation-defined minimum value.
1002 */
1003 if (backlog <= 0 || backlog > somaxconn)
1004 backlog = somaxconn;
1005
1006 so->so_qlimit = backlog;
1007 out:
1008 socket_unlock(so, 1);
1009 return (error);
1010 }
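/*
 * Illustrative note (not part of the original file): how the backlog clamping
 * at the end of solisten() plays out for a caller, assuming somaxconn is at
 * its default of SOMAXCONN:
 *
 *	error = solisten(so, 16);	so_qlimit = 16
 *	error = solisten(so, 4096);	so_qlimit = somaxconn (clamped)
 *	error = solisten(so, -1);	so_qlimit = somaxconn (backlog <= 0)
 */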
1011
1012 void
1013 sofreelastref(struct socket *so, int dealloc)
1014 {
1015 struct socket *head = so->so_head;
1016
1017 /* Assume socket is locked */
1018
1019 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1020 selthreadclear(&so->so_snd.sb_sel);
1021 selthreadclear(&so->so_rcv.sb_sel);
1022 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1023 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1024 so->so_event = sonullevent;
1025 return;
1026 }
1027 if (head != NULL) {
1028 socket_lock(head, 1);
1029 if (so->so_state & SS_INCOMP) {
1030 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1031 head->so_incqlen--;
1032 } else if (so->so_state & SS_COMP) {
1033 /*
1034 * We must not decommission a socket that's
1035 * on the accept(2) queue. If we do, then
1036 * accept(2) may hang after select(2) indicated
1037 * that the listening socket was ready.
1038 */
1039 selthreadclear(&so->so_snd.sb_sel);
1040 selthreadclear(&so->so_rcv.sb_sel);
1041 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1042 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1043 so->so_event = sonullevent;
1044 socket_unlock(head, 1);
1045 return;
1046 } else {
1047 panic("sofree: not queued");
1048 }
1049 head->so_qlen--;
1050 so->so_state &= ~SS_INCOMP;
1051 so->so_head = NULL;
1052 socket_unlock(head, 1);
1053 }
1054 sowflush(so);
1055 sorflush(so);
1056
1057 #if FLOW_DIVERT
1058 if (so->so_flags & SOF_FLOW_DIVERT) {
1059 flow_divert_detach(so);
1060 }
1061 #endif /* FLOW_DIVERT */
1062
1063 /* 3932268: disable upcall */
1064 so->so_rcv.sb_flags &= ~SB_UPCALL;
1065 so->so_snd.sb_flags &= ~SB_UPCALL;
1066 so->so_event = sonullevent;
1067
1068 if (dealloc)
1069 sodealloc(so);
1070 }
1071
1072 void
1073 soclose_wait_locked(struct socket *so)
1074 {
1075 lck_mtx_t *mutex_held;
1076
1077 if (so->so_proto->pr_getlock != NULL)
1078 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1079 else
1080 mutex_held = so->so_proto->pr_domain->dom_mtx;
1081 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1082
1083 /*
1084 * Double check here and return if there's no outstanding upcall;
1085 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1086 */
1087 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1088 return;
1089 so->so_rcv.sb_flags &= ~SB_UPCALL;
1090 so->so_snd.sb_flags &= ~SB_UPCALL;
1091 so->so_flags |= SOF_CLOSEWAIT;
1092 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1093 "soclose_wait_locked", NULL);
1094 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1095 so->so_flags &= ~SOF_CLOSEWAIT;
1096 }
1097
1098 /*
1099 * Close a socket on last file table reference removal.
1100 * Initiate disconnect if connected.
1101 * Free socket when disconnect complete.
1102 */
1103 int
1104 soclose_locked(struct socket *so)
1105 {
1106 int error = 0;
1107 lck_mtx_t *mutex_held;
1108 struct timespec ts;
1109
1110 if (so->so_usecount == 0) {
1111 panic("soclose: so=%p refcount=0\n", so);
1112 /* NOTREACHED */
1113 }
1114
1115 sflt_notify(so, sock_evt_closing, NULL);
1116
1117 if (so->so_upcallusecount)
1118 soclose_wait_locked(so);
1119
1120 #if CONTENT_FILTER
1121 /*
1122 * We have to wait until the content filters are done
1123 */
1124 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1125 cfil_sock_close_wait(so);
1126 cfil_sock_is_closed(so);
1127 cfil_sock_detach(so);
1128 }
1129 #endif /* CONTENT_FILTER */
1130
1131 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1132 soresume(current_proc(), so, 1);
1133 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1134 }
1135
1136 if ((so->so_options & SO_ACCEPTCONN)) {
1137 struct socket *sp, *sonext;
1138 int socklock = 0;
1139
1140 /*
1141 * We do not want new connections to be added
1142 * to the connection queues.
1143 */
1144 so->so_options &= ~SO_ACCEPTCONN;
1145
1146 for (sp = TAILQ_FIRST(&so->so_incomp);
1147 sp != NULL; sp = sonext) {
1148 sonext = TAILQ_NEXT(sp, so_list);
1149
1150 /*
1151 * Radar 5350314
1152 * Skip sockets thrown away by tcp_dropdropablreq();
1153 * they will get cleaned up by the garbage collection.
1154 * Otherwise, remove the incomp socket from the queue
1155 * and let soabort() trigger the appropriate cleanup.
1156 */
1157 if (sp->so_flags & SOF_OVERFLOW)
1158 continue;
1159
1160 if (so->so_proto->pr_getlock != NULL) {
1161 /*
1162 * For lock ordering consistency with the
1163 * rest of the stack, we lock the socket
1164 * first and then grab the head.
1165 */
1166 socket_unlock(so, 0);
1167 socket_lock(sp, 1);
1168 socket_lock(so, 0);
1169 socklock = 1;
1170 }
1171
1172 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1173 so->so_incqlen--;
1174
1175 if (sp->so_state & SS_INCOMP) {
1176 sp->so_state &= ~SS_INCOMP;
1177 sp->so_head = NULL;
1178
1179 (void) soabort(sp);
1180 }
1181
1182 if (socklock)
1183 socket_unlock(sp, 1);
1184 }
1185
1186 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1187 /* Dequeue from so_comp since sofree() won't do it */
1188 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1189 so->so_qlen--;
1190
1191 if (so->so_proto->pr_getlock != NULL) {
1192 socket_unlock(so, 0);
1193 socket_lock(sp, 1);
1194 }
1195
1196 if (sp->so_state & SS_COMP) {
1197 sp->so_state &= ~SS_COMP;
1198 sp->so_head = NULL;
1199
1200 (void) soabort(sp);
1201 }
1202
1203 if (so->so_proto->pr_getlock != NULL) {
1204 socket_unlock(sp, 1);
1205 socket_lock(so, 0);
1206 }
1207 }
1208 }
1209 if (so->so_pcb == NULL) {
1210 /* 3915887: mark the socket as ready for dealloc */
1211 so->so_flags |= SOF_PCBCLEARING;
1212 goto discard;
1213 }
1214 if (so->so_state & SS_ISCONNECTED) {
1215 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1216 error = sodisconnectlocked(so);
1217 if (error)
1218 goto drop;
1219 }
1220 if (so->so_options & SO_LINGER) {
1221 if ((so->so_state & SS_ISDISCONNECTING) &&
1222 (so->so_state & SS_NBIO))
1223 goto drop;
1224 if (so->so_proto->pr_getlock != NULL)
1225 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1226 else
1227 mutex_held = so->so_proto->pr_domain->dom_mtx;
1228 while (so->so_state & SS_ISCONNECTED) {
1229 ts.tv_sec = (so->so_linger/100);
1230 ts.tv_nsec = (so->so_linger % 100) *
1231 NSEC_PER_USEC * 1000 * 10;
1232 error = msleep((caddr_t)&so->so_timeo,
1233 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1234 if (error) {
1235 /*
1236 * It's OK when the timer fires;
1237 * don't report an error.
1238 */
1239 if (error == EWOULDBLOCK)
1240 error = 0;
1241 break;
1242 }
1243 }
1244 }
1245 }
1246 drop:
1247 if (so->so_usecount == 0) {
1248 panic("soclose: usecount is zero so=%p\n", so);
1249 /* NOTREACHED */
1250 }
1251 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1252 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1253 if (error == 0)
1254 error = error2;
1255 }
1256 if (so->so_usecount <= 0) {
1257 panic("soclose: usecount is zero so=%p\n", so);
1258 /* NOTREACHED */
1259 }
1260 discard:
1261 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1262 (so->so_state & SS_NOFDREF)) {
1263 panic("soclose: NOFDREF");
1264 /* NOTREACHED */
1265 }
1266 so->so_state |= SS_NOFDREF;
1267
1268 if (so->so_flags & SOF_MP_SUBFLOW)
1269 so->so_flags &= ~SOF_MP_SUBFLOW;
1270
1271 if ((so->so_flags & SOF_KNOTE) != 0)
1272 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1273
1274 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1275 evsofree(so);
1276
1277 so->so_usecount--;
1278 sofree(so);
1279 return (error);
1280 }
1281
1282 int
1283 soclose(struct socket *so)
1284 {
1285 int error = 0;
1286 socket_lock(so, 1);
1287
1288 if (so->so_retaincnt == 0) {
1289 error = soclose_locked(so);
1290 } else {
1291 /*
1292 * if the FD is going away, but socket is
1293 * retained in kernel remove its reference
1294 */
1295 so->so_usecount--;
1296 if (so->so_usecount < 2)
1297 panic("soclose: retaincnt non null and so=%p "
1298 "usecount=%d\n", so, so->so_usecount);
1299 }
1300 socket_unlock(so, 1);
1301 return (error);
1302 }
1303
1304 /*
1305 * Must be called at splnet...
1306 */
1307 /* Should already be locked */
1308 int
1309 soabort(struct socket *so)
1310 {
1311 int error;
1312
1313 #ifdef MORE_LOCKING_DEBUG
1314 lck_mtx_t *mutex_held;
1315
1316 if (so->so_proto->pr_getlock != NULL)
1317 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1318 else
1319 mutex_held = so->so_proto->pr_domain->dom_mtx;
1320 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1321 #endif
1322
1323 if ((so->so_flags & SOF_ABORTED) == 0) {
1324 so->so_flags |= SOF_ABORTED;
1325 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1326 if (error) {
1327 sofree(so);
1328 return (error);
1329 }
1330 }
1331 return (0);
1332 }
1333
1334 int
1335 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1336 {
1337 int error;
1338
1339 if (dolock)
1340 socket_lock(so, 1);
1341
1342 so_update_last_owner_locked(so, PROC_NULL);
1343 so_update_policy(so);
1344 #if NECP
1345 so_update_necp_policy(so, NULL, NULL);
1346 #endif /* NECP */
1347
1348 if ((so->so_state & SS_NOFDREF) == 0)
1349 panic("soaccept: !NOFDREF");
1350 so->so_state &= ~SS_NOFDREF;
1351 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1352
1353 if (dolock)
1354 socket_unlock(so, 1);
1355 return (error);
1356 }
1357
1358 int
1359 soaccept(struct socket *so, struct sockaddr **nam)
1360 {
1361 return (soacceptlock(so, nam, 1));
1362 }
1363
1364 int
1365 soacceptfilter(struct socket *so)
1366 {
1367 struct sockaddr *local = NULL, *remote = NULL;
1368 int error = 0;
1369 struct socket *head = so->so_head;
1370
1371 /*
1372 * Hold the lock even if this socket has not been made visible
1373 * to the filter(s). For sockets with global locks, this protects
1374 * against the head or peer going away.
1375 */
1376 socket_lock(so, 1);
1377 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1378 sogetaddr_locked(so, &local, 0) != 0) {
1379 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1380 so->so_head = NULL;
1381 socket_unlock(so, 1);
1382 soclose(so);
1383 /* Out of resources; try it again next time */
1384 error = ECONNABORTED;
1385 goto done;
1386 }
1387
1388 error = sflt_accept(head, so, local, remote);
1389
1390 /*
1391 * If we get EJUSTRETURN from one of the filters, mark this socket
1392 * as inactive and return it anyway. This newly accepted socket
1393 * will be disconnected later before we hand it off to the caller.
1394 */
1395 if (error == EJUSTRETURN) {
1396 error = 0;
1397 (void) sosetdefunct(current_proc(), so,
1398 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1399 }
1400
1401 if (error != 0) {
1402 /*
1403 * This may seem like a duplication of the above error
1404 * handling path where we return ECONNABORTED, except
1405 * the following is done while holding the lock since
1406 * the socket has been exposed to the filter(s) earlier.
1407 */
1408 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1409 so->so_head = NULL;
1410 socket_unlock(so, 1);
1411 soclose(so);
1412 /* Propagate socket filter's error code to the caller */
1413 } else {
1414 socket_unlock(so, 1);
1415 }
1416 done:
1417 /* Callee checks for NULL pointer */
1418 sock_freeaddr(remote);
1419 sock_freeaddr(local);
1420 return (error);
1421 }
1422
1423 /*
1424 * Returns: 0 Success
1425 * EOPNOTSUPP Operation not supported on socket
1426 * EISCONN Socket is connected
1427 * <pru_connect>:EADDRNOTAVAIL Address not available.
1428 * <pru_connect>:EINVAL Invalid argument
1429 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1430 * <pru_connect>:EACCES Permission denied
1431 * <pru_connect>:EADDRINUSE Address in use
1432 * <pru_connect>:EAGAIN Resource unavailable, try again
1433 * <pru_connect>:EPERM Operation not permitted
1434 * <sf_connect_out>:??? [anything a filter writer might set]
1435 */
1436 int
1437 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1438 {
1439 int error;
1440 struct proc *p = current_proc();
1441
1442 if (dolock)
1443 socket_lock(so, 1);
1444
1445 so_update_last_owner_locked(so, p);
1446 so_update_policy(so);
1447
1448 #if NECP
1449 so_update_necp_policy(so, NULL, nam);
1450 #endif /* NECP */
1451
1452 /*
1453 * If this is a listening socket or if this is a previously-accepted
1454 * socket that has been marked as inactive, reject the connect request.
1455 */
1456 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1457 error = EOPNOTSUPP;
1458 if (so->so_flags & SOF_DEFUNCT) {
1459 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1460 "(%d)\n", __func__, proc_pid(p),
1461 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1462 SOCK_DOM(so), SOCK_TYPE(so), error));
1463 }
1464 if (dolock)
1465 socket_unlock(so, 1);
1466 return (error);
1467 }
1468
1469 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1470 if (dolock)
1471 socket_unlock(so, 1);
1472 return (EPERM);
1473 }
1474
1475 /*
1476 * If protocol is connection-based, can only connect once.
1477 * Otherwise, if connected, try to disconnect first.
1478 * This allows user to disconnect by connecting to, e.g.,
1479 * a null address.
1480 */
1481 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1482 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1483 (error = sodisconnectlocked(so)))) {
1484 error = EISCONN;
1485 } else {
1486 /*
1487 * Run connect filter before calling protocol:
1488 * - non-blocking connect returns before completion;
1489 */
1490 error = sflt_connectout(so, nam);
1491 if (error != 0) {
1492 if (error == EJUSTRETURN)
1493 error = 0;
1494 } else {
1495 error = (*so->so_proto->pr_usrreqs->pru_connect)
1496 (so, nam, p);
1497 }
1498 }
1499 if (dolock)
1500 socket_unlock(so, 1);
1501 return (error);
1502 }
1503
1504 int
1505 soconnect(struct socket *so, struct sockaddr *nam)
1506 {
1507 return (soconnectlock(so, nam, 1));
1508 }
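/*
 * Illustrative sketch (not part of the original file): an in-kernel connect
 * through soconnect().  For a socket created with SOCF_ASYNC (SS_NBIO set),
 * the call returns once the connection attempt has been started; callers
 * then typically watch SS_ISCONNECTING/SS_ISCONNECTED or so_error for
 * completion.
 *
 *	struct sockaddr_in sin;
 *
 *	bzero(&sin, sizeof (sin));
 *	sin.sin_len = sizeof (sin);
 *	sin.sin_family = AF_INET;
 *	sin.sin_port = htons(80);
 *	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
 *
 *	error = soconnect(so, (struct sockaddr *)&sin);
 */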
1509
1510 /*
1511 * Returns: 0 Success
1512 * <pru_connect2>:EINVAL[AF_UNIX]
1513 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1514 * <pru_connect2>:??? [other protocol families]
1515 *
1516 * Notes: <pru_connect2> is not supported by [TCP].
1517 */
1518 int
1519 soconnect2(struct socket *so1, struct socket *so2)
1520 {
1521 int error;
1522
1523 socket_lock(so1, 1);
1524 if (so2->so_proto->pr_lock)
1525 socket_lock(so2, 1);
1526
1527 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1528
1529 socket_unlock(so1, 1);
1530 if (so2->so_proto->pr_lock)
1531 socket_unlock(so2, 1);
1532 return (error);
1533 }
1534
1535 int
1536 soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1537 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1538 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1539 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1540 {
1541 int error;
1542
1543 so_update_last_owner_locked(so, p);
1544 so_update_policy(so);
1545
1546 /*
1547 * If this is a listening socket or if this is a previously-accepted
1548 * socket that has been marked as inactive, reject the connect request.
1549 */
1550 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1551 error = EOPNOTSUPP;
1552 if (so->so_flags & SOF_DEFUNCT) {
1553 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1554 "(%d)\n", __func__, proc_pid(p),
1555 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1556 SOCK_DOM(so), SOCK_TYPE(so), error));
1557 }
1558 return (error);
1559 }
1560
1561 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1562 return (EPERM);
1563
1564 /*
1565 * If protocol is connection-based, can only connect once
1566 * unless PR_MULTICONN is set. Otherwise, if connected,
1567 * try to disconnect first. This allows user to disconnect
1568 * by connecting to, e.g., a null address.
1569 */
1570 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1571 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1572 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1573 (error = sodisconnectlocked(so)) != 0)) {
1574 error = EISCONN;
1575 } else {
1576 /*
1577 * Run connect filter before calling protocol:
1578 * - non-blocking connect returns before completion;
1579 */
1580 error = sflt_connectxout(so, dst_sl);
1581 if (error != 0) {
1582 if (error == EJUSTRETURN)
1583 error = 0;
1584 } else {
1585 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1586 (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1587 flags, arg, arglen, auio, bytes_written);
1588 }
1589 }
1590
1591 return (error);
1592 }
1593
1594 int
1595 sodisconnectlocked(struct socket *so)
1596 {
1597 int error;
1598
1599 if ((so->so_state & SS_ISCONNECTED) == 0) {
1600 error = ENOTCONN;
1601 goto bad;
1602 }
1603 if (so->so_state & SS_ISDISCONNECTING) {
1604 error = EALREADY;
1605 goto bad;
1606 }
1607
1608 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1609 if (error == 0)
1610 sflt_notify(so, sock_evt_disconnected, NULL);
1611
1612 bad:
1613 return (error);
1614 }
1615
1616 /* Locking version */
1617 int
1618 sodisconnect(struct socket *so)
1619 {
1620 int error;
1621
1622 socket_lock(so, 1);
1623 error = sodisconnectlocked(so);
1624 socket_unlock(so, 1);
1625 return (error);
1626 }
1627
1628 int
1629 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1630 {
1631 int error;
1632
1633 /*
1634 * Call the protocol disconnectx handler; let it handle all
1635 * matters related to the connection state of this session.
1636 */
1637 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1638 if (error == 0) {
1639 /*
1640 * The event applies only for the session, not for
1641 * the disconnection of individual subflows.
1642 */
1643 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1644 sflt_notify(so, sock_evt_disconnected, NULL);
1645 }
1646 return (error);
1647 }
1648
1649 int
1650 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1651 {
1652 int error;
1653
1654 socket_lock(so, 1);
1655 error = sodisconnectxlocked(so, aid, cid);
1656 socket_unlock(so, 1);
1657 return (error);
1658 }
1659
1660 int
1661 sopeelofflocked(struct socket *so, sae_associd_t aid, struct socket **psop)
1662 {
1663 return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1664 }
1665
1666 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1667
1668 /*
1669 * sosendcheck will lock the socket buffer if it isn't locked and
1670 * verify that there is space for the data being inserted.
1671 *
1672 * Returns: 0 Success
1673 * EPIPE
1674 * sblock:EWOULDBLOCK
1675 * sblock:EINTR
1676 * sbwait:EBADF
1677 * sbwait:EINTR
1678 * [so_error]:???
1679 */
1680 int
1681 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1682 int32_t clen, int32_t atomic, int flags, int *sblocked,
1683 struct mbuf *control)
1684 {
1685 int error = 0;
1686 int32_t space;
1687 int assumelock = 0;
1688
1689 restart:
1690 if (*sblocked == 0) {
1691 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1692 so->so_send_filt_thread != 0 &&
1693 so->so_send_filt_thread == current_thread()) {
1694 /*
1695 * We're being called recursively from a filter,
1696 * allow this to continue. Radar 4150520.
1697 * Don't set sblocked because we don't want
1698 * to perform an unlock later.
1699 */
1700 assumelock = 1;
1701 } else {
1702 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1703 if (error) {
1704 if (so->so_flags & SOF_DEFUNCT)
1705 goto defunct;
1706 return (error);
1707 }
1708 *sblocked = 1;
1709 }
1710 }
1711
1712 /*
1713 * If a send attempt is made on a socket that has been marked
1714 * as inactive (disconnected), reject the request.
1715 */
1716 if (so->so_flags & SOF_DEFUNCT) {
1717 defunct:
1718 error = EPIPE;
1719 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1720 __func__, proc_selfpid(),
1721 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1722 SOCK_DOM(so), SOCK_TYPE(so), error));
1723 return (error);
1724 }
1725
1726 if (so->so_state & SS_CANTSENDMORE) {
1727 #if CONTENT_FILTER
1728 /*
1729 * Can re-inject data of half-closed connections
1730 */
1731 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1732 so->so_snd.sb_cfil_thread == current_thread() &&
1733 cfil_sock_data_pending(&so->so_snd) != 0)
1734 CFIL_LOG(LOG_INFO,
1735 "so %llx ignore SS_CANTSENDMORE",
1736 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1737 else
1738 #endif /* CONTENT_FILTER */
1739 return (EPIPE);
1740 }
1741 if (so->so_error) {
1742 error = so->so_error;
1743 so->so_error = 0;
1744 return (error);
1745 }
1746
1747 if ((so->so_state & SS_ISCONNECTED) == 0) {
1748 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1749 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1750 (resid != 0 || clen == 0) &&
1751 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1752 #if MPTCP
1753 /*
1754 * MPTCP Fast Join sends data before the
1755 * socket is truly connected.
1756 */
1757 if ((so->so_flags & (SOF_MP_SUBFLOW |
1758 SOF_MPTCP_FASTJOIN)) !=
1759 (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1760 #endif /* MPTCP */
1761 return (ENOTCONN);
1762 }
1763 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1764 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1765 ENOTCONN : EDESTADDRREQ);
1766 }
1767 }
1768
1769 if (so->so_flags & SOF_ENABLE_MSGS)
1770 space = msgq_sbspace(so, control);
1771 else
1772 space = sbspace(&so->so_snd);
1773
1774 if (flags & MSG_OOB)
1775 space += 1024;
1776 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1777 clen > so->so_snd.sb_hiwat)
1778 return (EMSGSIZE);
1779
1780 if ((space < resid + clen &&
1781 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1782 space < clen)) ||
1783 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1784 /*
1785 * don't block the connectx call when there's more data
1786 * than can be copied.
1787 */
1788 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1789 if (space == 0) {
1790 return (EWOULDBLOCK);
1791 }
1792 if (space < (int32_t)so->so_snd.sb_lowat) {
1793 return (0);
1794 }
1795 }
1796 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1797 assumelock) {
1798 return (EWOULDBLOCK);
1799 }
1800 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1801 *sblocked = 0;
1802 error = sbwait(&so->so_snd);
1803 if (error) {
1804 if (so->so_flags & SOF_DEFUNCT)
1805 goto defunct;
1806 return (error);
1807 }
1808 goto restart;
1809 }
1810 return (0);
1811 }
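/*
 * Illustrative sketch (not part of the original file): feeding sosend() an
 * mbuf chain instead of a uio, as described in the comment below.  The names
 * `payload' and `payload_len' are placeholders; the payload is assumed to fit
 * in a single packet-header mbuf (at most MHLEN bytes).  sosend() frees both
 * the data and any control mbufs on return, whatever the outcome.
 *
 *	struct mbuf *top;
 *
 *	top = m_gethdr(M_WAIT, MT_DATA);
 *	if (top != NULL) {
 *		bcopy(payload, mtod(top, void *), payload_len);
 *		top->m_len = top->m_pkthdr.len = payload_len;
 *		error = sosend(so, NULL, NULL, top, NULL, 0);
 *	}
 */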
1812
1813 /*
1814 * Send on a socket.
1815 * If send must go all at once and message is larger than
1816 * send buffering, then hard error.
1817 * Lock against other senders.
1818 * If must go all at once and not enough room now, then
1819 * inform user that this would block and do nothing.
1820 * Otherwise, if nonblocking, send as much as possible.
1821 * The data to be sent is described by "uio" if nonzero,
1822 * otherwise by the mbuf chain "top" (which must be null
1823 * if uio is not). Data provided in mbuf chain must be small
1824 * enough to send all at once.
1825 *
1826 * Returns nonzero on error, timeout or signal; callers
1827 * must check for short counts if EINTR/ERESTART are returned.
1828 * Data and control buffers are freed on return.
1829 * Experiment:
1830 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1831 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1832 * point at the mbuf chain being constructed and go from there.
1833 *
1834 * Returns: 0 Success
1835 * EOPNOTSUPP
1836 * EINVAL
1837 * ENOBUFS
1838 * uiomove:EFAULT
1839 * sosendcheck:EPIPE
1840 * sosendcheck:EWOULDBLOCK
1841 * sosendcheck:EINTR
1842 * sosendcheck:EBADF
1843 * sosendcheck:EINTR
1844 * sosendcheck:??? [value from so_error]
1845 * <pru_send>:ECONNRESET[TCP]
1846 * <pru_send>:EINVAL[TCP]
1847 * <pru_send>:ENOBUFS[TCP]
1848 * <pru_send>:EADDRINUSE[TCP]
1849 * <pru_send>:EADDRNOTAVAIL[TCP]
1850 * <pru_send>:EAFNOSUPPORT[TCP]
1851 * <pru_send>:EACCES[TCP]
1852 * <pru_send>:EAGAIN[TCP]
1853 * <pru_send>:EPERM[TCP]
1854 * <pru_send>:EMSGSIZE[TCP]
1855 * <pru_send>:EHOSTUNREACH[TCP]
1856 * <pru_send>:ENETUNREACH[TCP]
1857 * <pru_send>:ENETDOWN[TCP]
1858 * <pru_send>:ENOMEM[TCP]
1859 * <pru_send>:ENOBUFS[TCP]
1860 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1861 * <pru_send>:EINVAL[AF_UNIX]
1862 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1863 * <pru_send>:EPIPE[AF_UNIX]
1864 * <pru_send>:ENOTCONN[AF_UNIX]
1865 * <pru_send>:EISCONN[AF_UNIX]
1866 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1867 * <sf_data_out>:??? [whatever a filter author chooses]
1868 *
1869 * Notes: Other <pru_send> returns depend on the protocol family; all
1870 * <sf_data_out> returns depend on what the filter author causes
1871 * their filter to return.
1872 */
1873 int
1874 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1875 struct mbuf *top, struct mbuf *control, int flags)
1876 {
1877 struct mbuf **mp;
1878 struct mbuf *m, *freelist = NULL;
1879 user_ssize_t space, len, resid, orig_resid;
1880 int clen = 0, error, dontroute, mlen, sendflags;
1881 int atomic = sosendallatonce(so) || top;
1882 int sblocked = 0;
1883 struct proc *p = current_proc();
1884 struct mbuf *control_copy = NULL;
1885 uint16_t headroom = 0;
1886 boolean_t en_tracing = FALSE;
1887
1888 if (uio != NULL)
1889 resid = uio_resid(uio);
1890 else
1891 resid = top->m_pkthdr.len;
1892
1893 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1894 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1895
1896 socket_lock(so, 1);
1897
1898 /*
1899 * Trace only if tracing is enabled, this is a network (vs. unix)
1900 * socket, and it is non-loopback.
1901 */
1902 if (ENTR_SHOULDTRACE &&
1903 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1904 struct inpcb *inp = sotoinpcb(so);
1905 if (inp->inp_last_outifp != NULL &&
1906 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1907 en_tracing = TRUE;
1908 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1909 VM_KERNEL_ADDRPERM(so),
1910 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1911 (int64_t)resid);
1912 orig_resid = resid;
1913 }
1914 }
1915
1916 /*
1917 * Re-injection should not affect process accounting
1918 */
1919 if ((flags & MSG_SKIPCFIL) == 0) {
1920 so_update_last_owner_locked(so, p);
1921 so_update_policy(so);
1922
1923 #if NECP
1924 so_update_necp_policy(so, NULL, addr);
1925 #endif /* NECP */
1926 }
1927
1928 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1929 error = EOPNOTSUPP;
1930 socket_unlock(so, 1);
1931 goto out;
1932 }
1933
1934 /*
1935 * In theory resid should be unsigned.
1936 * However, space must be signed, as it might be less than 0
1937 * if we over-committed, and we must use a signed comparison
1938 * of space and resid. On the other hand, a negative resid
1939 * causes us to loop sending 0-length segments to the protocol.
1940 *
1941 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1942 * But it will be used by sockets doing message delivery.
1943 *
1944 * Note: We limit resid to be a positive int value as we use
1945 * imin() to set bytes_to_copy -- radr://14558484
1946 */
1947 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
1948 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1949 error = EINVAL;
1950 socket_unlock(so, 1);
1951 goto out;
1952 }
1953
1954 dontroute = (flags & MSG_DONTROUTE) &&
1955 (so->so_options & SO_DONTROUTE) == 0 &&
1956 (so->so_proto->pr_flags & PR_ATOMIC);
1957 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1958
1959 if (control != NULL)
1960 clen = control->m_len;
1961
1962 if (soreserveheadroom != 0)
1963 headroom = so->so_pktheadroom;
1964
1965 do {
1966 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1967 &sblocked, control);
1968 if (error)
1969 goto release;
1970
1971 mp = &top;
1972 if (so->so_flags & SOF_ENABLE_MSGS)
1973 space = msgq_sbspace(so, control);
1974 else
1975 space = sbspace(&so->so_snd) - clen;
1976 space += ((flags & MSG_OOB) ? 1024 : 0);
1977
1978 do {
1979 if (uio == NULL) {
1980 /*
1981 * Data is prepackaged in "top".
1982 */
1983 resid = 0;
1984 if (flags & MSG_EOR)
1985 top->m_flags |= M_EOR;
1986 } else {
1987 int chainlength;
1988 int bytes_to_copy;
1989 boolean_t jumbocl;
1990 boolean_t bigcl;
1991 int bytes_to_alloc;
1992
1993 bytes_to_copy = imin(resid, space);
1994
1995 bytes_to_alloc = bytes_to_copy;
1996 if (top == NULL)
1997 bytes_to_alloc += headroom;
1998
1999 if (sosendminchain > 0)
2000 chainlength = 0;
2001 else
2002 chainlength = sosendmaxchain;
2003
2004 /*
2005 * Use big 4 KB clusters when the outgoing interface
2006 * does not prefer 2 KB clusters
2007 */
2008 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2009 sosendbigcl_ignore_capab;
2010
2011 /*
2012 * Attempt to use larger than system page-size
2013 * clusters for large writes only if there is
2014 * a jumbo cluster pool and if the socket is
2015 * marked accordingly.
2016 */
2017 jumbocl = sosendjcl && njcl > 0 &&
2018 ((so->so_flags & SOF_MULTIPAGES) ||
2019 sosendjcl_ignore_capab) &&
2020 bigcl;
2021
2022 socket_unlock(so, 0);
2023
2024 do {
2025 int num_needed;
2026 int hdrs_needed = (top == NULL) ? 1 : 0;
2027
2028 /*
2029 * Try to maintain a local cache of the mbuf
2030 * clusters needed to complete this
2031 * write; the list is further limited to
2032 * the number that are currently needed
2033 * to fill the socket. This mechanism
2034 * allows a large number of mbufs/
2035 * clusters to be grabbed under a single
2036 * mbuf lock... if we can't get any
2037 * clusters, then fall back to trying
2038 * for mbufs. If we fail early (or
2039 * miscalculate the number needed), make
2040 * sure to release any clusters we
2041 * haven't yet consumed.
2042 */
2043 if (freelist == NULL &&
2044 bytes_to_alloc > MBIGCLBYTES &&
2045 jumbocl) {
2046 num_needed =
2047 bytes_to_alloc / M16KCLBYTES;
2048
2049 if ((bytes_to_alloc -
2050 (num_needed * M16KCLBYTES))
2051 >= MINCLSIZE)
2052 num_needed++;
2053
2054 freelist =
2055 m_getpackets_internal(
2056 (unsigned int *)&num_needed,
2057 hdrs_needed, M_WAIT, 0,
2058 M16KCLBYTES);
2059 /*
2060 * Fall back to 4K cluster size
2061 * if allocation failed
2062 */
2063 }
2064
2065 if (freelist == NULL &&
2066 bytes_to_alloc > MCLBYTES &&
2067 bigcl) {
2068 num_needed =
2069 bytes_to_alloc / MBIGCLBYTES;
2070
2071 if ((bytes_to_alloc -
2072 (num_needed * MBIGCLBYTES)) >=
2073 MINCLSIZE)
2074 num_needed++;
2075
2076 freelist =
2077 m_getpackets_internal(
2078 (unsigned int *)&num_needed,
2079 hdrs_needed, M_WAIT, 0,
2080 MBIGCLBYTES);
2081 /*
2082 * Fall back to cluster size
2083 * if allocation failed
2084 */
2085 }
2086
2087 /*
2088 * Allocate a cluster as we want to
2089 * avoid splitting the data into more
2090 * than one segment; using MINCLSIZE
2091 * would lead us to allocate two mbufs.
2092 */
2093 if (soreserveheadroom != 0 &&
2094 freelist == NULL &&
2095 ((top == NULL &&
2096 bytes_to_alloc > _MHLEN) ||
2097 bytes_to_alloc > _MLEN)) {
2098 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2099 MCLBYTES;
2100 freelist =
2101 m_getpackets_internal(
2102 (unsigned int *)&num_needed,
2103 hdrs_needed, M_WAIT, 0,
2104 MCLBYTES);
2105 /*
2106 * Fall back to a single mbuf
2107 * if allocation failed
2108 */
2109 } else if (freelist == NULL &&
2110 bytes_to_alloc > MINCLSIZE) {
2111 num_needed =
2112 bytes_to_alloc / MCLBYTES;
2113
2114 if ((bytes_to_alloc -
2115 (num_needed * MCLBYTES)) >=
2116 MINCLSIZE)
2117 num_needed++;
2118
2119 freelist =
2120 m_getpackets_internal(
2121 (unsigned int *)&num_needed,
2122 hdrs_needed, M_WAIT, 0,
2123 MCLBYTES);
2124 /*
2125 * Fall back to a single mbuf
2126 * if allocation failed
2127 */
2128 }
2129 /*
2130 * For datagram protocols, leave
2131 * headroom for protocol headers
2132 * in the first cluster of the chain
2133 */
2134 if (freelist != NULL && atomic &&
2135 top == NULL && headroom > 0) {
2136 freelist->m_data += headroom;
2137 }
2138
2139 /*
2140 * Fall back to regular mbufs without
2141 * reserving the socket headroom
2142 */
2143 if (freelist == NULL) {
2144 if (top == NULL)
2145 MGETHDR(freelist,
2146 M_WAIT, MT_DATA);
2147 else
2148 MGET(freelist,
2149 M_WAIT, MT_DATA);
2150
2151 if (freelist == NULL) {
2152 error = ENOBUFS;
2153 socket_lock(so, 0);
2154 goto release;
2155 }
2156 /*
2157 * For datagram protocols,
2158 * leave room for protocol
2159 * headers in first mbuf.
2160 */
2161 if (atomic && top == NULL &&
2162 bytes_to_copy < MHLEN) {
2163 MH_ALIGN(freelist,
2164 bytes_to_copy);
2165 }
2166 }
2167 m = freelist;
2168 freelist = m->m_next;
2169 m->m_next = NULL;
2170
2171 if ((m->m_flags & M_EXT))
2172 mlen = m->m_ext.ext_size -
2173 m_leadingspace(m);
2174 else if ((m->m_flags & M_PKTHDR))
2175 mlen =
2176 MHLEN - m_leadingspace(m);
2177 else
2178 mlen = MLEN - m_leadingspace(m);
2179 len = imin(mlen, bytes_to_copy);
2180
2181 chainlength += len;
2182
2183 space -= len;
2184
2185 error = uiomove(mtod(m, caddr_t),
2186 len, uio);
2187
2188 resid = uio_resid(uio);
2189
2190 m->m_len = len;
2191 *mp = m;
2192 top->m_pkthdr.len += len;
2193 if (error)
2194 break;
2195 mp = &m->m_next;
2196 if (resid <= 0) {
2197 if (flags & MSG_EOR)
2198 top->m_flags |= M_EOR;
2199 break;
2200 }
2201 bytes_to_copy = min(resid, space);
2202
2203 } while (space > 0 &&
2204 (chainlength < sosendmaxchain || atomic ||
2205 resid < MINCLSIZE));
2206
2207 socket_lock(so, 0);
2208
2209 if (error)
2210 goto release;
2211 }
2212
2213 if (flags & (MSG_HOLD|MSG_SEND)) {
2214 /* Enqueue for later, go away if HOLD */
2215 struct mbuf *mb1;
2216 if (so->so_temp && (flags & MSG_FLUSH)) {
2217 m_freem(so->so_temp);
2218 so->so_temp = NULL;
2219 }
2220 if (so->so_temp)
2221 so->so_tail->m_next = top;
2222 else
2223 so->so_temp = top;
2224 mb1 = top;
2225 while (mb1->m_next)
2226 mb1 = mb1->m_next;
2227 so->so_tail = mb1;
2228 if (flags & MSG_HOLD) {
2229 top = NULL;
2230 goto release;
2231 }
2232 top = so->so_temp;
2233 }
2234 if (dontroute)
2235 so->so_options |= SO_DONTROUTE;
2236
2237 /*
2238 * Compute flags here, for pru_send and NKEs
2239 *
2240 * If the user set MSG_EOF, the protocol
2241 * understands this flag, and there is nothing left
2242 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2243 */
2244 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2245 ((flags & MSG_EOF) &&
2246 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2247 (resid <= 0)) ? PRUS_EOF :
2248 /* If there is more to send set PRUS_MORETOCOME */
2249 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2250
2251 if ((flags & MSG_SKIPCFIL) == 0) {
2252 /*
2253 * Socket filter processing
2254 */
2255 error = sflt_data_out(so, addr, &top,
2256 &control, (sendflags & MSG_OOB) ?
2257 sock_data_filt_flag_oob : 0);
2258 if (error) {
2259 if (error == EJUSTRETURN) {
2260 error = 0;
2261 clen = 0;
2262 control = NULL;
2263 top = NULL;
2264 }
2265 goto release;
2266 }
2267 #if CONTENT_FILTER
2268 /*
2269 * Content filter processing
2270 */
2271 error = cfil_sock_data_out(so, addr, top,
2272 control, (sendflags & MSG_OOB) ?
2273 sock_data_filt_flag_oob : 0);
2274 if (error) {
2275 if (error == EJUSTRETURN) {
2276 error = 0;
2277 clen = 0;
2278 control = NULL;
2279 top = NULL;
2280 }
2281 goto release;
2282 }
2283 #endif /* CONTENT_FILTER */
2284 }
2285 if (so->so_flags & SOF_ENABLE_MSGS) {
2286 /*
2287 * Make a copy of control mbuf,
2288 * so that msg priority can be
2289 * passed to subsequent mbufs.
2290 */
2291 control_copy = m_dup(control, M_NOWAIT);
2292 }
2293 error = (*so->so_proto->pr_usrreqs->pru_send)
2294 (so, sendflags, top, addr, control, p);
2295
2296 if (flags & MSG_SEND)
2297 so->so_temp = NULL;
2298
2299 if (dontroute)
2300 so->so_options &= ~SO_DONTROUTE;
2301
2302 clen = 0;
2303 control = control_copy;
2304 control_copy = NULL;
2305 top = NULL;
2306 mp = &top;
2307 if (error)
2308 goto release;
2309 } while (resid && space > 0);
2310 } while (resid);
2311
2312 release:
2313 if (sblocked)
2314 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2315 else
2316 socket_unlock(so, 1);
2317 out:
2318 if (top != NULL)
2319 m_freem(top);
2320 if (control != NULL)
2321 m_freem(control);
2322 if (freelist != NULL)
2323 m_freem_list(freelist);
2324 if (control_copy != NULL)
2325 m_freem(control_copy);
2326
2327 /*
2328 * One write has been done. This was enough. Get back to "normal"
2329 * behavior.
2330 */
2331 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2332 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2333
2334 if (en_tracing) {
2335 /* resid passed here is the bytes left in uio */
2336 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2337 VM_KERNEL_ADDRPERM(so),
2338 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2339 (int64_t)(orig_resid - resid));
2340 }
2341 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2342 so->so_snd.sb_cc, space, error);
2343
2344 return (error);
2345 }
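/*
 * Illustrative userspace sketch (an assumption for documentation only; not
 * part of this file or the kernel build): on a non-blocking stream socket a
 * caller must be prepared for short writes, since sosend() copies only as
 * much as fits in the send buffer before returning.  The loop below shows
 * one common pattern over write(2); fd is assumed to be a connected,
 * non-blocking TCP socket.
 */
#if 0
#include <sys/types.h>
#include <errno.h>
#include <unistd.h>

static ssize_t
write_fully(int fd, const char *buf, size_t len)
{
	size_t off = 0;

	while (off < len) {
		ssize_t n = write(fd, buf + off, len - off);

		if (n > 0) {
			off += n;		/* short write: keep going */
		} else if (n == -1 && errno == EWOULDBLOCK) {
			/* Send buffer full; wait with poll()/kqueue first */
			break;
		} else if (n == -1 && errno == EINTR) {
			continue;		/* interrupted, retry */
		} else {
			return (-1);		/* EPIPE, ECONNRESET, ... */
		}
	}
	return ((ssize_t)off);
}
#endif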
2346
2347 /*
2348 * Supports only connected sockets (no address) without ancillary data
2349 * (control mbuf), for atomic protocols.
2350 */
2351 int
2352 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2353 {
2354 struct mbuf *m, *freelist = NULL;
2355 user_ssize_t len, resid;
2356 int error, dontroute, mlen;
2357 int atomic = sosendallatonce(so);
2358 int sblocked = 0;
2359 struct proc *p = current_proc();
2360 u_int uiofirst = 0;
2361 u_int uiolast = 0;
2362 struct mbuf *top = NULL;
2363 uint16_t headroom = 0;
2364 boolean_t bigcl;
2365
2366 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2367 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2368
2369 if (so->so_type != SOCK_DGRAM) {
2370 error = EINVAL;
2371 goto out;
2372 }
2373 if (atomic == 0) {
2374 error = EINVAL;
2375 goto out;
2376 }
2377 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2378 error = EPROTONOSUPPORT;
2379 goto out;
2380 }
2381 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2382 error = EINVAL;
2383 goto out;
2384 }
2385 resid = uio_array_resid(uioarray, uiocnt);
2386
2387 /*
2388 * In theory resid should be unsigned.
2389 * However, space must be signed, as it might be less than 0
2390 * if we over-committed, and we must use a signed comparison
2391 * of space and resid. On the other hand, a negative resid
2392 * causes us to loop sending 0-length segments to the protocol.
2393 *
2394 * Note: We limit resid to be a positive int value as we use
2395 * imin() to set bytes_to_copy -- radr://14558484
2396 */
2397 if (resid < 0 || resid > INT_MAX) {
2398 error = EINVAL;
2399 goto out;
2400 }
2401
2402 socket_lock(so, 1);
2403 so_update_last_owner_locked(so, p);
2404 so_update_policy(so);
2405
2406 #if NECP
2407 so_update_necp_policy(so, NULL, NULL);
2408 #endif /* NECP */
2409
2410 dontroute = (flags & MSG_DONTROUTE) &&
2411 (so->so_options & SO_DONTROUTE) == 0 &&
2412 (so->so_proto->pr_flags & PR_ATOMIC);
2413 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2414
2415 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2416 &sblocked, NULL);
2417 if (error)
2418 goto release;
2419
2420 /*
2421 * Use big 4 KB clusters when the outgoing interface does not prefer
2422 * 2 KB clusters
2423 */
2424 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2425
2426 if (soreserveheadroom != 0)
2427 headroom = so->so_pktheadroom;
2428
2429 do {
2430 int i;
2431 int num_needed = 0;
2432 int chainlength;
2433 size_t maxpktlen = 0;
2434 int bytes_to_alloc;
2435
2436 if (sosendminchain > 0)
2437 chainlength = 0;
2438 else
2439 chainlength = sosendmaxchain;
2440
2441 socket_unlock(so, 0);
2442
2443 /*
2444 * Find a set of uios that fits in a reasonable number
2445 * of mbuf packets.
2446 */
2447 for (i = uiofirst; i < uiocnt; i++) {
2448 struct uio *auio = uioarray[i];
2449
2450 len = uio_resid(auio);
2451
2452 /* Do nothing for empty messages */
2453 if (len == 0)
2454 continue;
2455
2456 num_needed += 1;
2457 uiolast += 1;
2458
2459 if (len > maxpktlen)
2460 maxpktlen = len;
2461
2462 chainlength += len;
2463 if (chainlength > sosendmaxchain)
2464 break;
2465 }
2466 /*
2467 * Nothing left to send
2468 */
2469 if (num_needed == 0) {
2470 socket_lock(so, 0);
2471 break;
2472 }
2473 /*
2474 * Allocate a buffer large enough to include headroom space
2475 * for the network and link headers.
2476 *
2477 */
2478 bytes_to_alloc = maxpktlen + headroom;
2479
2480 /*
2481 * Allocate a single contiguous buffer of the smallest available
2482 * size when possible
2483 */
2484 if (bytes_to_alloc > MCLBYTES &&
2485 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2486 freelist = m_getpackets_internal(
2487 (unsigned int *)&num_needed,
2488 num_needed, M_WAIT, 1,
2489 MBIGCLBYTES);
2490 } else if (bytes_to_alloc > _MHLEN &&
2491 bytes_to_alloc <= MCLBYTES) {
2492 freelist = m_getpackets_internal(
2493 (unsigned int *)&num_needed,
2494 num_needed, M_WAIT, 1,
2495 MCLBYTES);
2496 } else {
2497 freelist = m_allocpacket_internal(
2498 (unsigned int *)&num_needed,
2499 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2500 }
2501
2502 if (freelist == NULL) {
2503 socket_lock(so, 0);
2504 error = ENOMEM;
2505 goto release;
2506 }
2507 /*
2508 * Copy each uio of the set into its own mbuf packet
2509 */
2510 for (i = uiofirst, m = freelist;
2511 i < uiolast && m != NULL;
2512 i++) {
2513 int bytes_to_copy;
2514 struct mbuf *n;
2515 struct uio *auio = uioarray[i];
2516
2517 bytes_to_copy = uio_resid(auio);
2518
2519 /* Do nothing for empty messages */
2520 if (bytes_to_copy == 0)
2521 continue;
2522 /*
2523 * Leave headroom for protocol headers
2524 * in the first mbuf of the chain
2525 */
2526 m->m_data += headroom;
2527
2528 for (n = m; n != NULL; n = n->m_next) {
2529 if ((m->m_flags & M_EXT))
2530 mlen = m->m_ext.ext_size -
2531 m_leadingspace(m);
2532 else if ((m->m_flags & M_PKTHDR))
2533 mlen =
2534 MHLEN - m_leadingspace(m);
2535 else
2536 mlen = MLEN - m_leadingspace(m);
2537 len = imin(mlen, bytes_to_copy);
2538
2539 /*
2540 * Note: uiomove() decrements the iovec
2541 * length
2542 */
2543 error = uiomove(mtod(n, caddr_t),
2544 len, auio);
2545 if (error != 0)
2546 break;
2547 n->m_len = len;
2548 m->m_pkthdr.len += len;
2549
2550 VERIFY(m->m_pkthdr.len <= maxpktlen);
2551
2552 bytes_to_copy -= len;
2553 resid -= len;
2554 }
2555 if (m->m_pkthdr.len == 0) {
2556 printf(
2557 "%s:%d so %llx pkt %llx type %u len null\n",
2558 __func__, __LINE__,
2559 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2560 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2561 m->m_type);
2562 }
2563 if (error != 0)
2564 break;
2565 m = m->m_nextpkt;
2566 }
2567
2568 socket_lock(so, 0);
2569
2570 if (error)
2571 goto release;
2572 top = freelist;
2573 freelist = NULL;
2574
2575 if (dontroute)
2576 so->so_options |= SO_DONTROUTE;
2577
2578 if ((flags & MSG_SKIPCFIL) == 0) {
2579 struct mbuf **prevnextp = NULL;
2580
2581 for (i = uiofirst, m = top;
2582 i < uiolast && m != NULL;
2583 i++) {
2584 struct mbuf *nextpkt = m->m_nextpkt;
2585
2586 /*
2587 * Socket filter processing
2588 */
2589 error = sflt_data_out(so, NULL, &m,
2590 NULL, 0);
2591 if (error != 0 && error != EJUSTRETURN)
2592 goto release;
2593
2594 #if CONTENT_FILTER
2595 if (error == 0) {
2596 /*
2597 * Content filter processing
2598 */
2599 error = cfil_sock_data_out(so, NULL, m,
2600 NULL, 0);
2601 if (error != 0 && error != EJUSTRETURN)
2602 goto release;
2603 }
2604 #endif /* CONTENT_FILTER */
2605 /*
2606 * Remove packet from the list when
2607 * swallowed by a filter
2608 */
2609 if (error == EJUSTRETURN) {
2610 error = 0;
2611 if (prevnextp != NULL)
2612 *prevnextp = nextpkt;
2613 else
2614 top = nextpkt;
2615 }
2616
2617 m = nextpkt;
2618 if (m != NULL)
2619 prevnextp = &m->m_nextpkt;
2620 }
2621 }
2622 if (top != NULL)
2623 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2624 (so, 0, top, NULL, NULL, p);
2625
2626 if (dontroute)
2627 so->so_options &= ~SO_DONTROUTE;
2628
2629 top = NULL;
2630 uiofirst = uiolast;
2631 } while (resid > 0 && error == 0);
2632 release:
2633 if (sblocked)
2634 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2635 else
2636 socket_unlock(so, 1);
2637 out:
2638 if (top != NULL)
2639 m_freem(top);
2640 if (freelist != NULL)
2641 m_freem_list(freelist);
2642
2643 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2644 so->so_snd.sb_cc, 0, error);
2645
2646 return (error);
2647 }
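/*
 * Illustrative userspace sketch (an assumption for documentation only; not
 * part of this file or the kernel build): sosend_list() handles a batch of
 * datagrams for a connected SOCK_DGRAM socket with no per-packet address or
 * control data.  It is assumed here to back a batched send path that is not
 * a public interface; a portable equivalent is simply a loop of send(2)
 * calls on a connected UDP socket, as sketched below.
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>
#include <errno.h>

/* Send each buffer in bufs[] as one datagram; returns packets sent. */
static int
send_batch(int s, const struct iovec *bufs, int cnt)
{
	int i;

	for (i = 0; i < cnt; i++) {
		if (send(s, bufs[i].iov_base, bufs[i].iov_len, 0) == -1) {
			if (errno == EWOULDBLOCK)
				break;	/* send buffer filled up */
			return (-1);
		}
	}
	return (i);
}
#endif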
2648
2649 /*
2650 * May return ERESTART when packet is dropped by MAC policy check
2651 */
2652 static int
2653 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2654 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2655 {
2656 int error = 0;
2657 struct mbuf *m = *mp;
2658 struct mbuf *nextrecord = *nextrecordp;
2659
2660 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2661 #if CONFIG_MACF_SOCKET_SUBSET
2662 /*
2663 * Call the MAC framework for policy checking if we're in
2664 * the user process context and the socket isn't connected.
2665 */
2666 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2667 struct mbuf *m0 = m;
2668 /*
2669 * Dequeue this record (temporarily) from the receive
2670 * list since we're about to drop the socket's lock
2671 * where a new record may arrive and be appended to
2672 * the list. Upon MAC policy failure, the record
2673 * will be freed. Otherwise, we'll add it back to
2674 * the head of the list. We cannot rely on SB_LOCK
2675 * because append operation uses the socket's lock.
2676 */
2677 do {
2678 m->m_nextpkt = NULL;
2679 sbfree(&so->so_rcv, m);
2680 m = m->m_next;
2681 } while (m != NULL);
2682 m = m0;
2683 so->so_rcv.sb_mb = nextrecord;
2684 SB_EMPTY_FIXUP(&so->so_rcv);
2685 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2686 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2687 socket_unlock(so, 0);
2688
2689 if (mac_socket_check_received(proc_ucred(p), so,
2690 mtod(m, struct sockaddr *)) != 0) {
2691 /*
2692 * MAC policy failure; free this record and
2693 * process the next record (or block until
2694 * one is available). We have adjusted sb_cc
2695 * and sb_mbcnt above so there is no need to
2696 * call sbfree() again.
2697 */
2698 m_freem(m);
2699 /*
2700 * Clear SB_LOCK but don't unlock the socket.
2701 * Process the next record or wait for one.
2702 */
2703 socket_lock(so, 0);
2704 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2705 error = ERESTART;
2706 goto done;
2707 }
2708 socket_lock(so, 0);
2709 /*
2710 * If the socket has been defunct'd, drop it.
2711 */
2712 if (so->so_flags & SOF_DEFUNCT) {
2713 m_freem(m);
2714 error = ENOTCONN;
2715 goto done;
2716 }
2717 /*
2718 * Re-adjust the socket receive list and re-enqueue
2719 * the record in front of any packets which may have
2720 * been appended while we dropped the lock.
2721 */
2722 for (m = m0; m->m_next != NULL; m = m->m_next)
2723 sballoc(&so->so_rcv, m);
2724 sballoc(&so->so_rcv, m);
2725 if (so->so_rcv.sb_mb == NULL) {
2726 so->so_rcv.sb_lastrecord = m0;
2727 so->so_rcv.sb_mbtail = m;
2728 }
2729 m = m0;
2730 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2731 so->so_rcv.sb_mb = m;
2732 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2733 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2734 }
2735 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2736 if (psa != NULL) {
2737 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
2738 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2739 error = EWOULDBLOCK;
2740 goto done;
2741 }
2742 }
2743 if (flags & MSG_PEEK) {
2744 m = m->m_next;
2745 } else {
2746 sbfree(&so->so_rcv, m);
2747 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2748 panic("%s: about to create invalid socketbuf",
2749 __func__);
2750 /* NOTREACHED */
2751 }
2752 MFREE(m, so->so_rcv.sb_mb);
2753 m = so->so_rcv.sb_mb;
2754 if (m != NULL) {
2755 m->m_nextpkt = nextrecord;
2756 } else {
2757 so->so_rcv.sb_mb = nextrecord;
2758 SB_EMPTY_FIXUP(&so->so_rcv);
2759 }
2760 }
2761 done:
2762 *mp = m;
2763 *nextrecordp = nextrecord;
2764
2765 return (error);
2766 }
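/*
 * Illustrative userspace sketch (an assumption for documentation only; not
 * part of this file or the kernel build): the MT_SONAME mbuf consumed above
 * is what recvfrom(2) hands back as the datagram's source address.
 */
#if 0
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <stdio.h>

static void
recvfrom_example(int s)
{
	char buf[2048];
	struct sockaddr_in from;
	socklen_t fromlen = sizeof (from);
	ssize_t n;

	n = recvfrom(s, buf, sizeof (buf), 0,
	    (struct sockaddr *)&from, &fromlen);
	if (n >= 0) {
		char addr[INET_ADDRSTRLEN];

		inet_ntop(AF_INET, &from.sin_addr, addr, sizeof (addr));
		printf("%zd bytes from %s:%u\n", n, addr,
		    ntohs(from.sin_port));
	}
}
#endif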
2767
2768 /*
2769 * Process one or more MT_CONTROL mbufs present before any data mbufs
2770 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2771 * just copy the data; if !MSG_PEEK, we call into the protocol to
2772 * perform externalization.
2773 */
2774 static int
2775 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2776 struct mbuf **mp, struct mbuf **nextrecordp)
2777 {
2778 int error = 0;
2779 struct mbuf *cm = NULL, *cmn;
2780 struct mbuf **cme = &cm;
2781 struct sockbuf *sb_rcv = &so->so_rcv;
2782 struct mbuf **msgpcm = NULL;
2783 struct mbuf *m = *mp;
2784 struct mbuf *nextrecord = *nextrecordp;
2785 struct protosw *pr = so->so_proto;
2786
2787 /*
2788 * Externalizing the control messages would require us to
2789 * drop the socket's lock below. Once we re-acquire the
2790 * lock, the mbuf chain might change. In order to preserve
2791 * consistency, we unlink all control messages from the
2792 * first mbuf chain in one shot and link them separately
2793 * onto a different chain.
2794 */
2795 do {
2796 if (flags & MSG_PEEK) {
2797 if (controlp != NULL) {
2798 if (*controlp == NULL) {
2799 msgpcm = controlp;
2800 }
2801 *controlp = m_copy(m, 0, m->m_len);
2802
2803 /*
2804 * If we failed to allocate an mbuf,
2805 * release any previously allocated
2806 * mbufs for control data. Return
2807 * an error. Keep the mbufs in the
2808 * socket as this is using the
2809 * MSG_PEEK flag.
2810 */
2811 if (*controlp == NULL) {
2812 m_freem(*msgpcm);
2813 error = ENOBUFS;
2814 goto done;
2815 }
2816 controlp = &(*controlp)->m_next;
2817 }
2818 m = m->m_next;
2819 } else {
2820 m->m_nextpkt = NULL;
2821 sbfree(sb_rcv, m);
2822 sb_rcv->sb_mb = m->m_next;
2823 m->m_next = NULL;
2824 *cme = m;
2825 cme = &(*cme)->m_next;
2826 m = sb_rcv->sb_mb;
2827 }
2828 } while (m != NULL && m->m_type == MT_CONTROL);
2829
2830 if (!(flags & MSG_PEEK)) {
2831 if (sb_rcv->sb_mb != NULL) {
2832 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2833 } else {
2834 sb_rcv->sb_mb = nextrecord;
2835 SB_EMPTY_FIXUP(sb_rcv);
2836 }
2837 if (nextrecord == NULL)
2838 sb_rcv->sb_lastrecord = m;
2839 }
2840
2841 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2842 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2843
2844 while (cm != NULL) {
2845 int cmsg_type;
2846
2847 cmn = cm->m_next;
2848 cm->m_next = NULL;
2849 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2850
2851 /*
2852 * Call the protocol to externalize the SCM_RIGHTS message
2853 * and return the modified message to the caller upon
2854 * success. Otherwise, all other control messages are
2855 * returned unmodified to the caller. Note that we
2856 * only get into this loop if MSG_PEEK is not set.
2857 */
2858 if (pr->pr_domain->dom_externalize != NULL &&
2859 cmsg_type == SCM_RIGHTS) {
2860 /*
2861 * Release socket lock: see 3903171. This
2862 * would also allow more records to be appended
2863 * to the socket buffer. We still have SB_LOCK
2864 * set on it, so we can be sure that the head
2865 * of the mbuf chain won't change.
2866 */
2867 socket_unlock(so, 0);
2868 error = (*pr->pr_domain->dom_externalize)(cm);
2869 socket_lock(so, 0);
2870 } else {
2871 error = 0;
2872 }
2873
2874 if (controlp != NULL && error == 0) {
2875 *controlp = cm;
2876 controlp = &(*controlp)->m_next;
2877 } else {
2878 (void) m_free(cm);
2879 }
2880 cm = cmn;
2881 }
2882 /*
2883 * Update the value of nextrecord in case we received new
2884 * records when the socket was unlocked above for
2885 * externalizing SCM_RIGHTS.
2886 */
2887 if (m != NULL)
2888 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2889 else
2890 nextrecord = sb_rcv->sb_mb;
2891
2892 done:
2893 *mp = m;
2894 *nextrecordp = nextrecord;
2895
2896 return (error);
2897 }
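/*
 * Illustrative userspace sketch (an assumption for documentation only; not
 * part of this file or the kernel build): the dom_externalize() call above
 * is what turns an SCM_RIGHTS message on an AF_UNIX socket into real file
 * descriptors in the receiver.  A receiver walks the control data with the
 * standard CMSG_* macros:
 */
#if 0
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

/* Receive one byte of data plus a single passed descriptor; -1 on failure. */
static int
recv_fd(int s)
{
	char data;
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union {
		struct cmsghdr hdr;
		char buf[CMSG_SPACE(sizeof (int))];
	} ctl;
	struct msghdr msg;
	struct cmsghdr *cmsg;

	memset(&msg, 0, sizeof (msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = ctl.buf;
	msg.msg_controllen = sizeof (ctl.buf);

	if (recvmsg(s, &msg, 0) == -1)
		return (-1);

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			int fd;

			memcpy(&fd, CMSG_DATA(cmsg), sizeof (fd));
			return (fd);
		}
	}
	return (-1);
}
#endif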
2898
2899 /*
2900 * Implement receive operations on a socket.
2901 * We depend on the way that records are added to the sockbuf
2902 * by sbappend*. In particular, each record (mbufs linked through m_next)
2903 * must begin with an address if the protocol so specifies,
2904 * followed by an optional mbuf or mbufs containing ancillary data,
2905 * and then zero or more mbufs of data.
2906 * In order to avoid blocking network interrupts for the entire time here,
2907 * we splx() while doing the actual copy to user space.
2908 * Although the sockbuf is locked, new data may still be appended,
2909 * and thus we must maintain consistency of the sockbuf during that time.
2910 *
2911 * The caller may receive the data as a single mbuf chain by supplying
2912 * an mbuf **mp0 for use in returning the chain. The uio is then used
2913 * only for the count in uio_resid.
2914 *
2915 * Returns: 0 Success
2916 * ENOBUFS
2917 * ENOTCONN
2918 * EWOULDBLOCK
2919 * uiomove:EFAULT
2920 * sblock:EWOULDBLOCK
2921 * sblock:EINTR
2922 * sbwait:EBADF
2923 * sbwait:EINTR
2924 * sodelayed_copy:EFAULT
2925 * <pru_rcvoob>:EINVAL[TCP]
2926 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2927 * <pru_rcvoob>:???
2928 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2929 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2930 * <pr_domain->dom_externalize>:???
2931 *
2932 * Notes: Additional return values from calls through <pru_rcvoob> and
2933 * <pr_domain->dom_externalize> depend on protocols other than
2934 * TCP or AF_UNIX, which are documented above.
2935 */
2936 int
2937 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2938 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2939 {
2940 struct mbuf *m, **mp, *ml = NULL;
2941 struct mbuf *nextrecord, *free_list;
2942 int flags, error, offset;
2943 user_ssize_t len;
2944 struct protosw *pr = so->so_proto;
2945 int moff, type = 0;
2946 user_ssize_t orig_resid = uio_resid(uio);
2947 user_ssize_t delayed_copy_len;
2948 int can_delay;
2949 int need_event;
2950 struct proc *p = current_proc();
2951 boolean_t en_tracing = FALSE;
2952
2953 /*
2954 * Sanity check on the length passed by caller as we are making 'int'
2955 * comparisons
2956 */
2957 if (orig_resid < 0 || orig_resid > INT_MAX)
2958 return (EINVAL);
2959
2960 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
2961 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
2962 so->so_rcv.sb_hiwat);
2963
2964 socket_lock(so, 1);
2965 so_update_last_owner_locked(so, p);
2966 so_update_policy(so);
2967
2968 #ifdef MORE_LOCKING_DEBUG
2969 if (so->so_usecount == 1) {
2970 panic("%s: so=%x no other reference on socket\n", __func__, so);
2971 /* NOTREACHED */
2972 }
2973 #endif
2974 mp = mp0;
2975 if (psa != NULL)
2976 *psa = NULL;
2977 if (controlp != NULL)
2978 *controlp = NULL;
2979 if (flagsp != NULL)
2980 flags = *flagsp &~ MSG_EOR;
2981 else
2982 flags = 0;
2983
2984 /*
2985 * If a recv attempt is made on a previously-accepted socket
2986 * that has been marked as inactive (disconnected), reject
2987 * the request.
2988 */
2989 if (so->so_flags & SOF_DEFUNCT) {
2990 struct sockbuf *sb = &so->so_rcv;
2991
2992 error = ENOTCONN;
2993 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2994 __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2995 SOCK_DOM(so), SOCK_TYPE(so), error));
2996 /*
2997 * This socket should have been disconnected and flushed
2998 * prior to being returned from sodefunct(); there should
2999 * be no data on its receive list, so panic otherwise.
3000 */
3001 if (so->so_state & SS_DEFUNCT)
3002 sb_empty_assert(sb, __func__);
3003 socket_unlock(so, 1);
3004 return (error);
3005 }
3006
3007 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3008 pr->pr_usrreqs->pru_preconnect) {
3009 /*
3010 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3011 * call write() right after this. *If* the app calls a read,
3012 * we do not want to block this read indefinitely. Thus,
3013 * we trigger a connect so that the session gets initiated.
3014 */
3015 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3016
3017 if (error) {
3018 socket_unlock(so, 1);
3019 return (error);
3020 }
3021 }
3022
3023 if (ENTR_SHOULDTRACE &&
3024 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3025 /*
3026 * enable energy tracing for inet sockets that go over
3027 * non-loopback interfaces only.
3028 */
3029 struct inpcb *inp = sotoinpcb(so);
3030 if (inp->inp_last_outifp != NULL &&
3031 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3032 en_tracing = TRUE;
3033 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3034 VM_KERNEL_ADDRPERM(so),
3035 ((so->so_state & SS_NBIO) ?
3036 kEnTrFlagNonBlocking : 0),
3037 (int64_t)orig_resid);
3038 }
3039 }
3040
3041 /*
3042 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3043 * regardless of the flags argument. Here is the case where
3044 * out-of-band data is not inline.
3045 */
3046 if ((flags & MSG_OOB) ||
3047 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3048 (so->so_options & SO_OOBINLINE) == 0 &&
3049 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3050 m = m_get(M_WAIT, MT_DATA);
3051 if (m == NULL) {
3052 socket_unlock(so, 1);
3053 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3054 ENOBUFS, 0, 0, 0, 0);
3055 return (ENOBUFS);
3056 }
3057 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3058 if (error)
3059 goto bad;
3060 socket_unlock(so, 0);
3061 do {
3062 error = uiomove(mtod(m, caddr_t),
3063 imin(uio_resid(uio), m->m_len), uio);
3064 m = m_free(m);
3065 } while (uio_resid(uio) && error == 0 && m != NULL);
3066 socket_lock(so, 0);
3067 bad:
3068 if (m != NULL)
3069 m_freem(m);
3070
3071 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3072 if (error == EWOULDBLOCK || error == EINVAL) {
3073 /*
3074 * Let's try to get normal data:
3075 * EWOULDBLOCK: out-of-band data not
3076 * received yet. EINVAL: out-of-band data
3077 * already read.
3078 */
3079 error = 0;
3080 goto nooob;
3081 } else if (error == 0 && flagsp != NULL) {
3082 *flagsp |= MSG_OOB;
3083 }
3084 }
3085 socket_unlock(so, 1);
3086 if (en_tracing) {
3087 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3088 VM_KERNEL_ADDRPERM(so), 0,
3089 (int64_t)(orig_resid - uio_resid(uio)));
3090 }
3091 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3092 0, 0, 0, 0);
3093
3094 return (error);
3095 }
3096 nooob:
3097 if (mp != NULL)
3098 *mp = NULL;
3099
3100 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3101 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3102 }
3103
3104 free_list = NULL;
3105 delayed_copy_len = 0;
3106 restart:
3107 #ifdef MORE_LOCKING_DEBUG
3108 if (so->so_usecount <= 1)
3109 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3110 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3111 #endif
3112 /*
3113 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3114 * and if so just return to the caller. This could happen when
3115 * soreceive() is called by a socket upcall function during the
3116 * time the socket is freed. The socket buffer would have been
3117 * locked across the upcall, therefore we cannot put this thread
3118 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3119 * we may livelock), because the lock on the socket buffer will
3120 * only be released when the upcall routine returns to its caller.
3121 * Because the socket has been officially closed, there can be
3122 * no further read on it.
3123 *
3124 * A multipath subflow socket would have its SS_NOFDREF set by
3125 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3126 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3127 */
3128 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3129 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3130 socket_unlock(so, 1);
3131 return (0);
3132 }
3133
3134 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3135 if (error) {
3136 socket_unlock(so, 1);
3137 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3138 0, 0, 0, 0);
3139 if (en_tracing) {
3140 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3141 VM_KERNEL_ADDRPERM(so), 0,
3142 (int64_t)(orig_resid - uio_resid(uio)));
3143 }
3144 return (error);
3145 }
3146
3147 m = so->so_rcv.sb_mb;
3148 /*
3149 * If we have less data than requested, block awaiting more
3150 * (subject to any timeout) if:
3151 * 1. the current count is less than the low water mark, or
3152 * 2. MSG_WAITALL is set, and it is possible to do the entire
3153 * receive operation at once if we block (resid <= hiwat).
3154 * 3. MSG_DONTWAIT is not set
3155 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3156 * we have to do the receive in sections, and thus risk returning
3157 * a short count if a timeout or signal occurs after we start.
3158 */
3159 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3160 so->so_rcv.sb_cc < uio_resid(uio)) &&
3161 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3162 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3163 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3164 /*
3165 * Panic if we notice inconsistencies in the socket's
3166 * receive list; both sb_mb and sb_cc should correctly
3167 * reflect the contents of the list, otherwise we may
3168 * end up with false positives during select() or poll()
3169 * which could put the application in a bad state.
3170 */
3171 SB_MB_CHECK(&so->so_rcv);
3172
3173 if (so->so_error) {
3174 if (m != NULL)
3175 goto dontblock;
3176 error = so->so_error;
3177 if ((flags & MSG_PEEK) == 0)
3178 so->so_error = 0;
3179 goto release;
3180 }
3181 if (so->so_state & SS_CANTRCVMORE) {
3182 #if CONTENT_FILTER
3183 /*
3184 * Deal with half closed connections
3185 */
3186 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3187 cfil_sock_data_pending(&so->so_rcv) != 0)
3188 CFIL_LOG(LOG_INFO,
3189 "so %llx ignore SS_CANTRCVMORE",
3190 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3191 else
3192 #endif /* CONTENT_FILTER */
3193 if (m != NULL)
3194 goto dontblock;
3195 else
3196 goto release;
3197 }
3198 for (; m != NULL; m = m->m_next)
3199 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3200 m = so->so_rcv.sb_mb;
3201 goto dontblock;
3202 }
3203 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3204 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3205 error = ENOTCONN;
3206 goto release;
3207 }
3208 if (uio_resid(uio) == 0)
3209 goto release;
3210
3211 if ((so->so_state & SS_NBIO) ||
3212 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3213 error = EWOULDBLOCK;
3214 goto release;
3215 }
3216 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3217 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3218 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3219 #if EVEN_MORE_LOCKING_DEBUG
3220 if (socket_debug)
3221 printf("Waiting for socket data\n");
3222 #endif
3223
3224 error = sbwait(&so->so_rcv);
3225 #if EVEN_MORE_LOCKING_DEBUG
3226 if (socket_debug)
3227 printf("SORECEIVE - sbwait returned %d\n", error);
3228 #endif
3229 if (so->so_usecount < 1) {
3230 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3231 __func__, so, so->so_usecount);
3232 /* NOTREACHED */
3233 }
3234 if (error) {
3235 socket_unlock(so, 1);
3236 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3237 0, 0, 0, 0);
3238 if (en_tracing) {
3239 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3240 VM_KERNEL_ADDRPERM(so), 0,
3241 (int64_t)(orig_resid - uio_resid(uio)));
3242 }
3243 return (error);
3244 }
3245 goto restart;
3246 }
3247 dontblock:
3248 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3249 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3250 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3251 nextrecord = m->m_nextpkt;
3252
3253 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3254 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3255 mp0 == NULL);
3256 if (error == ERESTART)
3257 goto restart;
3258 else if (error != 0)
3259 goto release;
3260 orig_resid = 0;
3261 }
3262
3263 /*
3264 * Process one or more MT_CONTROL mbufs present before any data mbufs
3265 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3266 * just copy the data; if !MSG_PEEK, we call into the protocol to
3267 * perform externalization.
3268 */
3269 if (m != NULL && m->m_type == MT_CONTROL) {
3270 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3271 if (error != 0)
3272 goto release;
3273 orig_resid = 0;
3274 }
3275
3276 /*
3277 * If the socket is a TCP socket with message delivery
3278 * enabled, then create a control msg to deliver the
3279 * relative TCP sequence number for this data. Waiting
3280 * until this point will protect against failures to
3281 * allocate an mbuf for control msgs.
3282 */
3283 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3284 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3285 struct mbuf *seq_cm;
3286
3287 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3288 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3289 if (seq_cm == NULL) {
3290 /* unable to allocate a control mbuf */
3291 error = ENOBUFS;
3292 goto release;
3293 }
3294 *controlp = seq_cm;
3295 controlp = &seq_cm->m_next;
3296 }
3297
3298 if (m != NULL) {
3299 if (!(flags & MSG_PEEK)) {
3300 /*
3301 * We get here because m points to an mbuf following
3302 * any MT_SONAME or MT_CONTROL mbufs which have been
3303 * processed above. In any case, m should be pointing
3304 * to the head of the mbuf chain, and the nextrecord
3305 * should be either NULL or equal to m->m_nextpkt.
3306 * See comments above about SB_LOCK.
3307 */
3308 if (m != so->so_rcv.sb_mb ||
3309 m->m_nextpkt != nextrecord) {
3310 panic("%s: post-control !sync so=%p m=%p "
3311 "nextrecord=%p\n", __func__, so, m,
3312 nextrecord);
3313 /* NOTREACHED */
3314 }
3315 if (nextrecord == NULL)
3316 so->so_rcv.sb_lastrecord = m;
3317 }
3318 type = m->m_type;
3319 if (type == MT_OOBDATA)
3320 flags |= MSG_OOB;
3321 } else {
3322 if (!(flags & MSG_PEEK)) {
3323 SB_EMPTY_FIXUP(&so->so_rcv);
3324 }
3325 }
3326 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3327 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3328
3329 moff = 0;
3330 offset = 0;
3331
3332 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3333 can_delay = 1;
3334 else
3335 can_delay = 0;
3336
3337 need_event = 0;
3338
3339 while (m != NULL &&
3340 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3341 if (m->m_type == MT_OOBDATA) {
3342 if (type != MT_OOBDATA)
3343 break;
3344 } else if (type == MT_OOBDATA) {
3345 break;
3346 }
3347 /*
3348 * Make sure to always set the MSG_OOB flag when getting
3349 * out-of-band data inline.
3350 */
3351 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3352 (so->so_options & SO_OOBINLINE) != 0 &&
3353 (so->so_state & SS_RCVATMARK) != 0) {
3354 flags |= MSG_OOB;
3355 }
3356 so->so_state &= ~SS_RCVATMARK;
3357 len = uio_resid(uio) - delayed_copy_len;
3358 if (so->so_oobmark && len > so->so_oobmark - offset)
3359 len = so->so_oobmark - offset;
3360 if (len > m->m_len - moff)
3361 len = m->m_len - moff;
3362 /*
3363 * If mp is set, just pass back the mbufs.
3364 * Otherwise copy them out via the uio, then free.
3365 * The sockbuf must be consistent here (m points to the current
3366 * mbuf, nextrecord to the next record) when we drop priority;
3367 * we must note any additions to the sockbuf when we
3368 * block interrupts again.
3369 */
3370 if (mp == NULL) {
3371 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3372 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3373 if (can_delay && len == m->m_len) {
3374 /*
3375 * Only delay the copy if we're consuming the
3376 * mbuf and we're NOT in MSG_PEEK mode
3377 * and we have enough data to make it worthwhile
3378 * to drop and retake the lock... can_delay
3379 * reflects the state of the latter two
3380 * constraints; moff should always be zero
3381 * in these cases.
3382 */
3383 delayed_copy_len += len;
3384 } else {
3385 if (delayed_copy_len) {
3386 error = sodelayed_copy(so, uio,
3387 &free_list, &delayed_copy_len);
3388
3389 if (error) {
3390 goto release;
3391 }
3392 /*
3393 * We can only get here if MSG_PEEK is not
3394 * set; therefore, m should point at the
3395 * head of the rcv queue; if it doesn't,
3396 * it means something drastically
3397 * changed while we were out from behind
3398 * the lock in sodelayed_copy. perhaps
3399 * a RST on the stream. in any event,
3400 * the stream has been interrupted. it's
3401 * probably best just to return whatever
3402 * data we've moved and let the caller
3403 * sort it out...
3404 */
3405 if (m != so->so_rcv.sb_mb) {
3406 break;
3407 }
3408 }
3409 socket_unlock(so, 0);
3410 error = uiomove(mtod(m, caddr_t) + moff,
3411 (int)len, uio);
3412 socket_lock(so, 0);
3413
3414 if (error)
3415 goto release;
3416 }
3417 } else {
3418 uio_setresid(uio, (uio_resid(uio) - len));
3419 }
3420 if (len == m->m_len - moff) {
3421 if (m->m_flags & M_EOR)
3422 flags |= MSG_EOR;
3423 if (flags & MSG_PEEK) {
3424 m = m->m_next;
3425 moff = 0;
3426 } else {
3427 nextrecord = m->m_nextpkt;
3428 sbfree(&so->so_rcv, m);
3429 m->m_nextpkt = NULL;
3430
3431 /*
3432 * If this packet is an unordered packet
3433 * (indicated by M_UNORDERED_DATA flag), remove
3434 * the additional bytes added to the
3435 * receive socket buffer size.
3436 */
3437 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3438 m->m_len &&
3439 (m->m_flags & M_UNORDERED_DATA) &&
3440 sbreserve(&so->so_rcv,
3441 so->so_rcv.sb_hiwat - m->m_len)) {
3442 if (so->so_msg_state->msg_uno_bytes >
3443 m->m_len) {
3444 so->so_msg_state->
3445 msg_uno_bytes -= m->m_len;
3446 } else {
3447 so->so_msg_state->
3448 msg_uno_bytes = 0;
3449 }
3450 m->m_flags &= ~M_UNORDERED_DATA;
3451 }
3452
3453 if (mp != NULL) {
3454 *mp = m;
3455 mp = &m->m_next;
3456 so->so_rcv.sb_mb = m = m->m_next;
3457 *mp = NULL;
3458 } else {
3459 if (free_list == NULL)
3460 free_list = m;
3461 else
3462 ml->m_next = m;
3463 ml = m;
3464 so->so_rcv.sb_mb = m = m->m_next;
3465 ml->m_next = NULL;
3466 }
3467 if (m != NULL) {
3468 m->m_nextpkt = nextrecord;
3469 if (nextrecord == NULL)
3470 so->so_rcv.sb_lastrecord = m;
3471 } else {
3472 so->so_rcv.sb_mb = nextrecord;
3473 SB_EMPTY_FIXUP(&so->so_rcv);
3474 }
3475 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3476 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3477 }
3478 } else {
3479 if (flags & MSG_PEEK) {
3480 moff += len;
3481 } else {
3482 if (mp != NULL) {
3483 int copy_flag;
3484
3485 if (flags & MSG_DONTWAIT)
3486 copy_flag = M_DONTWAIT;
3487 else
3488 copy_flag = M_WAIT;
3489 *mp = m_copym(m, 0, len, copy_flag);
3490 /*
3491 * Failed to allocate an mbuf?
3492 * Adjust uio_resid back, it was
3493 * adjusted down by len bytes which
3494 * we didn't copy over.
3495 */
3496 if (*mp == NULL) {
3497 uio_setresid(uio,
3498 (uio_resid(uio) + len));
3499 break;
3500 }
3501 }
3502 m->m_data += len;
3503 m->m_len -= len;
3504 so->so_rcv.sb_cc -= len;
3505 }
3506 }
3507 if (so->so_oobmark) {
3508 if ((flags & MSG_PEEK) == 0) {
3509 so->so_oobmark -= len;
3510 if (so->so_oobmark == 0) {
3511 so->so_state |= SS_RCVATMARK;
3512 /*
3513 * delay posting the actual event until
3514 * after any delayed copy processing
3515 * has finished
3516 */
3517 need_event = 1;
3518 break;
3519 }
3520 } else {
3521 offset += len;
3522 if (offset == so->so_oobmark)
3523 break;
3524 }
3525 }
3526 if (flags & MSG_EOR)
3527 break;
3528 /*
3529 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3530 * (for non-atomic socket), we must not quit until
3531 * "uio->uio_resid == 0" or an error termination.
3532 * If a signal/timeout occurs, return with a short
3533 * count but without error. Keep sockbuf locked
3534 * against other readers.
3535 */
3536 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3537 (uio_resid(uio) - delayed_copy_len) > 0 &&
3538 !sosendallatonce(so) && !nextrecord) {
3539 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3540 #if CONTENT_FILTER
3541 && cfil_sock_data_pending(&so->so_rcv) == 0
3542 #endif /* CONTENT_FILTER */
3543 ))
3544 goto release;
3545
3546 /*
3547 * Depending on the protocol (e.g. TCP), the following
3548 * might cause the socket lock to be dropped and later
3549 * be reacquired, and more data could have arrived and
3550 * have been appended to the receive socket buffer by
3551 * the time it returns. Therefore, we sleep in
3552 * sbwait() below if and only if the socket buffer is
3553 * empty, in order to avoid a false sleep.
3554 */
3555 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3556 (((struct inpcb *)so->so_pcb)->inp_state !=
3557 INPCB_STATE_DEAD))
3558 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3559
3560 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3561 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3562
3563 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3564 error = 0;
3565 goto release;
3566 }
3567 /*
3568 * have to wait until after we get back from the sbwait
3569 * to do the copy because we will drop the lock if we
3570 * have enough data that has been delayed... by dropping
3571 * the lock we open up a window allowing the netisr
3572 * thread to process the incoming packets and to change
3573 * the state of this socket... we're issuing the sbwait
3574 * because the socket is empty and we're expecting the
3575 * netisr thread to wake us up when more packets arrive;
3576 * if we allow that processing to happen and then sbwait
3577 * we could stall forever with packets sitting in the
3578 * socket if no further packets arrive from the remote
3579 * side.
3580 *
3581 * we want to copy before we've collected all the data
3582 * to satisfy this request to allow the copy to overlap
3583 * the incoming packet processing on an MP system
3584 */
3585 if (delayed_copy_len > sorecvmincopy &&
3586 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3587 error = sodelayed_copy(so, uio,
3588 &free_list, &delayed_copy_len);
3589
3590 if (error)
3591 goto release;
3592 }
3593 m = so->so_rcv.sb_mb;
3594 if (m != NULL) {
3595 nextrecord = m->m_nextpkt;
3596 }
3597 SB_MB_CHECK(&so->so_rcv);
3598 }
3599 }
3600 #ifdef MORE_LOCKING_DEBUG
3601 if (so->so_usecount <= 1) {
3602 panic("%s: after big while so=%p ref=%d on socket\n",
3603 __func__, so, so->so_usecount);
3604 /* NOTREACHED */
3605 }
3606 #endif
3607
3608 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3609 if (so->so_options & SO_DONTTRUNC) {
3610 flags |= MSG_RCVMORE;
3611 } else {
3612 flags |= MSG_TRUNC;
3613 if ((flags & MSG_PEEK) == 0)
3614 (void) sbdroprecord(&so->so_rcv);
3615 }
3616 }
3617
3618 /*
3619 * pru_rcvd below (for TCP) may cause more data to be received
3620 * if the socket lock is dropped prior to sending the ACK; some
3621 * legacy OpenTransport applications don't handle this well
3622 * (if it receives less data than requested while MSG_HAVEMORE
3623 * is set), and so we set the flag now based on what we know
3624 * prior to calling pru_rcvd.
3625 */
3626 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3627 flags |= MSG_HAVEMORE;
3628
3629 if ((flags & MSG_PEEK) == 0) {
3630 if (m == NULL) {
3631 so->so_rcv.sb_mb = nextrecord;
3632 /*
3633 * First part is an inline SB_EMPTY_FIXUP(). Second
3634 * part makes sure sb_lastrecord is up-to-date if
3635 * there is still data in the socket buffer.
3636 */
3637 if (so->so_rcv.sb_mb == NULL) {
3638 so->so_rcv.sb_mbtail = NULL;
3639 so->so_rcv.sb_lastrecord = NULL;
3640 } else if (nextrecord->m_nextpkt == NULL) {
3641 so->so_rcv.sb_lastrecord = nextrecord;
3642 }
3643 SB_MB_CHECK(&so->so_rcv);
3644 }
3645 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3646 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3647 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3648 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3649 }
3650
3651 if (delayed_copy_len) {
3652 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3653 if (error)
3654 goto release;
3655 }
3656 if (free_list != NULL) {
3657 m_freem_list(free_list);
3658 free_list = NULL;
3659 }
3660 if (need_event)
3661 postevent(so, 0, EV_OOB);
3662
3663 if (orig_resid == uio_resid(uio) && orig_resid &&
3664 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3665 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3666 goto restart;
3667 }
3668
3669 if (flagsp != NULL)
3670 *flagsp |= flags;
3671 release:
3672 #ifdef MORE_LOCKING_DEBUG
3673 if (so->so_usecount <= 1) {
3674 panic("%s: release so=%p ref=%d on socket\n", __func__,
3675 so, so->so_usecount);
3676 /* NOTREACHED */
3677 }
3678 #endif
3679 if (delayed_copy_len)
3680 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3681
3682 if (free_list != NULL)
3683 m_freem_list(free_list);
3684
3685 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3686
3687 if (en_tracing) {
3688 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3689 VM_KERNEL_ADDRPERM(so),
3690 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3691 (int64_t)(orig_resid - uio_resid(uio)));
3692 }
3693 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3694 so->so_rcv.sb_cc, 0, error);
3695
3696 return (error);
3697 }
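/*
 * Illustrative userspace sketch (an assumption for documentation only; not
 * part of this file or the kernel build): two of the receive-flag behaviors
 * implemented above, as seen from an application.  MSG_PEEK copies data
 * without consuming it from the receive buffer, and MSG_WAITALL keeps
 * soreceive() looping until the full request is satisfied (or an error/EOF
 * occurs).  The 4-byte length prefix is only an assumption used to make the
 * example concrete.
 */
#if 0
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <stdint.h>

/*
 * Peek at a 4-byte length prefix, then read exactly that many bytes.
 * Returns the payload length, or -1 on error/incomplete data.
 */
static ssize_t
read_framed(int s, char *buf, size_t buflen)
{
	uint32_t netlen, len;
	ssize_t n;

	/* Look at the header without removing it from the socket buffer. */
	n = recv(s, &netlen, sizeof (netlen), MSG_PEEK);
	if (n < (ssize_t)sizeof (netlen))
		return (-1);	/* error, EOF, or header not complete yet */
	len = ntohl(netlen);
	if (len > buflen)
		return (-1);

	/* Now consume the header for real... */
	if (recv(s, &netlen, sizeof (netlen), MSG_WAITALL) !=
	    (ssize_t)sizeof (netlen))
		return (-1);
	/* ...and block until the whole payload has arrived. */
	if (recv(s, buf, len, MSG_WAITALL) != (ssize_t)len)
		return (-1);
	return ((ssize_t)len);
}
#endif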
3698
3699 /*
3700 * Returns: 0 Success
3701 * uiomove:EFAULT
3702 */
3703 static int
3704 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3705 user_ssize_t *resid)
3706 {
3707 int error = 0;
3708 struct mbuf *m;
3709
3710 m = *free_list;
3711
3712 socket_unlock(so, 0);
3713
3714 while (m != NULL && error == 0) {
3715 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3716 m = m->m_next;
3717 }
3718 m_freem_list(*free_list);
3719
3720 *free_list = NULL;
3721 *resid = 0;
3722
3723 socket_lock(so, 0);
3724
3725 return (error);
3726 }
3727
3728 static int
3729 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3730 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3731 {
3732 #pragma unused(so)
3733 int error = 0;
3734 struct mbuf *ml, *m;
3735 int i = 0;
3736 struct uio *auio;
3737
3738 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3739 ml = ml->m_nextpkt, i++) {
3740 auio = msgarray[i].uio;
3741 for (m = ml; m != NULL; m = m->m_next) {
3742 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3743 if (error != 0)
3744 goto out;
3745 }
3746 }
3747 out:
3748 m_freem_list(*free_list);
3749
3750 *free_list = NULL;
3751 *resid = 0;
3752
3753 return (error);
3754 }
3755
3756 int
3757 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3758 int *flagsp)
3759 {
3760 struct mbuf *m;
3761 struct mbuf *nextrecord;
3762 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3763 int error;
3764 user_ssize_t len, pktlen, delayed_copy_len = 0;
3765 struct protosw *pr = so->so_proto;
3766 user_ssize_t resid;
3767 struct proc *p = current_proc();
3768 struct uio *auio = NULL;
3769 int npkts = 0;
3770 int sblocked = 0;
3771 struct sockaddr **psa = NULL;
3772 struct mbuf **controlp = NULL;
3773 int can_delay;
3774 int flags;
3775 struct mbuf *free_others = NULL;
3776
3777 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3778 so, uiocnt,
3779 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3780
3781 /*
3782 * Sanity checks:
3783 * - Only the "don't wait" flags are supported
3784 * - Only supports datagram sockets (could be extended to raw)
3785 * - Must be atomic
3786 * - Protocol must support packet chains
3787 * - The uio array must not be NULL (should we panic?)
3788 */
3789 if (flagsp != NULL)
3790 flags = *flagsp;
3791 else
3792 flags = 0;
3793 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3794 MSG_NBIO)) {
3795 printf("%s invalid flags 0x%x\n", __func__, flags);
3796 error = EINVAL;
3797 goto out;
3798 }
3799 if (so->so_type != SOCK_DGRAM) {
3800 error = EINVAL;
3801 goto out;
3802 }
3803 if (sosendallatonce(so) == 0) {
3804 error = EINVAL;
3805 goto out;
3806 }
3807 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3808 error = EPROTONOSUPPORT;
3809 goto out;
3810 }
3811 if (msgarray == NULL) {
3812 printf("%s uioarray is NULL\n", __func__);
3813 error = EINVAL;
3814 goto out;
3815 }
3816 if (uiocnt == 0) {
3817 printf("%s uiocnt is 0\n", __func__);
3818 error = EINVAL;
3819 goto out;
3820 }
3821 /*
3822 * Sanity check on the length passed by caller as we are making 'int'
3823 * comparisons
3824 */
3825 resid = recv_msg_array_resid(msgarray, uiocnt);
3826 if (resid < 0 || resid > INT_MAX) {
3827 error = EINVAL;
3828 goto out;
3829 }
3830
3831 if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3832 can_delay = 1;
3833 else
3834 can_delay = 0;
3835
3836 socket_lock(so, 1);
3837 so_update_last_owner_locked(so, p);
3838 so_update_policy(so);
3839
3840 #if NECP
3841 so_update_necp_policy(so, NULL, NULL);
3842 #endif /* NECP */
3843
3844 /*
3845 * If a recv attempt is made on a previously-accepted socket
3846 * that has been marked as inactive (disconnected), reject
3847 * the request.
3848 */
3849 if (so->so_flags & SOF_DEFUNCT) {
3850 struct sockbuf *sb = &so->so_rcv;
3851
3852 error = ENOTCONN;
3853 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
3854 __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3855 SOCK_DOM(so), SOCK_TYPE(so), error));
3856 /*
3857 * This socket should have been disconnected and flushed
3858 * prior to being returned from sodefunct(); there should
3859 * be no data on its receive list, so panic otherwise.
3860 */
3861 if (so->so_state & SS_DEFUNCT)
3862 sb_empty_assert(sb, __func__);
3863 goto release;
3864 }
3865
3866 next:
3867 /*
3868 * The uio may be empty
3869 */
3870 if (npkts >= uiocnt) {
3871 error = 0;
3872 goto release;
3873 }
3874 restart:
3875 /*
3876 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3877 * and if so just return to the caller. This could happen when
3878 * soreceive() is called by a socket upcall function during the
3879 * time the socket is freed. The socket buffer would have been
3880 * locked across the upcall, therefore we cannot put this thread
3881 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3882 * we may livelock), because the lock on the socket buffer will
3883 * only be released when the upcall routine returns to its caller.
3884 * Because the socket has been officially closed, there can be
3885 * no further read on it.
3886 */
3887 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3888 (SS_NOFDREF | SS_CANTRCVMORE)) {
3889 error = 0;
3890 goto release;
3891 }
3892
3893 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3894 if (error) {
3895 goto release;
3896 }
3897 sblocked = 1;
3898
3899 m = so->so_rcv.sb_mb;
3900 /*
3901 * Block awaiting more datagrams if needed
3902 */
3903 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3904 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3905 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
3906 /*
3907 * Panic if we notice inconsistencies in the socket's
3908 * receive list; both sb_mb and sb_cc should correctly
3909 * reflect the contents of the list, otherwise we may
3910 * end up with false positives during select() or poll()
3911 * which could put the application in a bad state.
3912 */
3913 SB_MB_CHECK(&so->so_rcv);
3914
3915 if (so->so_error) {
3916 error = so->so_error;
3917 if ((flags & MSG_PEEK) == 0)
3918 so->so_error = 0;
3919 goto release;
3920 }
3921 if (so->so_state & SS_CANTRCVMORE) {
3922 goto release;
3923 }
3924 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3925 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3926 error = ENOTCONN;
3927 goto release;
3928 }
3929 if ((so->so_state & SS_NBIO) ||
3930 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3931 error = EWOULDBLOCK;
3932 goto release;
3933 }
3934 /*
3935 * Do not block if we got some data
3936 */
3937 if (free_list != NULL) {
3938 error = 0;
3939 goto release;
3940 }
3941
3942 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3943 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3944
3945 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3946 sblocked = 0;
3947
3948 error = sbwait(&so->so_rcv);
3949 if (error) {
3950 goto release;
3951 }
3952 goto restart;
3953 }
3954
3955 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3956 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3957 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3958
3959 /*
3960 * Consume the current uio index as we have a datagram
3961 */
3962 auio = msgarray[npkts].uio;
3963 resid = uio_resid(auio);
3964 msgarray[npkts].which |= SOCK_MSG_DATA;
3965 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
3966 &msgarray[npkts].psa : NULL;
3967 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
3968 &msgarray[npkts].controlp : NULL;
3969 npkts += 1;
3970 nextrecord = m->m_nextpkt;
3971
3972 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3973 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
3974 if (error == ERESTART)
3975 goto restart;
3976 else if (error != 0)
3977 goto release;
3978 }
3979
3980 if (m != NULL && m->m_type == MT_CONTROL) {
3981 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3982 if (error != 0)
3983 goto release;
3984 }
3985
3986 if (m->m_pkthdr.len == 0) {
3987 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
3988 __func__, __LINE__,
3989 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3990 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
3991 m->m_type);
3992 }
3993
3994 /*
3995 * Loop to copy the mbufs of the current record
3996 * Support zero length packets
3997 */
3998 ml = NULL;
3999 pktlen = 0;
4000 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4001 if (m->m_len == 0)
4002 panic("%p m_len zero", m);
4003 if (m->m_type == 0)
4004 panic("%p m_type zero", m);
4005 /*
4006 * Clip to the residual length
4007 */
4008 if (len > m->m_len)
4009 len = m->m_len;
4010 pktlen += len;
4011 /*
4012 * Copy the mbufs via the uio or delay the copy
4013 * Sockbuf must be consistent here (points to current mbuf,
4014 * it points to next record) when we drop priority;
4015 * we must note any additions to the sockbuf when we
4016 * block interrupts again.
4017 */
4018 if (len > 0 && can_delay == 0) {
4019 socket_unlock(so, 0);
4020 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4021 socket_lock(so, 0);
4022 if (error)
4023 goto release;
4024 } else {
4025 delayed_copy_len += len;
4026 }
4027
4028 if (len == m->m_len) {
4029 /*
4030 * m was entirely copied
4031 */
4032 sbfree(&so->so_rcv, m);
4033 nextrecord = m->m_nextpkt;
4034 m->m_nextpkt = NULL;
4035
4036 /*
4037 * Set the first packet to the head of the free list
4038 */
4039 if (free_list == NULL)
4040 free_list = m;
4041 /*
4042 * Link current packet to tail of free list
4043 */
4044 if (ml == NULL) {
4045 if (free_tail != NULL)
4046 free_tail->m_nextpkt = m;
4047 free_tail = m;
4048 }
4049 /*
4050 * Link current mbuf to last mbuf of current packet
4051 */
4052 if (ml != NULL)
4053 ml->m_next = m;
4054 ml = m;
4055
4056 /*
4057 * Move next buf to head of socket buffer
4058 */
4059 so->so_rcv.sb_mb = m = ml->m_next;
4060 ml->m_next = NULL;
4061
4062 if (m != NULL) {
4063 m->m_nextpkt = nextrecord;
4064 if (nextrecord == NULL)
4065 so->so_rcv.sb_lastrecord = m;
4066 } else {
4067 so->so_rcv.sb_mb = nextrecord;
4068 SB_EMPTY_FIXUP(&so->so_rcv);
4069 }
4070 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4071 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4072 } else {
4073 /*
4074 * Stop the loop on partial copy
4075 */
4076 break;
4077 }
4078 }
4079 #ifdef MORE_LOCKING_DEBUG
4080 if (so->so_usecount <= 1) {
4081 panic("%s: after big while so=%llx ref=%d on socket\n",
4082 __func__,
4083 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4084 /* NOTREACHED */
4085 }
4086 #endif
4087 /*
4088 * Tell the caller we made a partial copy
4089 */
4090 if (m != NULL) {
4091 if (so->so_options & SO_DONTTRUNC) {
4092 /*
4093 * Copyout first the freelist then the partial mbuf
4094 */
4095 socket_unlock(so, 0);
4096 if (delayed_copy_len)
4097 error = sodelayed_copy_list(so, msgarray,
4098 uiocnt, &free_list, &delayed_copy_len);
4099
4100 if (error == 0) {
4101 error = uiomove(mtod(m, caddr_t), (int)len,
4102 auio);
4103 }
4104 socket_lock(so, 0);
4105 if (error)
4106 goto release;
4107
4108 m->m_data += len;
4109 m->m_len -= len;
4110 so->so_rcv.sb_cc -= len;
4111 flags |= MSG_RCVMORE;
4112 } else {
4113 (void) sbdroprecord(&so->so_rcv);
4114 nextrecord = so->so_rcv.sb_mb;
4115 m = NULL;
4116 flags |= MSG_TRUNC;
4117 }
4118 }
4119
4120 if (m == NULL) {
4121 so->so_rcv.sb_mb = nextrecord;
4122 /*
4123 * First part is an inline SB_EMPTY_FIXUP(). Second
4124 * part makes sure sb_lastrecord is up-to-date if
4125 * there is still data in the socket buffer.
4126 */
4127 if (so->so_rcv.sb_mb == NULL) {
4128 so->so_rcv.sb_mbtail = NULL;
4129 so->so_rcv.sb_lastrecord = NULL;
4130 } else if (nextrecord->m_nextpkt == NULL) {
4131 so->so_rcv.sb_lastrecord = nextrecord;
4132 }
4133 SB_MB_CHECK(&so->so_rcv);
4134 }
4135 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4136 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4137
4138 /*
4139 * We can continue to the next packet as long as:
4140 * - We haven't exhausted the uio array
4141 * - There was no error
4142 * - A packet was not truncated
4143 * - We can still receive more data
4144 */
4145 if (npkts < uiocnt && error == 0 &&
4146 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4147 (so->so_state & SS_CANTRCVMORE) == 0) {
4148 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4149 sblocked = 0;
4150
4151 goto next;
4152 }
4153 if (flagsp != NULL)
4154 *flagsp |= flags;
4155
4156 release:
4157 /*
4158 * pru_rcvd may cause more data to be received if the socket lock
4159 * is dropped so we set MSG_HAVEMORE now based on what we know.
4160 * That way the caller won't be surprised if it receives less data
4161 * than requested.
4162 */
4163 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4164 flags |= MSG_HAVEMORE;
4165
4166 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4167 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4168
4169 if (sblocked)
4170 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4171 else
4172 socket_unlock(so, 1);
4173
4174 if (delayed_copy_len)
4175 error = sodelayed_copy_list(so, msgarray, uiocnt,
4176 &free_list, &delayed_copy_len);
4177 out:
4178 /*
4179 * Amortize the cost of freeing the mbufs
4180 */
4181 if (free_list != NULL)
4182 m_freem_list(free_list);
4183 if (free_others != NULL)
4184 m_freem_list(free_others);
4185
4186 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4187 0, 0, 0, 0);
4188 return (error);
4189 }
4190
4191 /*
4192 * Returns: 0 Success
4193 * EINVAL
4194 * ENOTCONN
4195 * <pru_shutdown>:EINVAL
4196 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4197 * <pru_shutdown>:ENOBUFS[TCP]
4198 * <pru_shutdown>:EMSGSIZE[TCP]
4199 * <pru_shutdown>:EHOSTUNREACH[TCP]
4200 * <pru_shutdown>:ENETUNREACH[TCP]
4201 * <pru_shutdown>:ENETDOWN[TCP]
4202 * <pru_shutdown>:ENOMEM[TCP]
4203 * <pru_shutdown>:EACCES[TCP]
4204 * <pru_shutdown>:EMSGSIZE[TCP]
4205 * <pru_shutdown>:ENOBUFS[TCP]
4206 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4207 * <pru_shutdown>:??? [other protocol families]
4208 */
4209 int
4210 soshutdown(struct socket *so, int how)
4211 {
4212 int error;
4213
4214 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4215
4216 switch (how) {
4217 case SHUT_RD:
4218 case SHUT_WR:
4219 case SHUT_RDWR:
4220 socket_lock(so, 1);
4221 if ((so->so_state &
4222 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4223 error = ENOTCONN;
4224 } else {
4225 error = soshutdownlock(so, how);
4226 }
4227 socket_unlock(so, 1);
4228 break;
4229 default:
4230 error = EINVAL;
4231 break;
4232 }
4233
4234 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4235
4236 return (error);
4237 }
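/*
 * Illustrative user-space sketch (not part of the kernel sources): the
 * shutdown(2) system call lands in soshutdown() above.  A typical half-close
 * stops sending with SHUT_WR and then drains the read side until EOF.  The
 * descriptor `fd` is assumed to be a connected stream socket; an unconnected
 * socket gets ENOTCONN, as enforced above.
 */
#include <sys/socket.h>
#include <stdio.h>
#include <unistd.h>

static int
half_close_and_drain(int fd)
{
        char buf[4096];
        ssize_t n;

        if (shutdown(fd, SHUT_WR) == -1) {      /* no more writes from our side */
                perror("shutdown");
                return (-1);
        }
        while ((n = read(fd, buf, sizeof (buf))) > 0)
                ;                               /* keep reading until the peer closes */
        return (n == 0 ? 0 : -1);               /* 0 == clean EOF */
}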
4238
4239 int
4240 soshutdownlock_final(struct socket *so, int how)
4241 {
4242 struct protosw *pr = so->so_proto;
4243 int error = 0;
4244
4245 sflt_notify(so, sock_evt_shutdown, &how);
4246
4247 if (how != SHUT_WR) {
4248 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4249 /* read already shut down */
4250 error = ENOTCONN;
4251 goto done;
4252 }
4253 sorflush(so);
4254 postevent(so, 0, EV_RCLOSED);
4255 }
4256 if (how != SHUT_RD) {
4257 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4258 /* write already shut down */
4259 error = ENOTCONN;
4260 goto done;
4261 }
4262 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4263 postevent(so, 0, EV_WCLOSED);
4264 }
4265 done:
4266 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4267 return (error);
4268 }
4269
4270 int
4271 soshutdownlock(struct socket *so, int how)
4272 {
4273 int error = 0;
4274
4275 #if CONTENT_FILTER
4276 /*
4277 * A content filter may delay the actual shutdown until it
4278 * has processed the pending data
4279 */
4280 if (so->so_flags & SOF_CONTENT_FILTER) {
4281 error = cfil_sock_shutdown(so, &how);
4282 if (error == EJUSTRETURN) {
4283 error = 0;
4284 goto done;
4285 } else if (error != 0) {
4286 goto done;
4287 }
4288 }
4289 #endif /* CONTENT_FILTER */
4290
4291 error = soshutdownlock_final(so, how);
4292
4293 done:
4294 return (error);
4295 }
4296
4297 void
4298 sowflush(struct socket *so)
4299 {
4300 struct sockbuf *sb = &so->so_snd;
4301 #ifdef notyet
4302 lck_mtx_t *mutex_held;
4303 /*
4304 * XXX: This code is currently commented out, because we may get here
4305 * as part of sofreelastref(), and at that time, pr_getlock() may no
4306 * longer be able to return us the lock; this will be fixed in future.
4307 */
4308 if (so->so_proto->pr_getlock != NULL)
4309 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4310 else
4311 mutex_held = so->so_proto->pr_domain->dom_mtx;
4312
4313 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4314 #endif /* notyet */
4315
4316 /*
4317 * Obtain lock on the socket buffer (SB_LOCK). This is required
4318 * to prevent the socket buffer from being unexpectedly altered
4319 * while it is used by another thread in socket send/receive.
4320 *
4321 * sblock() must not fail here, hence the assertion.
4322 */
4323 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4324 VERIFY(sb->sb_flags & SB_LOCK);
4325
4326 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4327 sb->sb_flags |= SB_DROP;
4328 sb->sb_upcall = NULL;
4329 sb->sb_upcallarg = NULL;
4330
4331 sbunlock(sb, TRUE); /* keep socket locked */
4332
4333 selthreadclear(&sb->sb_sel);
4334 sbrelease(sb);
4335 }
4336
4337 void
4338 sorflush(struct socket *so)
4339 {
4340 struct sockbuf *sb = &so->so_rcv;
4341 struct protosw *pr = so->so_proto;
4342 struct sockbuf asb;
4343 #ifdef notyet
4344 lck_mtx_t *mutex_held;
4345 /*
4346 * XXX: This code is currently commented out, because we may get here
4347 * as part of sofreelastref(), and at that time, pr_getlock() may no
4348 * longer be able to return us the lock; this will be fixed in future.
4349 */
4350 if (so->so_proto->pr_getlock != NULL)
4351 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4352 else
4353 mutex_held = so->so_proto->pr_domain->dom_mtx;
4354
4355 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4356 #endif /* notyet */
4357
4358 sflt_notify(so, sock_evt_flush_read, NULL);
4359
4360 socantrcvmore(so);
4361
4362 /*
4363 * Obtain lock on the socket buffer (SB_LOCK). This is required
4364 * to prevent the socket buffer from being unexpectedly altered
4365 * while it is used by another thread in socket send/receive.
4366 *
4367 * sblock() must not fail here, hence the assertion.
4368 */
4369 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4370 VERIFY(sb->sb_flags & SB_LOCK);
4371
4372 /*
4373 * Copy only the relevant fields from "sb" to "asb" which we
4374 * need for sbrelease() to function. In particular, skip
4375 * sb_sel as it contains the wait queue linkage, which would
4376 * wreak havoc if we were to issue selthreadclear() on "asb".
4377 * Make sure to not carry over SB_LOCK in "asb", as we need
4378 * to acquire it later as part of sbrelease().
4379 */
4380 bzero(&asb, sizeof (asb));
4381 asb.sb_cc = sb->sb_cc;
4382 asb.sb_hiwat = sb->sb_hiwat;
4383 asb.sb_mbcnt = sb->sb_mbcnt;
4384 asb.sb_mbmax = sb->sb_mbmax;
4385 asb.sb_ctl = sb->sb_ctl;
4386 asb.sb_lowat = sb->sb_lowat;
4387 asb.sb_mb = sb->sb_mb;
4388 asb.sb_mbtail = sb->sb_mbtail;
4389 asb.sb_lastrecord = sb->sb_lastrecord;
4390 asb.sb_so = sb->sb_so;
4391 asb.sb_flags = sb->sb_flags;
4392 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4393 asb.sb_flags |= SB_DROP;
4394
4395 /*
4396 * Ideally we'd bzero() these and preserve the ones we need;
4397 * but to do that we'd need to shuffle things around in the
4398 * sockbuf, and we can't do it now because there are KEXTS
4399 * that are directly referring to the socket structure.
4400 *
4401 * Setting SB_DROP acts as a barrier to prevent further appends.
4402 * Clearing SB_SEL is done for selthreadclear() below.
4403 */
4404 sb->sb_cc = 0;
4405 sb->sb_hiwat = 0;
4406 sb->sb_mbcnt = 0;
4407 sb->sb_mbmax = 0;
4408 sb->sb_ctl = 0;
4409 sb->sb_lowat = 0;
4410 sb->sb_mb = NULL;
4411 sb->sb_mbtail = NULL;
4412 sb->sb_lastrecord = NULL;
4413 sb->sb_timeo.tv_sec = 0;
4414 sb->sb_timeo.tv_usec = 0;
4415 sb->sb_upcall = NULL;
4416 sb->sb_upcallarg = NULL;
4417 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4418 sb->sb_flags |= SB_DROP;
4419
4420 sbunlock(sb, TRUE); /* keep socket locked */
4421
4422 /*
4423 * Note that selthreadclear() is called on the original "sb" and
4424 * not the local "asb" because of the way wait queue linkage is
4425 * implemented. Given that selwakeup() may be triggered, SB_SEL
4426 * should no longer be set (cleared above.)
4427 */
4428 selthreadclear(&sb->sb_sel);
4429
4430 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4431 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4432
4433 sbrelease(&asb);
4434 }
4435
4436 /*
4437 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4438 * an additional variant to handle the case where the option value needs
4439 * to be some kind of integer, but not a specific size.
4440 * In addition to their use here, these functions are also called by the
4441 * protocol-level pr_ctloutput() routines.
4442 *
4443 * Returns: 0 Success
4444 * EINVAL
4445 * copyin:EFAULT
4446 */
4447 int
4448 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4449 {
4450 size_t valsize;
4451
4452 /*
4453 * If the user gives us more than we wanted, we ignore it,
4454 * but if we don't get the minimum length the caller
4455 * wants, we return EINVAL. On success, sopt->sopt_valsize
4456 * is set to however much we actually retrieved.
4457 */
4458 if ((valsize = sopt->sopt_valsize) < minlen)
4459 return (EINVAL);
4460 if (valsize > len)
4461 sopt->sopt_valsize = valsize = len;
4462
4463 if (sopt->sopt_p != kernproc)
4464 return (copyin(sopt->sopt_val, buf, valsize));
4465
4466 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4467 return (0);
4468 }
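/*
 * Minimal kernel-side sketch of the calling pattern documented above: a
 * protocol pr_ctloutput() SET handler validates and copies in an integer
 * option with sooptcopyin().  EXAMPLE_SO_OPTION and the handler itself are
 * hypothetical; only the sooptcopyin() usage mirrors this file.
 */
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/errno.h>

#define EXAMPLE_SO_OPTION       0x10000         /* hypothetical option number */

static int
example_ctloutput_set(struct socket *so, struct sockopt *sopt)
{
#pragma unused(so)
        int error, optval;

        switch (sopt->sopt_name) {
        case EXAMPLE_SO_OPTION:
                /* EINVAL if the caller supplied less than sizeof (int) */
                error = sooptcopyin(sopt, &optval, sizeof (optval),
                    sizeof (optval));
                if (error != 0)
                        break;
                /* apply optval to the protocol control block here */
                break;
        default:
                error = ENOPROTOOPT;
                break;
        }
        return (error);
}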
4469
4470 /*
4471 * sooptcopyin_timeval
4472 * Copy in a timeval value into tv_p, and take into account whether the
4473 * calling process is 64-bit or 32-bit. Moved the sanity checking
4474 * code here so that we can verify the 64-bit tv_sec value before we lose
4475 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4476 */
4477 static int
4478 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4479 {
4480 int error;
4481
4482 if (proc_is64bit(sopt->sopt_p)) {
4483 struct user64_timeval tv64;
4484
4485 if (sopt->sopt_valsize < sizeof (tv64))
4486 return (EINVAL);
4487
4488 sopt->sopt_valsize = sizeof (tv64);
4489 if (sopt->sopt_p != kernproc) {
4490 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4491 if (error != 0)
4492 return (error);
4493 } else {
4494 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4495 sizeof (tv64));
4496 }
4497 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4498 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4499 return (EDOM);
4500
4501 tv_p->tv_sec = tv64.tv_sec;
4502 tv_p->tv_usec = tv64.tv_usec;
4503 } else {
4504 struct user32_timeval tv32;
4505
4506 if (sopt->sopt_valsize < sizeof (tv32))
4507 return (EINVAL);
4508
4509 sopt->sopt_valsize = sizeof (tv32);
4510 if (sopt->sopt_p != kernproc) {
4511 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4512 if (error != 0) {
4513 return (error);
4514 }
4515 } else {
4516 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4517 sizeof (tv32));
4518 }
4519 #ifndef __LP64__
4520 /*
4521 * K64todo "comparison is always false due to
4522 * limited range of data type"
4523 */
4524 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4525 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4526 return (EDOM);
4527 #endif
4528 tv_p->tv_sec = tv32.tv_sec;
4529 tv_p->tv_usec = tv32.tv_usec;
4530 }
4531 return (0);
4532 }
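/*
 * User-space sketch of the checks above: SO_RCVTIMEO / SO_SNDTIMEO take a
 * struct timeval, and a tv_usec outside [0, 1000000) is rejected with EDOM
 * by sooptcopyin_timeval().  `fd` is any socket descriptor.
 */
#include <sys/socket.h>
#include <sys/time.h>
#include <stdio.h>

static int
set_recv_timeout(int fd, time_t sec, suseconds_t usec)
{
        struct timeval tv = { .tv_sec = sec, .tv_usec = usec };

        if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)) == -1) {
                perror("setsockopt(SO_RCVTIMEO)");      /* EDOM, EINVAL, ... */
                return (-1);
        }
        return (0);
}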
4533
4534 /*
4535 * Returns: 0 Success
4536 * EINVAL
4537 * ENOPROTOOPT
4538 * ENOBUFS
4539 * EDOM
4540 * sooptcopyin:EINVAL
4541 * sooptcopyin:EFAULT
4542 * sooptcopyin_timeval:EINVAL
4543 * sooptcopyin_timeval:EFAULT
4544 * sooptcopyin_timeval:EDOM
4545 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4546 * <pr_ctloutput>:???
4547 * sflt_attach_private:??? [whatever a filter author chooses]
4548 * <sf_setoption>:??? [whatever a filter author chooses]
4549 *
4550 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4551 * <sf_setoption> returns depend on what the filter author causes
4552 * their filter to return.
4553 */
4554 int
4555 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4556 {
4557 int error, optval;
4558 struct linger l;
4559 struct timeval tv;
4560 #if CONFIG_MACF_SOCKET
4561 struct mac extmac;
4562 #endif /* MAC_SOCKET */
4563
4564 if (sopt->sopt_dir != SOPT_SET)
4565 sopt->sopt_dir = SOPT_SET;
4566
4567 if (dolock)
4568 socket_lock(so, 1);
4569
4570 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4571 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4572 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4573 /* the socket has been shutdown, no more sockopt's */
4574 error = EINVAL;
4575 goto out;
4576 }
4577
4578 error = sflt_setsockopt(so, sopt);
4579 if (error != 0) {
4580 if (error == EJUSTRETURN)
4581 error = 0;
4582 goto out;
4583 }
4584
4585 if (sopt->sopt_level != SOL_SOCKET) {
4586 if (so->so_proto != NULL &&
4587 so->so_proto->pr_ctloutput != NULL) {
4588 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4589 goto out;
4590 }
4591 error = ENOPROTOOPT;
4592 } else {
4593 /*
4594 * Allow socket-level (SOL_SOCKET) options to be filtered by
4595 * the protocol layer, if needed. A zero value returned from
4596 * the handler means use default socket-level processing as
4597 * done by the rest of this routine. Otherwise, any other
4598 * return value indicates that the option is unsupported.
4599 */
4600 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4601 pru_socheckopt(so, sopt)) != 0)
4602 goto out;
4603
4604 error = 0;
4605 switch (sopt->sopt_name) {
4606 case SO_LINGER:
4607 case SO_LINGER_SEC:
4608 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4609 if (error != 0)
4610 goto out;
4611
4612 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4613 l.l_linger : l.l_linger * hz;
4614 if (l.l_onoff != 0)
4615 so->so_options |= SO_LINGER;
4616 else
4617 so->so_options &= ~SO_LINGER;
4618 break;
4619
4620 case SO_DEBUG:
4621 case SO_KEEPALIVE:
4622 case SO_DONTROUTE:
4623 case SO_USELOOPBACK:
4624 case SO_BROADCAST:
4625 case SO_REUSEADDR:
4626 case SO_REUSEPORT:
4627 case SO_OOBINLINE:
4628 case SO_TIMESTAMP:
4629 case SO_TIMESTAMP_MONOTONIC:
4630 case SO_DONTTRUNC:
4631 case SO_WANTMORE:
4632 case SO_WANTOOBFLAG:
4633 case SO_NOWAKEFROMSLEEP:
4634 error = sooptcopyin(sopt, &optval, sizeof (optval),
4635 sizeof (optval));
4636 if (error != 0)
4637 goto out;
4638 if (optval)
4639 so->so_options |= sopt->sopt_name;
4640 else
4641 so->so_options &= ~sopt->sopt_name;
4642 break;
4643
4644 case SO_SNDBUF:
4645 case SO_RCVBUF:
4646 case SO_SNDLOWAT:
4647 case SO_RCVLOWAT:
4648 error = sooptcopyin(sopt, &optval, sizeof (optval),
4649 sizeof (optval));
4650 if (error != 0)
4651 goto out;
4652
4653 /*
4654 * Values < 1 make no sense for any of these
4655 * options, so disallow them.
4656 */
4657 if (optval < 1) {
4658 error = EINVAL;
4659 goto out;
4660 }
4661
4662 switch (sopt->sopt_name) {
4663 case SO_SNDBUF:
4664 case SO_RCVBUF: {
4665 struct sockbuf *sb =
4666 (sopt->sopt_name == SO_SNDBUF) ?
4667 &so->so_snd : &so->so_rcv;
4668 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4669 error = ENOBUFS;
4670 goto out;
4671 }
4672 sb->sb_flags |= SB_USRSIZE;
4673 sb->sb_flags &= ~SB_AUTOSIZE;
4674 sb->sb_idealsize = (u_int32_t)optval;
4675 break;
4676 }
4677 /*
4678 * Make sure the low-water is never greater than
4679 * the high-water.
4680 */
4681 case SO_SNDLOWAT: {
4682 int space = sbspace(&so->so_snd);
4683 u_int32_t hiwat = so->so_snd.sb_hiwat;
4684
4685 if (so->so_snd.sb_flags & SB_UNIX) {
4686 struct unpcb *unp =
4687 (struct unpcb *)(so->so_pcb);
4688 if (unp != NULL &&
4689 unp->unp_conn != NULL) {
4690 hiwat += unp->unp_conn->unp_cc;
4691 }
4692 }
4693
4694 so->so_snd.sb_lowat =
4695 (optval > hiwat) ?
4696 hiwat : optval;
4697
4698 if (space >= so->so_snd.sb_lowat) {
4699 sowwakeup(so);
4700 }
4701 break;
4702 }
4703 case SO_RCVLOWAT: {
4704 int64_t data_len;
4705 so->so_rcv.sb_lowat =
4706 (optval > so->so_rcv.sb_hiwat) ?
4707 so->so_rcv.sb_hiwat : optval;
4708 data_len = so->so_rcv.sb_cc
4709 - so->so_rcv.sb_ctl;
4710 if (data_len >= so->so_rcv.sb_lowat)
4711 sorwakeup(so);
4712 break;
4713 }
4714 }
4715 break;
4716
4717 case SO_SNDTIMEO:
4718 case SO_RCVTIMEO:
4719 error = sooptcopyin_timeval(sopt, &tv);
4720 if (error != 0)
4721 goto out;
4722
4723 switch (sopt->sopt_name) {
4724 case SO_SNDTIMEO:
4725 so->so_snd.sb_timeo = tv;
4726 break;
4727 case SO_RCVTIMEO:
4728 so->so_rcv.sb_timeo = tv;
4729 break;
4730 }
4731 break;
4732
4733 case SO_NKE: {
4734 struct so_nke nke;
4735
4736 error = sooptcopyin(sopt, &nke, sizeof (nke),
4737 sizeof (nke));
4738 if (error != 0)
4739 goto out;
4740
4741 error = sflt_attach_internal(so, nke.nke_handle);
4742 break;
4743 }
4744
4745 case SO_NOSIGPIPE:
4746 error = sooptcopyin(sopt, &optval, sizeof (optval),
4747 sizeof (optval));
4748 if (error != 0)
4749 goto out;
4750 if (optval != 0)
4751 so->so_flags |= SOF_NOSIGPIPE;
4752 else
4753 so->so_flags &= ~SOF_NOSIGPIPE;
4754 break;
4755
4756 case SO_NOADDRERR:
4757 error = sooptcopyin(sopt, &optval, sizeof (optval),
4758 sizeof (optval));
4759 if (error != 0)
4760 goto out;
4761 if (optval != 0)
4762 so->so_flags |= SOF_NOADDRAVAIL;
4763 else
4764 so->so_flags &= ~SOF_NOADDRAVAIL;
4765 break;
4766
4767 case SO_REUSESHAREUID:
4768 error = sooptcopyin(sopt, &optval, sizeof (optval),
4769 sizeof (optval));
4770 if (error != 0)
4771 goto out;
4772 if (optval != 0)
4773 so->so_flags |= SOF_REUSESHAREUID;
4774 else
4775 so->so_flags &= ~SOF_REUSESHAREUID;
4776 break;
4777
4778 case SO_NOTIFYCONFLICT:
4779 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4780 error = EPERM;
4781 goto out;
4782 }
4783 error = sooptcopyin(sopt, &optval, sizeof (optval),
4784 sizeof (optval));
4785 if (error != 0)
4786 goto out;
4787 if (optval != 0)
4788 so->so_flags |= SOF_NOTIFYCONFLICT;
4789 else
4790 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4791 break;
4792
4793 case SO_RESTRICTIONS:
4794 error = sooptcopyin(sopt, &optval, sizeof (optval),
4795 sizeof (optval));
4796 if (error != 0)
4797 goto out;
4798
4799 error = so_set_restrictions(so, optval);
4800 break;
4801
4802 case SO_AWDL_UNRESTRICTED:
4803 if (SOCK_DOM(so) != PF_INET &&
4804 SOCK_DOM(so) != PF_INET6) {
4805 error = EOPNOTSUPP;
4806 goto out;
4807 }
4808 error = sooptcopyin(sopt, &optval, sizeof(optval),
4809 sizeof(optval));
4810 if (error != 0)
4811 goto out;
4812 if (optval != 0) {
4813 kauth_cred_t cred = NULL;
4814 proc_t ep = PROC_NULL;
4815
4816 if (so->so_flags & SOF_DELEGATED) {
4817 ep = proc_find(so->e_pid);
4818 if (ep)
4819 cred = kauth_cred_proc_ref(ep);
4820 }
4821 error = priv_check_cred(
4822 cred ? cred : so->so_cred,
4823 PRIV_NET_RESTRICTED_AWDL, 0);
4824 if (error == 0)
4825 inp_set_awdl_unrestricted(
4826 sotoinpcb(so));
4827 if (cred)
4828 kauth_cred_unref(&cred);
4829 if (ep != PROC_NULL)
4830 proc_rele(ep);
4831 } else
4832 inp_clear_awdl_unrestricted(sotoinpcb(so));
4833 break;
4834
4835 case SO_LABEL:
4836 #if CONFIG_MACF_SOCKET
4837 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4838 sizeof (extmac))) != 0)
4839 goto out;
4840
4841 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
4842 so, &extmac);
4843 #else
4844 error = EOPNOTSUPP;
4845 #endif /* MAC_SOCKET */
4846 break;
4847
4848 case SO_UPCALLCLOSEWAIT:
4849 error = sooptcopyin(sopt, &optval, sizeof (optval),
4850 sizeof (optval));
4851 if (error != 0)
4852 goto out;
4853 if (optval != 0)
4854 so->so_flags |= SOF_UPCALLCLOSEWAIT;
4855 else
4856 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
4857 break;
4858
4859 case SO_RANDOMPORT:
4860 error = sooptcopyin(sopt, &optval, sizeof (optval),
4861 sizeof (optval));
4862 if (error != 0)
4863 goto out;
4864 if (optval != 0)
4865 so->so_flags |= SOF_BINDRANDOMPORT;
4866 else
4867 so->so_flags &= ~SOF_BINDRANDOMPORT;
4868 break;
4869
4870 case SO_NP_EXTENSIONS: {
4871 struct so_np_extensions sonpx;
4872
4873 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
4874 sizeof (sonpx));
4875 if (error != 0)
4876 goto out;
4877 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
4878 error = EINVAL;
4879 goto out;
4880 }
4881 /*
4882 * Only one bit defined for now
4883 */
4884 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
4885 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
4886 so->so_flags |= SOF_NPX_SETOPTSHUT;
4887 else
4888 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
4889 }
4890 break;
4891 }
4892
4893 case SO_TRAFFIC_CLASS: {
4894 error = sooptcopyin(sopt, &optval, sizeof (optval),
4895 sizeof (optval));
4896 if (error != 0)
4897 goto out;
4898 error = so_set_traffic_class(so, optval);
4899 if (error != 0)
4900 goto out;
4901 break;
4902 }
4903
4904 case SO_RECV_TRAFFIC_CLASS: {
4905 error = sooptcopyin(sopt, &optval, sizeof (optval),
4906 sizeof (optval));
4907 if (error != 0)
4908 goto out;
4909 if (optval == 0)
4910 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
4911 else
4912 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
4913 break;
4914 }
4915
4916 case SO_TRAFFIC_CLASS_DBG: {
4917 struct so_tcdbg so_tcdbg;
4918
4919 error = sooptcopyin(sopt, &so_tcdbg,
4920 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
4921 if (error != 0)
4922 goto out;
4923 error = so_set_tcdbg(so, &so_tcdbg);
4924 if (error != 0)
4925 goto out;
4926 break;
4927 }
4928
4929 case SO_PRIVILEGED_TRAFFIC_CLASS:
4930 error = priv_check_cred(kauth_cred_get(),
4931 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
4932 if (error != 0)
4933 goto out;
4934 error = sooptcopyin(sopt, &optval, sizeof (optval),
4935 sizeof (optval));
4936 if (error != 0)
4937 goto out;
4938 if (optval == 0)
4939 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
4940 else
4941 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
4942 break;
4943
4944 case SO_DEFUNCTOK:
4945 error = sooptcopyin(sopt, &optval, sizeof (optval),
4946 sizeof (optval));
4947 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
4948 if (error == 0)
4949 error = EBADF;
4950 goto out;
4951 }
4952 /*
4953 * Any process can set SO_DEFUNCTOK (clear
4954 * SOF_NODEFUNCT), but only root can clear
4955 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
4956 */
4957 if (optval == 0 &&
4958 kauth_cred_issuser(kauth_cred_get()) == 0) {
4959 error = EPERM;
4960 goto out;
4961 }
4962 if (optval)
4963 so->so_flags &= ~SOF_NODEFUNCT;
4964 else
4965 so->so_flags |= SOF_NODEFUNCT;
4966
4967 if (SOCK_DOM(so) == PF_INET ||
4968 SOCK_DOM(so) == PF_INET6) {
4969 char s[MAX_IPv6_STR_LEN];
4970 char d[MAX_IPv6_STR_LEN];
4971 struct inpcb *inp = sotoinpcb(so);
4972
4973 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
4974 "%s:%d] is now marked as %seligible for "
4975 "defunct\n", __func__, proc_selfpid(),
4976 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4977 (SOCK_TYPE(so) == SOCK_STREAM) ?
4978 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
4979 ((SOCK_DOM(so) == PF_INET) ?
4980 (void *)&inp->inp_laddr.s_addr :
4981 (void *)&inp->in6p_laddr), s, sizeof (s)),
4982 ntohs(inp->in6p_lport),
4983 inet_ntop(SOCK_DOM(so),
4984 (SOCK_DOM(so) == PF_INET) ?
4985 (void *)&inp->inp_faddr.s_addr :
4986 (void *)&inp->in6p_faddr, d, sizeof (d)),
4987 ntohs(inp->in6p_fport),
4988 (so->so_flags & SOF_NODEFUNCT) ?
4989 "not " : ""));
4990 } else {
4991 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
4992 "now marked as %seligible for defunct\n",
4993 __func__, proc_selfpid(),
4994 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4995 SOCK_DOM(so), SOCK_TYPE(so),
4996 (so->so_flags & SOF_NODEFUNCT) ?
4997 "not " : ""));
4998 }
4999 break;
5000
5001 case SO_ISDEFUNCT:
5002 /* This option is not settable */
5003 error = EINVAL;
5004 break;
5005
5006 case SO_OPPORTUNISTIC:
5007 error = sooptcopyin(sopt, &optval, sizeof (optval),
5008 sizeof (optval));
5009 if (error == 0)
5010 error = so_set_opportunistic(so, optval);
5011 break;
5012
5013 case SO_FLUSH:
5014 /* This option is handled by lower layer(s) */
5015 error = 0;
5016 break;
5017
5018 case SO_RECV_ANYIF:
5019 error = sooptcopyin(sopt, &optval, sizeof (optval),
5020 sizeof (optval));
5021 if (error == 0)
5022 error = so_set_recv_anyif(so, optval);
5023 break;
5024
5025 case SO_TRAFFIC_MGT_BACKGROUND: {
5026 /* This option is handled by lower layer(s) */
5027 error = 0;
5028 break;
5029 }
5030
5031 #if FLOW_DIVERT
5032 case SO_FLOW_DIVERT_TOKEN:
5033 error = flow_divert_token_set(so, sopt);
5034 break;
5035 #endif /* FLOW_DIVERT */
5036
5037
5038 case SO_DELEGATED:
5039 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5040 sizeof (optval))) != 0)
5041 break;
5042
5043 error = so_set_effective_pid(so, optval, sopt->sopt_p);
5044 break;
5045
5046 case SO_DELEGATED_UUID: {
5047 uuid_t euuid;
5048
5049 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5050 sizeof (euuid))) != 0)
5051 break;
5052
5053 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5054 break;
5055 }
5056
5057 #if NECP
5058 case SO_NECP_ATTRIBUTES:
5059 error = necp_set_socket_attributes(so, sopt);
5060 break;
5061 #endif /* NECP */
5062
5063 #if MPTCP
5064 case SO_MPTCP_FASTJOIN:
5065 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5066 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5067 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5068 error = ENOPROTOOPT;
5069 break;
5070 }
5071
5072 error = sooptcopyin(sopt, &optval, sizeof (optval),
5073 sizeof (optval));
5074 if (error != 0)
5075 goto out;
5076 if (optval == 0)
5077 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
5078 else
5079 so->so_flags |= SOF_MPTCP_FASTJOIN;
5080 break;
5081 #endif /* MPTCP */
5082
5083 case SO_EXTENDED_BK_IDLE:
5084 error = sooptcopyin(sopt, &optval, sizeof (optval),
5085 sizeof (optval));
5086 if (error == 0)
5087 error = so_set_extended_bk_idle(so, optval);
5088 break;
5089
5090 default:
5091 error = ENOPROTOOPT;
5092 break;
5093 }
5094 if (error == 0 && so->so_proto != NULL &&
5095 so->so_proto->pr_ctloutput != NULL) {
5096 (void) so->so_proto->pr_ctloutput(so, sopt);
5097 }
5098 }
5099 out:
5100 if (dolock)
5101 socket_unlock(so, 1);
5102 return (error);
5103 }
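/*
 * User-space sketch exercising two of the socket-level options handled
 * above.  SO_LINGER_SEC takes its l_linger value in seconds (the handler
 * scales it by hz), and SO_NOSIGPIPE sets SOF_NOSIGPIPE so that writes on a
 * broken connection fail with EPIPE instead of raising SIGPIPE.  `fd` is
 * assumed to be a connected stream socket.
 */
#include <sys/socket.h>

static int
configure_socket(int fd)
{
        struct linger l = { .l_onoff = 1, .l_linger = 5 };      /* linger up to 5 s */
        int on = 1;

        if (setsockopt(fd, SOL_SOCKET, SO_LINGER_SEC, &l, sizeof (l)) == -1)
                return (-1);
        if (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof (on)) == -1)
                return (-1);
        return (0);
}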
5104
5105 /* Helper routines for getsockopt */
5106 int
5107 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5108 {
5109 int error;
5110 size_t valsize;
5111
5112 error = 0;
5113
5114 /*
5115 * Documented get behavior is that we always return a value,
5116 * possibly truncated to fit in the user's buffer.
5117 * Traditional behavior is that we always tell the user
5118 * precisely how much we copied, rather than something useful
5119 * like the total amount we had available for her.
5120 * Note that this interface is not idempotent; the entire answer must
5121 * be generated ahead of time.
5122 */
5123 valsize = min(len, sopt->sopt_valsize);
5124 sopt->sopt_valsize = valsize;
5125 if (sopt->sopt_val != USER_ADDR_NULL) {
5126 if (sopt->sopt_p != kernproc)
5127 error = copyout(buf, sopt->sopt_val, valsize);
5128 else
5129 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5130 }
5131 return (error);
5132 }
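/*
 * User-space view of the contract described above: getsockopt(2) always
 * copies out a value, possibly truncated to fit the caller's buffer, and
 * updates the length argument to the amount actually copied.  Sketch reading
 * the pending error with SO_ERROR (which also clears so_error; see the
 * SO_ERROR case in sogetoptlock() below).
 */
#include <sys/socket.h>

static int
pending_socket_error(int fd)
{
        int err = 0;
        socklen_t len = sizeof (err);

        if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == -1)
                return (-1);
        return (err);           /* 0 if no error is pending */
}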
5133
5134 static int
5135 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5136 {
5137 int error;
5138 size_t len;
5139 struct user64_timeval tv64;
5140 struct user32_timeval tv32;
5141 const void * val;
5142 size_t valsize;
5143
5144 error = 0;
5145 if (proc_is64bit(sopt->sopt_p)) {
5146 len = sizeof (tv64);
5147 tv64.tv_sec = tv_p->tv_sec;
5148 tv64.tv_usec = tv_p->tv_usec;
5149 val = &tv64;
5150 } else {
5151 len = sizeof (tv32);
5152 tv32.tv_sec = tv_p->tv_sec;
5153 tv32.tv_usec = tv_p->tv_usec;
5154 val = &tv32;
5155 }
5156 valsize = min(len, sopt->sopt_valsize);
5157 sopt->sopt_valsize = valsize;
5158 if (sopt->sopt_val != USER_ADDR_NULL) {
5159 if (sopt->sopt_p != kernproc)
5160 error = copyout(val, sopt->sopt_val, valsize);
5161 else
5162 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5163 }
5164 return (error);
5165 }
5166
5167 /*
5168 * Return: 0 Success
5169 * ENOPROTOOPT
5170 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5171 * <pr_ctloutput>:???
5172 * <sf_getoption>:???
5173 */
5174 int
5175 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5176 {
5177 int error, optval;
5178 struct linger l;
5179 struct timeval tv;
5180 #if CONFIG_MACF_SOCKET
5181 struct mac extmac;
5182 #endif /* MAC_SOCKET */
5183
5184 if (sopt->sopt_dir != SOPT_GET)
5185 sopt->sopt_dir = SOPT_GET;
5186
5187 if (dolock)
5188 socket_lock(so, 1);
5189
5190 error = sflt_getsockopt(so, sopt);
5191 if (error != 0) {
5192 if (error == EJUSTRETURN)
5193 error = 0;
5194 goto out;
5195 }
5196
5197 if (sopt->sopt_level != SOL_SOCKET) {
5198 if (so->so_proto != NULL &&
5199 so->so_proto->pr_ctloutput != NULL) {
5200 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5201 goto out;
5202 }
5203 error = ENOPROTOOPT;
5204 } else {
5205 /*
5206 * Allow socket-level (SOL_SOCKET) options to be filtered by
5207 * the protocol layer, if needed. A zero value returned from
5208 * the handler means use default socket-level processing as
5209 * done by the rest of this routine. Otherwise, any other
5210 * return value indicates that the option is unsupported.
5211 */
5212 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5213 pru_socheckopt(so, sopt)) != 0)
5214 goto out;
5215
5216 error = 0;
5217 switch (sopt->sopt_name) {
5218 case SO_LINGER:
5219 case SO_LINGER_SEC:
5220 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5221 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5222 so->so_linger : so->so_linger / hz;
5223 error = sooptcopyout(sopt, &l, sizeof (l));
5224 break;
5225
5226 case SO_USELOOPBACK:
5227 case SO_DONTROUTE:
5228 case SO_DEBUG:
5229 case SO_KEEPALIVE:
5230 case SO_REUSEADDR:
5231 case SO_REUSEPORT:
5232 case SO_BROADCAST:
5233 case SO_OOBINLINE:
5234 case SO_TIMESTAMP:
5235 case SO_TIMESTAMP_MONOTONIC:
5236 case SO_DONTTRUNC:
5237 case SO_WANTMORE:
5238 case SO_WANTOOBFLAG:
5239 case SO_NOWAKEFROMSLEEP:
5240 optval = so->so_options & sopt->sopt_name;
5241 integer:
5242 error = sooptcopyout(sopt, &optval, sizeof (optval));
5243 break;
5244
5245 case SO_TYPE:
5246 optval = so->so_type;
5247 goto integer;
5248
5249 case SO_NREAD:
5250 if (so->so_proto->pr_flags & PR_ATOMIC) {
5251 int pkt_total;
5252 struct mbuf *m1;
5253
5254 pkt_total = 0;
5255 m1 = so->so_rcv.sb_mb;
5256 while (m1 != NULL) {
5257 if (m1->m_type == MT_DATA ||
5258 m1->m_type == MT_HEADER ||
5259 m1->m_type == MT_OOBDATA)
5260 pkt_total += m1->m_len;
5261 m1 = m1->m_next;
5262 }
5263 optval = pkt_total;
5264 } else {
5265 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5266 }
5267 goto integer;
5268
5269 case SO_NUMRCVPKT:
5270 if (so->so_proto->pr_flags & PR_ATOMIC) {
5271 int cnt = 0;
5272 struct mbuf *m1;
5273
5274 m1 = so->so_rcv.sb_mb;
5275 while (m1 != NULL) {
5276 if (m1->m_type == MT_DATA ||
5277 m1->m_type == MT_HEADER ||
5278 m1->m_type == MT_OOBDATA)
5279 cnt += 1;
5280 m1 = m1->m_nextpkt;
5281 }
5282 optval = cnt;
5283 goto integer;
5284 } else {
5285 error = EINVAL;
5286 break;
5287 }
5288
5289 case SO_NWRITE:
5290 optval = so->so_snd.sb_cc;
5291 goto integer;
5292
5293 case SO_ERROR:
5294 optval = so->so_error;
5295 so->so_error = 0;
5296 goto integer;
5297
5298 case SO_SNDBUF: {
5299 u_int32_t hiwat = so->so_snd.sb_hiwat;
5300
5301 if (so->so_snd.sb_flags & SB_UNIX) {
5302 struct unpcb *unp =
5303 (struct unpcb *)(so->so_pcb);
5304 if (unp != NULL && unp->unp_conn != NULL) {
5305 hiwat += unp->unp_conn->unp_cc;
5306 }
5307 }
5308
5309 optval = hiwat;
5310 goto integer;
5311 }
5312 case SO_RCVBUF:
5313 optval = so->so_rcv.sb_hiwat;
5314 goto integer;
5315
5316 case SO_SNDLOWAT:
5317 optval = so->so_snd.sb_lowat;
5318 goto integer;
5319
5320 case SO_RCVLOWAT:
5321 optval = so->so_rcv.sb_lowat;
5322 goto integer;
5323
5324 case SO_SNDTIMEO:
5325 case SO_RCVTIMEO:
5326 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5327 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5328
5329 error = sooptcopyout_timeval(sopt, &tv);
5330 break;
5331
5332 case SO_NOSIGPIPE:
5333 optval = (so->so_flags & SOF_NOSIGPIPE);
5334 goto integer;
5335
5336 case SO_NOADDRERR:
5337 optval = (so->so_flags & SOF_NOADDRAVAIL);
5338 goto integer;
5339
5340 case SO_REUSESHAREUID:
5341 optval = (so->so_flags & SOF_REUSESHAREUID);
5342 goto integer;
5343
5344
5345 case SO_NOTIFYCONFLICT:
5346 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5347 goto integer;
5348
5349 case SO_RESTRICTIONS:
5350 optval = so_get_restrictions(so);
5351 goto integer;
5352
5353 case SO_AWDL_UNRESTRICTED:
5354 if (SOCK_DOM(so) == PF_INET ||
5355 SOCK_DOM(so) == PF_INET6) {
5356 optval = inp_get_awdl_unrestricted(
5357 sotoinpcb(so));
5358 goto integer;
5359 } else
5360 error = EOPNOTSUPP;
5361 break;
5362
5363 case SO_LABEL:
5364 #if CONFIG_MACF_SOCKET
5365 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5366 sizeof (extmac))) != 0 ||
5367 (error = mac_socket_label_get(proc_ucred(
5368 sopt->sopt_p), so, &extmac)) != 0)
5369 break;
5370
5371 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5372 #else
5373 error = EOPNOTSUPP;
5374 #endif /* MAC_SOCKET */
5375 break;
5376
5377 case SO_PEERLABEL:
5378 #if CONFIG_MACF_SOCKET
5379 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5380 sizeof (extmac))) != 0 ||
5381 (error = mac_socketpeer_label_get(proc_ucred(
5382 sopt->sopt_p), so, &extmac)) != 0)
5383 break;
5384
5385 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5386 #else
5387 error = EOPNOTSUPP;
5388 #endif /* MAC_SOCKET */
5389 break;
5390
5391 #ifdef __APPLE_API_PRIVATE
5392 case SO_UPCALLCLOSEWAIT:
5393 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5394 goto integer;
5395 #endif
5396 case SO_RANDOMPORT:
5397 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5398 goto integer;
5399
5400 case SO_NP_EXTENSIONS: {
5401 struct so_np_extensions sonpx;
5402
5403 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5404 SONPX_SETOPTSHUT : 0;
5405 sonpx.npx_mask = SONPX_MASK_VALID;
5406
5407 error = sooptcopyout(sopt, &sonpx,
5408 sizeof (struct so_np_extensions));
5409 break;
5410 }
5411
5412 case SO_TRAFFIC_CLASS:
5413 optval = so->so_traffic_class;
5414 goto integer;
5415
5416 case SO_RECV_TRAFFIC_CLASS:
5417 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5418 goto integer;
5419
5420 case SO_TRAFFIC_CLASS_STATS:
5421 error = sooptcopyout(sopt, &so->so_tc_stats,
5422 sizeof (so->so_tc_stats));
5423 break;
5424
5425 case SO_TRAFFIC_CLASS_DBG:
5426 error = sogetopt_tcdbg(so, sopt);
5427 break;
5428
5429 case SO_PRIVILEGED_TRAFFIC_CLASS:
5430 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5431 goto integer;
5432
5433 case SO_DEFUNCTOK:
5434 optval = !(so->so_flags & SOF_NODEFUNCT);
5435 goto integer;
5436
5437 case SO_ISDEFUNCT:
5438 optval = (so->so_flags & SOF_DEFUNCT);
5439 goto integer;
5440
5441 case SO_OPPORTUNISTIC:
5442 optval = so_get_opportunistic(so);
5443 goto integer;
5444
5445 case SO_FLUSH:
5446 /* This option is not gettable */
5447 error = EINVAL;
5448 break;
5449
5450 case SO_RECV_ANYIF:
5451 optval = so_get_recv_anyif(so);
5452 goto integer;
5453
5454 case SO_TRAFFIC_MGT_BACKGROUND:
5455 /* This option is handled by lower layer(s) */
5456 if (so->so_proto != NULL &&
5457 so->so_proto->pr_ctloutput != NULL) {
5458 (void) so->so_proto->pr_ctloutput(so, sopt);
5459 }
5460 break;
5461
5462 #if FLOW_DIVERT
5463 case SO_FLOW_DIVERT_TOKEN:
5464 error = flow_divert_token_get(so, sopt);
5465 break;
5466 #endif /* FLOW_DIVERT */
5467
5468 #if NECP
5469 case SO_NECP_ATTRIBUTES:
5470 error = necp_get_socket_attributes(so, sopt);
5471 break;
5472 #endif /* NECP */
5473
5474 #if CONTENT_FILTER
5475 case SO_CFIL_SOCK_ID: {
5476 cfil_sock_id_t sock_id;
5477
5478 sock_id = cfil_sock_id_from_socket(so);
5479
5480 error = sooptcopyout(sopt, &sock_id,
5481 sizeof(cfil_sock_id_t));
5482 break;
5483 }
5484 #endif /* CONTENT_FILTER */
5485
5486 #if MPTCP
5487 case SO_MPTCP_FASTJOIN:
5488 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5489 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5490 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5491 error = ENOPROTOOPT;
5492 break;
5493 }
5494 optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5495 /* Fixed along with rdar://19391339 */
5496 goto integer;
5497 #endif /* MPTCP */
5498
5499 case SO_EXTENDED_BK_IDLE:
5500 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5501 goto integer;
5502
5503 default:
5504 error = ENOPROTOOPT;
5505 break;
5506 }
5507 }
5508 out:
5509 if (dolock)
5510 socket_unlock(so, 1);
5511 return (error);
5512 }
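/*
 * User-space sketch for the Darwin-specific SO_NREAD case above: for atomic
 * (datagram) protocols it reports the size of the next record, and for
 * stream sockets the total buffered data bytes, excluding control bytes.
 */
#include <sys/socket.h>

static int
bytes_readable(int fd)
{
        int nread = 0;
        socklen_t len = sizeof (nread);

        if (getsockopt(fd, SOL_SOCKET, SO_NREAD, &nread, &len) == -1)
                return (-1);
        return (nread);
}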
5513
5514 /*
5515 * The size limits on our soopt_getm are different from those on FreeBSD.
5516 * We limit the size of options to MCLBYTES. This will have to change
5517 * if we need to define options that need more space than MCLBYTES.
5518 */
5519 int
5520 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5521 {
5522 struct mbuf *m, *m_prev;
5523 int sopt_size = sopt->sopt_valsize;
5524 int how;
5525
5526 if (sopt_size <= 0 || sopt_size > MCLBYTES)
5527 return (EMSGSIZE);
5528
5529 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5530 MGET(m, how, MT_DATA);
5531 if (m == NULL)
5532 return (ENOBUFS);
5533 if (sopt_size > MLEN) {
5534 MCLGET(m, how);
5535 if ((m->m_flags & M_EXT) == 0) {
5536 m_free(m);
5537 return (ENOBUFS);
5538 }
5539 m->m_len = min(MCLBYTES, sopt_size);
5540 } else {
5541 m->m_len = min(MLEN, sopt_size);
5542 }
5543 sopt_size -= m->m_len;
5544 *mp = m;
5545 m_prev = m;
5546
5547 while (sopt_size > 0) {
5548 MGET(m, how, MT_DATA);
5549 if (m == NULL) {
5550 m_freem(*mp);
5551 return (ENOBUFS);
5552 }
5553 if (sopt_size > MLEN) {
5554 MCLGET(m, how);
5555 if ((m->m_flags & M_EXT) == 0) {
5556 m_freem(*mp);
5557 m_freem(m);
5558 return (ENOBUFS);
5559 }
5560 m->m_len = min(MCLBYTES, sopt_size);
5561 } else {
5562 m->m_len = min(MLEN, sopt_size);
5563 }
5564 sopt_size -= m->m_len;
5565 m_prev->m_next = m;
5566 m_prev = m;
5567 }
5568 return (0);
5569 }
5570
5571 /* copyin sopt data into mbuf chain */
5572 int
5573 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5574 {
5575 struct mbuf *m0 = m;
5576
5577 if (sopt->sopt_val == USER_ADDR_NULL)
5578 return (0);
5579 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5580 if (sopt->sopt_p != kernproc) {
5581 int error;
5582
5583 error = copyin(sopt->sopt_val, mtod(m, char *),
5584 m->m_len);
5585 if (error != 0) {
5586 m_freem(m0);
5587 return (error);
5588 }
5589 } else {
5590 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5591 mtod(m, char *), m->m_len);
5592 }
5593 sopt->sopt_valsize -= m->m_len;
5594 sopt->sopt_val += m->m_len;
5595 m = m->m_next;
5596 }
5597 /* should have been allocated large enough at ip6_sooptmcopyin() */
5598 if (m != NULL) {
5599 panic("soopt_mcopyin");
5600 /* NOTREACHED */
5601 }
5602 return (0);
5603 }
5604
5605 /* copyout mbuf chain data into soopt */
5606 int
5607 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5608 {
5609 struct mbuf *m0 = m;
5610 size_t valsize = 0;
5611
5612 if (sopt->sopt_val == USER_ADDR_NULL)
5613 return (0);
5614 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5615 if (sopt->sopt_p != kernproc) {
5616 int error;
5617
5618 error = copyout(mtod(m, char *), sopt->sopt_val,
5619 m->m_len);
5620 if (error != 0) {
5621 m_freem(m0);
5622 return (error);
5623 }
5624 } else {
5625 bcopy(mtod(m, char *),
5626 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5627 }
5628 sopt->sopt_valsize -= m->m_len;
5629 sopt->sopt_val += m->m_len;
5630 valsize += m->m_len;
5631 m = m->m_next;
5632 }
5633 if (m != NULL) {
5634 /* a large enough soopt buffer should be given from user-land */
5635 m_freem(m0);
5636 return (EINVAL);
5637 }
5638 sopt->sopt_valsize = valsize;
5639 return (0);
5640 }
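/*
 * Minimal kernel-side sketch (hypothetical consumer) of the helper sequence
 * above, as used by protocol pr_ctloutput() code: soopt_getm() sizes an mbuf
 * chain to sopt_valsize, soopt_mcopyin() fills it from the caller (and frees
 * the chain itself on failure), and the caller frees the chain when done.
 */
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socketvar.h>

static int
example_copyin_opt_chain(struct sockopt *sopt)
{
        struct mbuf *m = NULL;
        int error;

        error = soopt_getm(sopt, &m);
        if (error != 0)
                return (error);
        error = soopt_mcopyin(sopt, m);         /* frees the chain on error */
        if (error != 0)
                return (error);
        /* ... walk the chain and apply the options here ... */
        m_freem(m);
        return (0);
}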
5641
5642 void
5643 sohasoutofband(struct socket *so)
5644 {
5645 if (so->so_pgid < 0)
5646 gsignal(-so->so_pgid, SIGURG);
5647 else if (so->so_pgid > 0)
5648 proc_signal(so->so_pgid, SIGURG);
5649 selwakeup(&so->so_rcv.sb_sel);
5650 }
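/*
 * Illustrative user-space sketch: sohasoutofband() delivers SIGURG to the
 * socket's owner (so_pgid), so a process that wants out-of-band
 * notifications must first claim ownership with fcntl(F_SETOWN).  The urgent
 * byte itself can then be fetched with recv(fd, &c, 1, MSG_OOB).
 */
#include <fcntl.h>
#include <signal.h>
#include <string.h>
#include <unistd.h>

static volatile sig_atomic_t got_urgent;

static void
on_sigurg(int sig)
{
        (void)sig;
        got_urgent = 1;
}

static int
arm_oob_notification(int fd)
{
        struct sigaction sa;

        memset(&sa, 0, sizeof (sa));
        sa.sa_handler = on_sigurg;
        if (sigaction(SIGURG, &sa, NULL) == -1)
                return (-1);
        if (fcntl(fd, F_SETOWN, getpid()) == -1)        /* sets so_pgid */
                return (-1);
        return (0);
}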
5651
5652 int
5653 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5654 {
5655 #pragma unused(cred)
5656 struct proc *p = current_proc();
5657 int revents = 0;
5658
5659 socket_lock(so, 1);
5660 so_update_last_owner_locked(so, PROC_NULL);
5661 so_update_policy(so);
5662
5663 if (events & (POLLIN | POLLRDNORM))
5664 if (soreadable(so))
5665 revents |= events & (POLLIN | POLLRDNORM);
5666
5667 if (events & (POLLOUT | POLLWRNORM))
5668 if (sowriteable(so))
5669 revents |= events & (POLLOUT | POLLWRNORM);
5670
5671 if (events & (POLLPRI | POLLRDBAND))
5672 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5673 revents |= events & (POLLPRI | POLLRDBAND);
5674
5675 if (revents == 0) {
5676 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5677 /*
5678 * Darwin sets the flag first,
5679 * BSD calls selrecord first
5680 */
5681 so->so_rcv.sb_flags |= SB_SEL;
5682 selrecord(p, &so->so_rcv.sb_sel, wql);
5683 }
5684
5685 if (events & (POLLOUT | POLLWRNORM)) {
5686 /*
5687 * Darwin sets the flag first,
5688 * BSD calls selrecord first
5689 */
5690 so->so_snd.sb_flags |= SB_SEL;
5691 selrecord(p, &so->so_snd.sb_sel, wql);
5692 }
5693 }
5694
5695 socket_unlock(so, 1);
5696 return (revents);
5697 }
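/*
 * Illustrative user-space sketch: poll(2) reaches sopoll() above, where
 * readability maps to POLLIN/POLLRDNORM, writability to POLLOUT/POLLWRNORM,
 * and pending out-of-band data to POLLPRI/POLLRDBAND.
 */
#include <poll.h>

/* Wait up to timeout_ms for normal or urgent data on fd. */
static int
wait_readable(int fd, int timeout_ms)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI, .revents = 0 };
        int n;

        n = poll(&pfd, 1, timeout_ms);
        if (n <= 0)
                return (n);             /* 0 == timeout, -1 == error */
        return ((pfd.revents & (POLLIN | POLLPRI)) != 0);
}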
5698
5699 int
5700 soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5701 {
5702 #pragma unused(fp)
5703 #if !CONFIG_MACF_SOCKET
5704 #pragma unused(ctx)
5705 #endif /* MAC_SOCKET */
5706 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5707 struct klist *skl;
5708
5709 socket_lock(so, 1);
5710 so_update_last_owner_locked(so, PROC_NULL);
5711 so_update_policy(so);
5712
5713 #if CONFIG_MACF_SOCKET
5714 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5715 kn, so) != 0) {
5716 socket_unlock(so, 1);
5717 return (1);
5718 }
5719 #endif /* MAC_SOCKET */
5720
5721 switch (kn->kn_filter) {
5722 case EVFILT_READ:
5723 kn->kn_fop = &soread_filtops;
5724 /*
5725 * If the caller explicitly asked for OOB results (e.g. poll()),
5726 * save that off in the hookid field and reserve the kn_flags
5727 * EV_OOBAND bit for output only.
5728 */
5729 if (kn->kn_flags & EV_OOBAND) {
5730 kn->kn_flags &= ~EV_OOBAND;
5731 kn->kn_hookid = EV_OOBAND;
5732 } else {
5733 kn->kn_hookid = 0;
5734 }
5735 skl = &so->so_rcv.sb_sel.si_note;
5736 break;
5737 case EVFILT_WRITE:
5738 kn->kn_fop = &sowrite_filtops;
5739 skl = &so->so_snd.sb_sel.si_note;
5740 break;
5741 case EVFILT_SOCK:
5742 kn->kn_fop = &sock_filtops;
5743 skl = &so->so_klist;
5744 kn->kn_hookid = 0;
5745 kn->kn_status |= KN_TOUCH;
5746 break;
5747 default:
5748 socket_unlock(so, 1);
5749 return (1);
5750 }
5751
5752 if (KNOTE_ATTACH(skl, kn)) {
5753 switch (kn->kn_filter) {
5754 case EVFILT_READ:
5755 so->so_rcv.sb_flags |= SB_KNOTE;
5756 break;
5757 case EVFILT_WRITE:
5758 so->so_snd.sb_flags |= SB_KNOTE;
5759 break;
5760 case EVFILT_SOCK:
5761 so->so_flags |= SOF_KNOTE;
5762 break;
5763 default:
5764 socket_unlock(so, 1);
5765 return (1);
5766 }
5767 }
5768 socket_unlock(so, 1);
5769 return (0);
5770 }
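/*
 * Illustrative user-space sketch: EVFILT_READ knotes attach here, and
 * filt_soread() below honors NOTE_LOWAT, taking the kevent data field as a
 * low-water mark bounded by the receive buffer's high and low water marks.
 * `kq` is an existing kqueue descriptor and `fd` a socket.
 */
#include <sys/types.h>
#include <sys/event.h>
#include <sys/time.h>

static int
wait_for_lowat(int kq, int fd, int lowat)
{
        struct kevent kev;

        EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
        if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)   /* register */
                return (-1);
        if (kevent(kq, NULL, 0, &kev, 1, NULL) == -1)   /* wait for the event */
                return (-1);
        return ((int)kev.data);         /* bytes currently readable */
}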
5771
5772 static void
5773 filt_sordetach(struct knote *kn)
5774 {
5775 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5776
5777 socket_lock(so, 1);
5778 if (so->so_rcv.sb_flags & SB_KNOTE)
5779 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
5780 so->so_rcv.sb_flags &= ~SB_KNOTE;
5781 socket_unlock(so, 1);
5782 }
5783
5784 /*ARGSUSED*/
5785 static int
5786 filt_soread(struct knote *kn, long hint)
5787 {
5788 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5789
5790 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5791 socket_lock(so, 1);
5792
5793 if (so->so_options & SO_ACCEPTCONN) {
5794 int isempty;
5795
5796 /*
5797 * Radar 6615193 handle the listen case dynamically
5798 * for the kqueue read filter. This allows calling listen()
5799 * after registering the kqueue EVFILT_READ.
5800 */
5801
5802 kn->kn_data = so->so_qlen;
5803 isempty = ! TAILQ_EMPTY(&so->so_comp);
5804
5805 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5806 socket_unlock(so, 1);
5807
5808 return (isempty);
5809 }
5810
5811 /* socket isn't a listener */
5812 /*
5813 * NOTE_LOWAT specifies new low water mark in data, i.e.
5814 * the bytes of protocol data. We therefore exclude any
5815 * control bytes.
5816 */
5817 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5818
5819 /*
5820 * Clear out EV_OOBAND that filt_soread may have set in the
5821 * past.
5822 */
5823 kn->kn_flags &= ~EV_OOBAND;
5824 if ((so->so_oobmark) || (so->so_state & SS_RCVATMARK)) {
5825 kn->kn_flags |= EV_OOBAND;
5826 /*
5827 * If caller registered explicit interest in OOB data,
5828 * return immediately (data == amount beyond mark, for
5829 * legacy reasons - that should be changed later).
5830 */
5831 if (kn->kn_hookid == EV_OOBAND) {
5832 /*
5833 * When so_state is SS_RCVATMARK, so_oobmark
5834 * is 0.
5835 */
5836 kn->kn_data -= so->so_oobmark;
5837 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5838 socket_unlock(so, 1);
5839 return (1);
5840 }
5841 }
5842
5843 if ((so->so_state & SS_CANTRCVMORE)
5844 #if CONTENT_FILTER
5845 && cfil_sock_data_pending(&so->so_rcv) == 0
5846 #endif /* CONTENT_FILTER */
5847 ) {
5848 kn->kn_flags |= EV_EOF;
5849 kn->kn_fflags = so->so_error;
5850 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5851 socket_unlock(so, 1);
5852 return (1);
5853 }
5854
5855 if (so->so_error) { /* temporary udp error */
5856 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5857 socket_unlock(so, 1);
5858 return (1);
5859 }
5860
5861 int64_t lowwat = so->so_rcv.sb_lowat;
5862 /*
5863 * Ensure that when NOTE_LOWAT is used, the derived
5864 * low water mark is bounded by socket's rcv buf's
5865 * high and low water mark values.
5866 */
5867 if (kn->kn_sfflags & NOTE_LOWAT) {
5868 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
5869 lowwat = so->so_rcv.sb_hiwat;
5870 else if (kn->kn_sdata > lowwat)
5871 lowwat = kn->kn_sdata;
5872 }
5873
5874 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5875 socket_unlock(so, 1);
5876
5877 /*
5878 * The order below is important. Since NOTE_LOWAT
5879 * overrides sb_lowat, check for NOTE_LOWAT case
5880 * first.
5881 */
5882 if (kn->kn_sfflags & NOTE_LOWAT)
5883 return (kn->kn_data >= lowwat);
5884
5885 return (so->so_rcv.sb_cc >= lowwat);
5886 }
5887
5888 static void
5889 filt_sowdetach(struct knote *kn)
5890 {
5891 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5892 socket_lock(so, 1);
5893
5894 if (so->so_snd.sb_flags & SB_KNOTE)
5895 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
5896 so->so_snd.sb_flags &= ~SB_KNOTE;
5897 socket_unlock(so, 1);
5898 }
5899
5900 int
5901 so_wait_for_if_feedback(struct socket *so)
5902 {
5903 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
5904 (so->so_state & SS_ISCONNECTED)) {
5905 struct inpcb *inp = sotoinpcb(so);
5906 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
5907 return (1);
5908 }
5909 return (0);
5910 }
5911
5912 /*ARGSUSED*/
5913 static int
5914 filt_sowrite(struct knote *kn, long hint)
5915 {
5916 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5917 int ret = 0;
5918
5919 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5920 socket_lock(so, 1);
5921
5922 kn->kn_data = sbspace(&so->so_snd);
5923 if (so->so_state & SS_CANTSENDMORE) {
5924 kn->kn_flags |= EV_EOF;
5925 kn->kn_fflags = so->so_error;
5926 ret = 1;
5927 goto out;
5928 }
5929 if (so->so_error) { /* temporary udp error */
5930 ret = 1;
5931 goto out;
5932 }
5933 if (!socanwrite(so)) {
5934 ret = 0;
5935 goto out;
5936 }
5937 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
5938 ret = 1;
5939 goto out;
5940 }
5941 int64_t lowwat = so->so_snd.sb_lowat;
5942 if (kn->kn_sfflags & NOTE_LOWAT) {
5943 if (kn->kn_sdata > so->so_snd.sb_hiwat)
5944 lowwat = so->so_snd.sb_hiwat;
5945 else if (kn->kn_sdata > lowwat)
5946 lowwat = kn->kn_sdata;
5947 }
5948 if (kn->kn_data >= lowwat) {
5949 if (so->so_flags & SOF_NOTSENT_LOWAT) {
5950 if ((SOCK_DOM(so) == PF_INET
5951 || SOCK_DOM(so) == PF_INET6)
5952 && so->so_type == SOCK_STREAM) {
5953 ret = tcp_notsent_lowat_check(so);
5954 }
5955 #if MPTCP
5956 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
5957 (SOCK_PROTO(so) == IPPROTO_TCP)) {
5958 ret = mptcp_notsent_lowat_check(so);
5959 }
5960 #endif
5961 else {
5962 ret = 1;
5963 goto out;
5964 }
5965 } else {
5966 ret = 1;
5967 }
5968 }
5969 if (so_wait_for_if_feedback(so))
5970 ret = 0;
5971 out:
5972 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5973 socket_unlock(so, 1);
5974 return (ret);
5975 }
5976
5977 static void
5978 filt_sockdetach(struct knote *kn)
5979 {
5980 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5981 socket_lock(so, 1);
5982
5983 if ((so->so_flags & SOF_KNOTE) != 0)
5984 if (KNOTE_DETACH(&so->so_klist, kn))
5985 so->so_flags &= ~SOF_KNOTE;
5986 socket_unlock(so, 1);
5987 }
5988
5989 static int
5990 filt_sockev(struct knote *kn, long hint)
5991 {
5992 int ret = 0, locked = 0;
5993 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5994 long ev_hint = (hint & SO_FILT_HINT_EV);
5995 uint32_t level_trigger = 0;
5996
5997 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
5998 socket_lock(so, 1);
5999 locked = 1;
6000 }
6001
6002 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6003 kn->kn_fflags |= NOTE_CONNRESET;
6004 }
6005 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6006 kn->kn_fflags |= NOTE_TIMEOUT;
6007 }
6008 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6009 kn->kn_fflags |= NOTE_NOSRCADDR;
6010 }
6011 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6012 kn->kn_fflags |= NOTE_IFDENIED;
6013 }
6014 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6015 kn->kn_fflags |= NOTE_KEEPALIVE;
6016 }
6017 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6018 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6019 }
6020 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6021 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6022 }
6023 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6024 (so->so_state & SS_ISCONNECTED)) {
6025 kn->kn_fflags |= NOTE_CONNECTED;
6026 level_trigger |= NOTE_CONNECTED;
6027 }
6028 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6029 (so->so_state & SS_ISDISCONNECTED)) {
6030 kn->kn_fflags |= NOTE_DISCONNECTED;
6031 level_trigger |= NOTE_DISCONNECTED;
6032 }
6033 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6034 if (so->so_proto != NULL &&
6035 (so->so_proto->pr_flags & PR_EVCONNINFO))
6036 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6037 }
6038
6039 if ((so->so_state & SS_CANTRCVMORE)
6040 #if CONTENT_FILTER
6041 && cfil_sock_data_pending(&so->so_rcv) == 0
6042 #endif /* CONTENT_FILTER */
6043 ) {
6044 kn->kn_fflags |= NOTE_READCLOSED;
6045 level_trigger |= NOTE_READCLOSED;
6046 }
6047
6048 if (so->so_state & SS_CANTSENDMORE) {
6049 kn->kn_fflags |= NOTE_WRITECLOSED;
6050 level_trigger |= NOTE_WRITECLOSED;
6051 }
6052
6053 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6054 (so->so_flags & SOF_SUSPENDED)) {
6055 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6056
6057 /* If resume event was delivered before, reset it */
6058 kn->kn_hookid &= ~NOTE_RESUME;
6059
6060 kn->kn_fflags |= NOTE_SUSPEND;
6061 level_trigger |= NOTE_SUSPEND;
6062 }
6063
6064 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6065 (so->so_flags & SOF_SUSPENDED) == 0) {
6066 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6067
6068 /* If suspend event was delivered before, reset it */
6069 kn->kn_hookid &= ~NOTE_SUSPEND;
6070
6071 kn->kn_fflags |= NOTE_RESUME;
6072 level_trigger |= NOTE_RESUME;
6073 }
6074
6075 if (so->so_error != 0) {
6076 ret = 1;
6077 kn->kn_data = so->so_error;
6078 kn->kn_flags |= EV_EOF;
6079 } else {
6080 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6081 }
6082
6083 /* Reset any events that are not requested on this knote */
6084 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6085 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6086
6087 /* Find the level triggered events that are already delivered */
6088 level_trigger &= kn->kn_hookid;
6089 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6090
6091 /* Do not deliver level triggered events more than once */
6092 if ((kn->kn_fflags & ~level_trigger) != 0)
6093 ret = 1;
6094
6095 if (locked)
6096 socket_unlock(so, 1);
6097
6098 return (ret);
6099 }
6100
6101 static void
6102 filt_socktouch(struct knote *kn, struct kevent_internal_s *kev, long type)
6103 {
6104 #pragma unused(kev)
6105 switch (type) {
6106 case EVENT_REGISTER:
6107 {
6108 uint32_t changed_flags;
6109 changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6110
6111 /*
6112 * Since we keep track of events that are already
6113 * delivered, if any of those events are not requested
6114 * anymore the state related to them can be reset
6115 */
6116 kn->kn_hookid &=
6117 ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6118 break;
6119 }
6120 case EVENT_PROCESS:
6121 /*
6122 * Store the state of the events being delivered. This
6123 * state can be used to deliver level triggered events
6124 * at least once and still avoid waking up the application
6125 * multiple times as long as the event is active.
6126 */
6127 if (kn->kn_fflags != 0)
6128 kn->kn_hookid |= (kn->kn_fflags &
6129 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6130
6131 /*
6132 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6133 * only one of them and remember which one was
6134 * delivered last.
6135 */
6136 if (kn->kn_fflags & NOTE_SUSPEND)
6137 kn->kn_hookid &= ~NOTE_RESUME;
6138 if (kn->kn_fflags & NOTE_RESUME)
6139 kn->kn_hookid &= ~NOTE_SUSPEND;
6140 break;
6141 default:
6142 break;
6143 }
6144 }
6145
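/*
 * Fill in a SOCKEV_* state bitmap for the event data based on the socket's
 * connected/disconnected state; used by the EVFILT_SOCK filter above when
 * there is no pending error to report in kn_data.
 */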
6146 void
6147 get_sockev_state(struct socket *so, u_int32_t *statep)
6148 {
6149 u_int32_t state = *(statep);
6150
6151 if (so->so_state & SS_ISCONNECTED)
6152 state |= SOCKEV_CONNECTED;
6153 else
6154 state &= ~(SOCKEV_CONNECTED);
6155 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6156 *(statep) = state;
6157 }
6158
6159 #define SO_LOCK_HISTORY_STR_LEN \
6160 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6161
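/*
 * Format the most recent lock/unlock return addresses recorded for this
 * socket (SO_LCKDBG_MAX entries each) as "lock:unlock " pairs, most recent
 * first.  Returns a pointer to a static buffer, so this is intended for
 * one-shot diagnostics such as the refcount panic in socket_unlock().
 */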
6162 __private_extern__ const char *
6163 solockhistory_nr(struct socket *so)
6164 {
6165 size_t n = 0;
6166 int i;
6167 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6168
6169 bzero(lock_history_str, sizeof (lock_history_str));
6170 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6171 n += snprintf(lock_history_str + n,
6172 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6173 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6174 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6175 }
6176 return (lock_history_str);
6177 }
6178
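/*
 * socket_lock()/socket_unlock(): acquire or release the lock protecting
 * this socket.  Protocols that supply pr_lock/pr_unlock are called through;
 * otherwise the domain mutex is used.  A non-zero refcount argument
 * additionally takes or drops a use count on the socket, and dropping the
 * last use count frees it via sofreelastref().  The caller's return address
 * is recorded for the lock history used in diagnostics.
 */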
6179 int
6180 socket_lock(struct socket *so, int refcount)
6181 {
6182 int error = 0;
6183 void *lr_saved;
6184
6185 lr_saved = __builtin_return_address(0);
6186
6187 if (so->so_proto->pr_lock) {
6188 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6189 } else {
6190 #ifdef MORE_LOCKING_DEBUG
6191 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
6192 LCK_MTX_ASSERT_NOTOWNED);
6193 #endif
6194 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6195 if (refcount)
6196 so->so_usecount++;
6197 so->lock_lr[so->next_lock_lr] = lr_saved;
6198 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6199 }
6200
6201 return (error);
6202 }
6203
6204 int
6205 socket_unlock(struct socket *so, int refcount)
6206 {
6207 int error = 0;
6208 void *lr_saved;
6209 lck_mtx_t *mutex_held;
6210
6211 lr_saved = __builtin_return_address(0);
6212
6213 if (so->so_proto == NULL) {
6214 panic("%s: null so_proto so=%p\n", __func__, so);
6215 /* NOTREACHED */
6216 }
6217
6218 if (so->so_proto->pr_unlock) {
6219 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6220 } else {
6221 mutex_held = so->so_proto->pr_domain->dom_mtx;
6222 #ifdef MORE_LOCKING_DEBUG
6223 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6224 #endif
6225 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6226 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6227
6228 if (refcount) {
6229 if (so->so_usecount <= 0) {
6230 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6231 "lrh=%s", __func__, so->so_usecount, so,
6232 SOCK_DOM(so), so->so_type,
6233 SOCK_PROTO(so), solockhistory_nr(so));
6234 /* NOTREACHED */
6235 }
6236
6237 so->so_usecount--;
6238 if (so->so_usecount == 0)
6239 sofreelastref(so, 1);
6240 }
6241 lck_mtx_unlock(mutex_held);
6242 }
6243
6244 return (error);
6245 }
6246
6247 /* Called with socket locked, will unlock socket */
6248 void
6249 sofree(struct socket *so)
6250 {
6251 lck_mtx_t *mutex_held;
6252
6253 if (so->so_proto->pr_getlock != NULL)
6254 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6255 else
6256 mutex_held = so->so_proto->pr_domain->dom_mtx;
6257 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6258
6259 sofreelastref(so, 0);
6260 }
6261
6262 void
6263 soreference(struct socket *so)
6264 {
6265 socket_lock(so, 1); /* lock and take one reference on the socket */
6266 socket_unlock(so, 0); /* unlock only */
6267 }
6268
6269 void
6270 sodereference(struct socket *so)
6271 {
6272 socket_lock(so, 0);
6273 socket_unlock(so, 1);
6274 }
6275
6276 /*
6277 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6278 * possibility of using jumbo clusters. The caller must hold
6279 * the socket lock.
6280 */
6281 void
6282 somultipages(struct socket *so, boolean_t set)
6283 {
6284 if (set)
6285 so->so_flags |= SOF_MULTIPAGES;
6286 else
6287 so->so_flags &= ~SOF_MULTIPAGES;
6288 }
6289
6290 void
6291 soif2kcl(struct socket *so, boolean_t set)
6292 {
6293 if (set)
6294 so->so_flags1 |= SOF1_IF_2KCL;
6295 else
6296 so->so_flags1 &= ~SOF1_IF_2KCL;
6297 }
6298
6299 int
6300 so_isdstlocal(struct socket *so)
6301 {
6302 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6303
6304 if (SOCK_DOM(so) == PF_INET)
6305 return (inaddr_local(inp->inp_faddr));
6306 else if (SOCK_DOM(so) == PF_INET6)
6307 return (in6addr_local(&inp->in6p_faddr));
6308
6309 return (0);
6310 }
6311
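/*
 * First phase of defuncting a socket: mark it SOF_DEFUNCT, set SB_DROP on
 * both socket buffers so no further data is appended, and flush any data
 * already queued.  Sockets marked SOF_NODEFUNCT are skipped unless forced,
 * and sockets that asked for extended background idle time may be given a
 * deferral (SOF1_EXTEND_BK_IDLE_INPROG) instead, in which case EOPNOTSUPP
 * is returned and the inpcb lazy timer is scheduled to revisit them.
 */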
6312 int
6313 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
6314 {
6315 struct sockbuf *rcv, *snd;
6316 int err = 0, defunct;
6317
6318 rcv = &so->so_rcv;
6319 snd = &so->so_snd;
6320
6321 defunct = (so->so_flags & SOF_DEFUNCT);
6322 if (defunct) {
6323 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6324 panic("%s: SB_DROP not set", __func__);
6325 /* NOTREACHED */
6326 }
6327 goto done;
6328 }
6329
6330 if (so->so_flags & SOF_NODEFUNCT) {
6331 if (noforce) {
6332 err = EOPNOTSUPP;
6333 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
6334 "so 0x%llx [%d,%d] is not eligible for defunct "
6335 "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
6336 level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6337 SOCK_DOM(so), SOCK_TYPE(so), err));
6338 return (err);
6339 }
6340 so->so_flags &= ~SOF_NODEFUNCT;
6341 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
6342 "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
6343 proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6344 SOCK_DOM(so), SOCK_TYPE(so)));
6345 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6346 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6347 struct ifnet *ifp = inp->inp_last_outifp;
6348
6349 if (ifp && IFNET_IS_CELLULAR(ifp)) {
6350 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6351 } else if (so->so_flags & SOF_DELEGATED) {
6352 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6353 } else if (soextbkidlestat.so_xbkidle_time == 0) {
6354 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6355 } else if (noforce) {
6356 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6357
6358 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
6359 so->so_extended_bk_start = net_uptime();
6360 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6361
6362 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6363
6364 err = EOPNOTSUPP;
6365 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
6366 "extend bk idle "
6367 "so 0x%llx rcv hw %d cc %d\n",
6368 __func__, proc_selfpid(), proc_pid(p),
6369 level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6370 so->so_rcv.sb_hiwat, so->so_rcv.sb_cc));
6371 return (err);
6372 } else {
6373 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6374 }
6375 }
6376
6377 so->so_flags |= SOF_DEFUNCT;
6378
6379 /* Prevent further data from being appended to the socket buffers */
6380 snd->sb_flags |= SB_DROP;
6381 rcv->sb_flags |= SB_DROP;
6382
6383 /* Flush any existing data in the socket buffers */
6384 if (rcv->sb_cc != 0) {
6385 rcv->sb_flags &= ~SB_SEL;
6386 selthreadclear(&rcv->sb_sel);
6387 sbrelease(rcv);
6388 }
6389 if (snd->sb_cc != 0) {
6390 snd->sb_flags &= ~SB_SEL;
6391 selthreadclear(&snd->sb_sel);
6392 sbrelease(snd);
6393 }
6394
6395 done:
6396 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
6397 "defunct%s\n", __func__, proc_selfpid(), proc_pid(p), level,
6398 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
6399 defunct ? "is already" : "marked as",
6400 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : ""));
6401
6402 return (err);
6403 }
6404
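/*
 * Second phase of defuncting: expects SOF_DEFUNCT to already be set by
 * sosetdefunct().  Wakes up any threads blocked on the socket buffers,
 * shuts down both directions, disconnects, forces so_error to EBADF if no
 * error is pending, releases any remaining buffered data and finally marks
 * the socket SS_DEFUNCT so the work is not repeated.
 */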
6405 int
6406 sodefunct(struct proc *p, struct socket *so, int level)
6407 {
6408 struct sockbuf *rcv, *snd;
6409
6410 if (!(so->so_flags & SOF_DEFUNCT)) {
6411 panic("%s improperly called", __func__);
6412 /* NOTREACHED */
6413 }
6414 if (so->so_state & SS_DEFUNCT)
6415 goto done;
6416
6417 rcv = &so->so_rcv;
6418 snd = &so->so_snd;
6419
6420 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6421 char s[MAX_IPv6_STR_LEN];
6422 char d[MAX_IPv6_STR_LEN];
6423 struct inpcb *inp = sotoinpcb(so);
6424
6425 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
6426 "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
6427 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
6428 proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6429 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6430 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
6431 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
6432 s, sizeof (s)), ntohs(inp->in6p_lport),
6433 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
6434 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
6435 d, sizeof (d)), ntohs(inp->in6p_fport),
6436 (uint32_t)rcv->sb_sel.si_flags,
6437 (uint32_t)snd->sb_sel.si_flags,
6438 rcv->sb_flags, snd->sb_flags));
6439 } else {
6440 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
6441 "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
6442 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
6443 proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6444 SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
6445 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6446 snd->sb_flags));
6447 }
6448
6449 /*
6450 * Unwedge threads blocked on sbwait() and sb_lock().
6451 */
6452 sbwakeup(rcv);
6453 sbwakeup(snd);
6454
6455 so->so_flags1 |= SOF1_DEFUNCTINPROG;
6456 if (rcv->sb_flags & SB_LOCK)
6457 sbunlock(rcv, TRUE); /* keep socket locked */
6458 if (snd->sb_flags & SB_LOCK)
6459 sbunlock(snd, TRUE); /* keep socket locked */
6460
6461 /*
6462 * Flush the buffers and disconnect. We explicitly call shutdown
6463 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6464 * states are set for the socket. This would also flush out data
6465 * hanging off the receive list of this socket.
6466 */
6467 (void) soshutdownlock_final(so, SHUT_RD);
6468 (void) soshutdownlock_final(so, SHUT_WR);
6469 (void) sodisconnectlocked(so);
6470
6471 /*
6472 * Explicitly handle connectionless-protocol disconnection
6473 * and release any remaining data in the socket buffers.
6474 */
6475 if (!(so->so_state & SS_ISDISCONNECTED))
6476 (void) soisdisconnected(so);
6477
6478 if (so->so_error == 0)
6479 so->so_error = EBADF;
6480
6481 if (rcv->sb_cc != 0) {
6482 rcv->sb_flags &= ~SB_SEL;
6483 selthreadclear(&rcv->sb_sel);
6484 sbrelease(rcv);
6485 }
6486 if (snd->sb_cc != 0) {
6487 snd->sb_flags &= ~SB_SEL;
6488 selthreadclear(&snd->sb_sel);
6489 sbrelease(snd);
6490 }
6491 so->so_state |= SS_DEFUNCT;
6492
6493 done:
6494 return (0);
6495 }
6496
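/*
 * Take a socket out of the extended background idle deferral
 * (SOF1_EXTEND_BK_IDLE_INPROG), clearing the per-process P_LXBKIDLEINPROG
 * hint and updating the extended-background-idle statistics.
 */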
6497 int
6498 soresume(struct proc *p, struct socket *so, int locked)
6499 {
6500 if (locked == 0)
6501 socket_lock(so, 1);
6502
6503 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
6504 SODEFUNCTLOG(("%s[%d]: (target pid %d) so 0x%llx [%d,%d] "
6505 "resumed from bk idle\n",
6506 __func__, proc_selfpid(), proc_pid(p),
6507 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6508 SOCK_DOM(so), SOCK_TYPE(so)));
6509
6510 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6511 so->so_extended_bk_start = 0;
6512 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6513
6514 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
6515 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6516 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6517 }
6518 if (locked == 0)
6519 socket_unlock(so, 1);
6520
6521 return (0);
6522 }
6523
6524 /*
6525 * Does not attempt to account for sockets that are delegated from
6526 * the current process
6527 */
6528 int
6529 so_set_extended_bk_idle(struct socket *so, int optval)
6530 {
6531 int error = 0;
6532
6533 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
6534 SOCK_PROTO(so) != IPPROTO_TCP) {
6535 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
6536 error = EOPNOTSUPP;
6537 } else if (optval == 0) {
6538 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
6539
6540 soresume(current_proc(), so, 1);
6541 } else {
6542 struct proc *p = current_proc();
6543 int i;
6544 struct filedesc *fdp;
6545 int count = 0;
6546
6547 proc_fdlock(p);
6548
6549 fdp = p->p_fd;
6550 for (i = 0; i < fdp->fd_nfiles; i++) {
6551 struct fileproc *fp = fdp->fd_ofiles[i];
6552 struct socket *so2;
6553
6554 if (fp == NULL ||
6555 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
6556 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
6557 continue;
6558
6559 so2 = (struct socket *)fp->f_fglob->fg_data;
6560 if (so != so2 &&
6561 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
6562 count++;
6563 if (count >= soextbkidlestat.so_xbkidle_maxperproc)
6564 break;
6565 }
6566 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
6567 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
6568 error = EBUSY;
6569 } else if (so->so_flags & SOF_DELEGATED) {
6570 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6571 error = EBUSY;
6572 } else {
6573 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
6574 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
6575 }
6576 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] "
6577 "%s marked for extended bk idle\n",
6578 __func__, proc_selfpid(),
6579 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6580 SOCK_DOM(so), SOCK_TYPE(so),
6581 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
6582 "is" : "not"));
6583
6584 proc_fdunlock(p);
6585 }
6586
6587 return (error);
6588 }
6589
6590 static void
6591 so_stop_extended_bk_idle(struct socket *so)
6592 {
6593 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6594 so->so_extended_bk_start = 0;
6595
6596 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6597 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6598 /*
6599 * Force defunct
6600 */
6601 sosetdefunct(current_proc(), so,
6602 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
6603 if (so->so_flags & SOF_DEFUNCT) {
6604 sodefunct(current_proc(), so,
6605 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
6606 }
6607 }
6608
6609 void
6610 so_drain_extended_bk_idle(struct socket *so)
6611 {
6612 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6613 /*
6614 * Only penalize sockets that have outstanding data
6615 */
6616 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
6617 so_stop_extended_bk_idle(so);
6618
6619 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
6620 }
6621 }
6622 }
6623
6624 /*
6625 * Return value tells whether the socket is still in extended background idle
6626 */
6627 int
6628 so_check_extended_bk_idle_time(struct socket *so)
6629 {
6630 int ret = 1;
6631
6632 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6633 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d]\n",
6634 __func__, proc_selfpid(),
6635 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6636 SOCK_DOM(so), SOCK_TYPE(so)));
6637 if (net_uptime() - so->so_extended_bk_start >
6638 soextbkidlestat.so_xbkidle_time) {
6639 so_stop_extended_bk_idle(so);
6640
6641 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
6642
6643 ret = 0;
6644 } else {
6645 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6646
6647 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6648 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
6649 }
6650 }
6651
6652 return (ret);
6653 }
6654
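/*
 * Walk the process's file table and call soresume() on every socket, then
 * clear the P_LXBKIDLEINPROG hint; only does work if the process had
 * sockets in extended background idle.
 */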
6655 void
6656 resume_proc_sockets(proc_t p)
6657 {
6658 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
6659 struct filedesc *fdp;
6660 int i;
6661
6662 proc_fdlock(p);
6663 fdp = p->p_fd;
6664 for (i = 0; i < fdp->fd_nfiles; i++) {
6665 struct fileproc *fp;
6666 struct socket *so;
6667
6668 fp = fdp->fd_ofiles[i];
6669 if (fp == NULL ||
6670 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
6671 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
6672 continue;
6673
6674 so = (struct socket *)fp->f_fglob->fg_data;
6675 (void) soresume(p, so, 0);
6676 }
6677 proc_fdunlock(p);
6678
6679 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6680 }
6681 }
6682
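/*
 * so_set_recv_anyif()/so_get_recv_anyif(): set or query the INP_RECV_ANYIF
 * flag on the inpcb of a PF_INET/PF_INET6 socket, which relaxes the
 * interface binding check for incoming packets.  These are presumably
 * reached through the private SO_RECV_ANYIF socket option handled by
 * sosetoptlock()/sogetoptlock() earlier in this file.
 */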
6683 __private_extern__ int
6684 so_set_recv_anyif(struct socket *so, int optval)
6685 {
6686 int ret = 0;
6687
6688 #if INET6
6689 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6690 #else
6691 if (SOCK_DOM(so) == PF_INET) {
6692 #endif /* !INET6 */
6693 if (optval)
6694 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
6695 else
6696 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
6697 }
6698
6699 return (ret);
6700 }
6701
6702 __private_extern__ int
6703 so_get_recv_anyif(struct socket *so)
6704 {
6705 int ret = 0;
6706
6707 #if INET6
6708 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6709 #else
6710 if (SOCK_DOM(so) == PF_INET) {
6711 #endif /* !INET6 */
6712 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
6713 }
6714
6715 return (ret);
6716 }
6717
6718 int
6719 so_set_restrictions(struct socket *so, uint32_t vals)
6720 {
6721 int nocell_old, nocell_new;
6722 int noexpensive_old, noexpensive_new;
6723
6724 /*
6725 * Deny-type restrictions are trapdoors; once set they cannot be
6726 * unset for the lifetime of the socket. This allows them to be
6727 * issued by a framework on behalf of the application without
6728 * having to worry that they can be undone.
6729 *
6730 * Note here that socket-level restrictions override any protocol
6731 * level restrictions. For instance, the SO_RESTRICT_DENY_CELLULAR
6732 * restriction issued on the socket has a higher precedence
6733 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
6734 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
6735 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
6736 */
6737 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6738 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6739 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
6740 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
6741 SO_RESTRICT_DENY_EXPENSIVE));
6742 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6743 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6744
6745 /* we can only set, not clear restrictions */
6746 if ((nocell_new - nocell_old) == 0 &&
6747 (noexpensive_new - noexpensive_old) == 0)
6748 return (0);
6749 #if INET6
6750 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6751 #else
6752 if (SOCK_DOM(so) == PF_INET) {
6753 #endif /* !INET6 */
6754 if (nocell_new - nocell_old != 0) {
6755 /*
6756 * if deny cellular is now set, do what's needed
6757 * for INPCB
6758 */
6759 inp_set_nocellular(sotoinpcb(so));
6760 }
6761 if (noexpensive_new - noexpensive_old != 0) {
6762 inp_set_noexpensive(sotoinpcb(so));
6763 }
6764 }
6765
6766 return (0);
6767 }
6768
6769 uint32_t
6770 so_get_restrictions(struct socket *so)
6771 {
6772 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
6773 SO_RESTRICT_DENY_OUT |
6774 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
6775 }
6776
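/*
 * sockaddr_entry / sockaddr_list helpers: zone-backed containers for lists
 * of sockaddrs, each entry owning an M_SONAME copy of its address.  They
 * appear to be used for multi-address connection setup (e.g. the connectx
 * and multipath paths), which needs to duplicate and hand off caller
 * supplied address lists.
 */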
6777 struct sockaddr_entry *
6778 sockaddrentry_alloc(int how)
6779 {
6780 struct sockaddr_entry *se;
6781
6782 se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
6783 if (se != NULL)
6784 bzero(se, se_zone_size);
6785
6786 return (se);
6787 }
6788
6789 void
6790 sockaddrentry_free(struct sockaddr_entry *se)
6791 {
6792 if (se->se_addr != NULL) {
6793 FREE(se->se_addr, M_SONAME);
6794 se->se_addr = NULL;
6795 }
6796 zfree(se_zone, se);
6797 }
6798
6799 struct sockaddr_entry *
6800 sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
6801 {
6802 struct sockaddr_entry *dst_se;
6803
6804 dst_se = sockaddrentry_alloc(how);
6805 if (dst_se != NULL) {
6806 int len = src_se->se_addr->sa_len;
6807
6808 MALLOC(dst_se->se_addr, struct sockaddr *,
6809 len, M_SONAME, how | M_ZERO);
6810 if (dst_se->se_addr != NULL) {
6811 bcopy(src_se->se_addr, dst_se->se_addr, len);
6812 } else {
6813 sockaddrentry_free(dst_se);
6814 dst_se = NULL;
6815 }
6816 }
6817
6818 return (dst_se);
6819 }
6820
6821 struct sockaddr_list *
6822 sockaddrlist_alloc(int how)
6823 {
6824 struct sockaddr_list *sl;
6825
6826 sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
6827 if (sl != NULL) {
6828 bzero(sl, sl_zone_size);
6829 TAILQ_INIT(&sl->sl_head);
6830 }
6831 return (sl);
6832 }
6833
6834 void
6835 sockaddrlist_free(struct sockaddr_list *sl)
6836 {
6837 struct sockaddr_entry *se, *tse;
6838
6839 TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
6840 sockaddrlist_remove(sl, se);
6841 sockaddrentry_free(se);
6842 }
6843 VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
6844 zfree(sl_zone, sl);
6845 }
6846
6847 void
6848 sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
6849 {
6850 VERIFY(!(se->se_flags & SEF_ATTACHED));
6851 se->se_flags |= SEF_ATTACHED;
6852 TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
6853 sl->sl_cnt++;
6854 VERIFY(sl->sl_cnt != 0);
6855 }
6856
6857 void
6858 sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
6859 {
6860 VERIFY(se->se_flags & SEF_ATTACHED);
6861 se->se_flags &= ~SEF_ATTACHED;
6862 VERIFY(sl->sl_cnt != 0);
6863 sl->sl_cnt--;
6864 TAILQ_REMOVE(&sl->sl_head, se, se_link);
6865 }
6866
6867 struct sockaddr_list *
6868 sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
6869 {
6870 struct sockaddr_entry *src_se, *tse;
6871 struct sockaddr_list *dst_sl;
6872
6873 dst_sl = sockaddrlist_alloc(how);
6874 if (dst_sl == NULL)
6875 return (NULL);
6876
6877 TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
6878 struct sockaddr_entry *dst_se;
6879
6880 if (src_se->se_addr == NULL)
6881 continue;
6882
6883 dst_se = sockaddrentry_dup(src_se, how);
6884 if (dst_se == NULL) {
6885 sockaddrlist_free(dst_sl);
6886 return (NULL);
6887 }
6888
6889 sockaddrlist_insert(dst_sl, dst_se);
6890 }
6891 VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
6892
6893 return (dst_sl);
6894 }
6895
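/*
 * so_set_effective_pid()/so_set_effective_uuid(): record the process on
 * whose behalf this socket does its network activity (the "delegate"),
 * identified either by pid or by executable UUID.  Only the socket's real
 * owner, the target itself, or a caller holding
 * PRIV_NET_PRIVILEGED_SOCKET_DELEGATE may change the association, and
 * delegating a socket to its own process clears SOF_DELEGATED.  On success
 * the socket's NECP/network policy state is re-evaluated.  These are
 * presumably reached through private "delegated" socket options handled
 * earlier in this file.
 */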
6896 int
6897 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
6898 {
6899 struct proc *ep = PROC_NULL;
6900 int error = 0;
6901
6902 /* pid 0 is reserved for kernel */
6903 if (epid == 0) {
6904 error = EINVAL;
6905 goto done;
6906 }
6907
6908 /*
6909 * If this is an in-kernel socket, prevent its delegate
6910 * association from changing unless the socket option is
6911 * coming from within the kernel itself.
6912 */
6913 if (so->last_pid == 0 && p != kernproc) {
6914 error = EACCES;
6915 goto done;
6916 }
6917
6918 /*
6919 * If this is issued by a process that's recorded as the
6920 * real owner of the socket, or if the pid is the same as
6921 * the process's own pid, then proceed. Otherwise ensure
6922 * that the issuing process has the necessary privileges.
6923 */
6924 if (epid != so->last_pid || epid != proc_pid(p)) {
6925 if ((error = priv_check_cred(kauth_cred_get(),
6926 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
6927 error = EACCES;
6928 goto done;
6929 }
6930 }
6931
6932 /* Find the process that corresponds to the effective pid */
6933 if ((ep = proc_find(epid)) == PROC_NULL) {
6934 error = ESRCH;
6935 goto done;
6936 }
6937
6938 /*
6939 * If a process tries to delegate the socket to itself, then
6940 * there's really nothing to do; treat it as a way for the
6941 * delegate association to be cleared. Note that we check
6942 * the passed-in proc rather than calling proc_selfpid(),
6943 * as we need to check the process issuing the socket option
6944 * which could be kernproc. Given that we don't allow 0 for
6945 * effective pid, it means that a delegated in-kernel socket
6946 * stays delegated during its lifetime (which is probably OK.)
6947 */
6948 if (epid == proc_pid(p)) {
6949 so->so_flags &= ~SOF_DELEGATED;
6950 so->e_upid = 0;
6951 so->e_pid = 0;
6952 uuid_clear(so->e_uuid);
6953 } else {
6954 so->so_flags |= SOF_DELEGATED;
6955 so->e_upid = proc_uniqueid(ep);
6956 so->e_pid = proc_pid(ep);
6957 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
6958 }
6959 done:
6960 if (error == 0 && net_io_policy_log) {
6961 uuid_string_t buf;
6962
6963 uuid_unparse(so->e_uuid, buf);
6964 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6965 "euuid %s%s\n", __func__, proc_name_address(p),
6966 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6967 SOCK_DOM(so), SOCK_TYPE(so),
6968 so->e_pid, proc_name_address(ep), buf,
6969 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
6970 } else if (error != 0 && net_io_policy_log) {
6971 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6972 "ERROR (%d)\n", __func__, proc_name_address(p),
6973 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6974 SOCK_DOM(so), SOCK_TYPE(so),
6975 epid, (ep == PROC_NULL) ? "PROC_NULL" :
6976 proc_name_address(ep), error);
6977 }
6978
6979 /* Update this socket's policy upon success */
6980 if (error == 0) {
6981 so->so_policy_gencnt *= -1;
6982 so_update_policy(so);
6983 #if NECP
6984 so_update_necp_policy(so, NULL, NULL);
6985 #endif /* NECP */
6986 }
6987
6988 if (ep != PROC_NULL)
6989 proc_rele(ep);
6990
6991 return (error);
6992 }
6993
6994 int
6995 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
6996 {
6997 uuid_string_t buf;
6998 uuid_t uuid;
6999 int error = 0;
7000
7001 /* UUID must not be all-zeroes (reserved for kernel) */
7002 if (uuid_is_null(euuid)) {
7003 error = EINVAL;
7004 goto done;
7005 }
7006
7007 /*
7008 * If this is an in-kernel socket, prevent its delegate
7009 * association from changing unless the socket option is
7010 * coming from within the kernel itself.
7011 */
7012 if (so->last_pid == 0 && p != kernproc) {
7013 error = EACCES;
7014 goto done;
7015 }
7016
7017 /* Get the UUID of the issuing process */
7018 proc_getexecutableuuid(p, uuid, sizeof (uuid));
7019
7020 /*
7021 * If this is issued by a process that's recorded as the
7022 * real owner of the socket, or if the uuid is the same as
7023 * the process's own uuid, then proceed. Otherwise ensure
7024 * that the issuing process has the necessary privileges.
7025 */
7026 if (uuid_compare(euuid, so->last_uuid) != 0 ||
7027 uuid_compare(euuid, uuid) != 0) {
7028 if ((error = priv_check_cred(kauth_cred_get(),
7029 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7030 error = EACCES;
7031 goto done;
7032 }
7033 }
7034
7035 /*
7036 * If a process tries to delegate the socket to itself, then
7037 * there's really nothing to do; treat it as a way for the
7038 * delegate association to be cleared. Note that we check
7039 * the uuid of the passed-in proc rather than that of the
7040 * current process, as we need to check the process issuing
7041 * the socket option which could be kernproc itself. Given
7042 * that we don't allow 0 for effective uuid, it means that
7043 * a delegated in-kernel socket stays delegated during its
7044 * lifetime (which is okay.)
7045 */
7046 if (uuid_compare(euuid, uuid) == 0) {
7047 so->so_flags &= ~SOF_DELEGATED;
7048 so->e_upid = 0;
7049 so->e_pid = 0;
7050 uuid_clear(so->e_uuid);
7051 } else {
7052 so->so_flags |= SOF_DELEGATED;
7053 /*
7054 * Unlike so_set_effective_pid(), we only have the UUID
7055 * here and the process ID is not known. Inherit the
7056 * real {pid,upid} of the socket.
7057 */
7058 so->e_upid = so->last_upid;
7059 so->e_pid = so->last_pid;
7060 uuid_copy(so->e_uuid, euuid);
7061 }
7062
7063 done:
7064 if (error == 0 && net_io_policy_log) {
7065 uuid_unparse(so->e_uuid, buf);
7066 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7067 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7068 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7069 SOCK_TYPE(so), so->e_pid, buf,
7070 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7071 } else if (error != 0 && net_io_policy_log) {
7072 uuid_unparse(euuid, buf);
7073 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7074 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7075 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7076 SOCK_TYPE(so), buf, error);
7077 }
7078
7079 /* Update this socket's policy upon success */
7080 if (error == 0) {
7081 so->so_policy_gencnt *= -1;
7082 so_update_policy(so);
7083 #if NECP
7084 so_update_necp_policy(so, NULL, NULL);
7085 #endif /* NECP */
7086 }
7087
7088 return (error);
7089 }
7090
7091 void
7092 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7093 uint32_t ev_datalen)
7094 {
7095 struct kev_msg ev_msg;
7096
7097 /*
7098 * A netpolicy event always starts with a netpolicy_event_data
7099 * structure, but the caller can provide for a longer event
7100 * structure to post, depending on the event code.
7101 */
7102 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7103
7104 bzero(&ev_msg, sizeof (ev_msg));
7105 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7106 ev_msg.kev_class = KEV_NETWORK_CLASS;
7107 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7108 ev_msg.event_code = ev_code;
7109
7110 ev_msg.dv[0].data_ptr = ev_data;
7111 ev_msg.dv[0].data_length = ev_datalen;
7112
7113 kev_post_msg(&ev_msg);
7114 }
7115
7116 void
7117 socket_post_kev_msg(uint32_t ev_code,
7118 struct kev_socket_event_data *ev_data,
7119 uint32_t ev_datalen)
7120 {
7121 struct kev_msg ev_msg;
7122
7123 bzero(&ev_msg, sizeof(ev_msg));
7124 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7125 ev_msg.kev_class = KEV_NETWORK_CLASS;
7126 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7127 ev_msg.event_code = ev_code;
7128
7129 ev_msg.dv[0].data_ptr = ev_data;
7130 ev_msg.dv[0].data_length = ev_datalen;
7131
7132 kev_post_msg(&ev_msg);
7133 }
7134
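/*
 * Post a KEV_SOCKET_CLOSED kernel event carrying the socket's local and
 * peer addresses (truncated to the space available in the event payload).
 * The event is silently dropped if either address cannot be obtained from
 * the protocol.
 */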
7135 void
7136 socket_post_kev_msg_closed(struct socket *so)
7137 {
7138 struct kev_socket_closed ev;
7139 struct sockaddr *socksa = NULL, *peersa = NULL;
7140 int err;
7141 bzero(&ev, sizeof(ev));
7142 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7143 if (err == 0) {
7144 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7145 &peersa);
7146 if (err == 0) {
7147 memcpy(&ev.ev_data.kev_sockname, socksa,
7148 min(socksa->sa_len,
7149 sizeof (ev.ev_data.kev_sockname)));
7150 memcpy(&ev.ev_data.kev_peername, peersa,
7151 min(peersa->sa_len,
7152 sizeof (ev.ev_data.kev_peername)));
7153 socket_post_kev_msg(KEV_SOCKET_CLOSED,
7154 &ev.ev_data, sizeof (ev));
7155 }
7156 }
7157 if (socksa != NULL)
7158 FREE(socksa, M_SONAME);
7159 if (peersa != NULL)
7160 FREE(peersa, M_SONAME);
7161 }