1 /*
2 * Copyright (c) 1998-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29 /*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63 /*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70 #include <sys/param.h>
71 #include <sys/systm.h>
72 #include <sys/filedesc.h>
73 #include <sys/proc.h>
74 #include <sys/proc_internal.h>
75 #include <sys/kauth.h>
76 #include <sys/file_internal.h>
77 #include <sys/fcntl.h>
78 #include <sys/malloc.h>
79 #include <sys/mbuf.h>
80 #include <sys/domain.h>
81 #include <sys/kernel.h>
82 #include <sys/event.h>
83 #include <sys/poll.h>
84 #include <sys/protosw.h>
85 #include <sys/socket.h>
86 #include <sys/socketvar.h>
87 #include <sys/resourcevar.h>
88 #include <sys/signalvar.h>
89 #include <sys/sysctl.h>
90 #include <sys/syslog.h>
91 #include <sys/uio.h>
92 #include <sys/uio_internal.h>
93 #include <sys/ev.h>
94 #include <sys/kdebug.h>
95 #include <sys/un.h>
96 #include <sys/user.h>
97 #include <sys/priv.h>
98 #include <sys/kern_event.h>
99 #include <net/route.h>
100 #include <net/init.h>
101 #include <net/ntstat.h>
102 #include <net/content_filter.h>
103 #include <netinet/in.h>
104 #include <netinet/in_pcb.h>
105 #include <netinet/ip6.h>
106 #include <netinet6/ip6_var.h>
107 #include <netinet/flow_divert.h>
108 #include <kern/zalloc.h>
109 #include <kern/locks.h>
110 #include <machine/limits.h>
111 #include <libkern/OSAtomic.h>
112 #include <pexpert/pexpert.h>
113 #include <kern/assert.h>
114 #include <kern/task.h>
115 #include <sys/kpi_mbuf.h>
116 #include <sys/mcache.h>
117 #include <sys/unpcb.h>
118
119 #if CONFIG_MACF
120 #include <security/mac.h>
121 #include <security/mac_framework.h>
122 #endif /* MAC */
123
124 #if MULTIPATH
125 #include <netinet/mp_pcb.h>
126 #include <netinet/mptcp_var.h>
127 #endif /* MULTIPATH */
128
129 #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
130
131 #if DEBUG || DEVELOPMENT
132 #define DEBUG_KERNEL_ADDRPERM(_v) (_v)
133 #else
134 #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
135 #endif
136
137 /* TODO: this should be in a header file somewhere */
138 extern char *proc_name_address(void *p);
139
140 static u_int32_t so_cache_hw; /* High water mark for socache */
141 static u_int32_t so_cache_timeouts; /* number of timeouts */
142 static u_int32_t so_cache_max_freed; /* max freed per timeout */
143 static u_int32_t cached_sock_count = 0;
144 STAILQ_HEAD(, socket) so_cache_head;
145 int max_cached_sock_count = MAX_CACHED_SOCKETS;
146 static u_int32_t so_cache_time;
147 static int socketinit_done;
148 static struct zone *so_cache_zone;
149
150 static lck_grp_t *so_cache_mtx_grp;
151 static lck_attr_t *so_cache_mtx_attr;
152 static lck_grp_attr_t *so_cache_mtx_grp_attr;
153 static lck_mtx_t *so_cache_mtx;
154
155 #include <machine/limits.h>
156
157 static void filt_sordetach(struct knote *kn);
158 static int filt_soread(struct knote *kn, long hint);
159 static void filt_sowdetach(struct knote *kn);
160 static int filt_sowrite(struct knote *kn, long hint);
161 static void filt_sockdetach(struct knote *kn);
162 static int filt_sockev(struct knote *kn, long hint);
163 static void filt_socktouch(struct knote *kn, struct kevent_internal_s *kev,
164 long type);
165
166 static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
167 static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
168
169 static struct filterops soread_filtops = {
170 .f_isfd = 1,
171 .f_detach = filt_sordetach,
172 .f_event = filt_soread,
173 };
174
175 static struct filterops sowrite_filtops = {
176 .f_isfd = 1,
177 .f_detach = filt_sowdetach,
178 .f_event = filt_sowrite,
179 };
180
181 static struct filterops sock_filtops = {
182 .f_isfd = 1,
183 .f_detach = filt_sockdetach,
184 .f_event = filt_sockev,
185 .f_touch = filt_socktouch,
186 };
187
188 SYSCTL_DECL(_kern_ipc);
189
190 #define EVEN_MORE_LOCKING_DEBUG 0
191
192 int socket_debug = 0;
193 SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
194 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
195
196 static int socket_zone = M_SOCKET;
197 so_gen_t so_gencnt; /* generation count for sockets */
198
199 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
200 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
201
202 #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
203 #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
204 #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
205 #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
206 #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
207 #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
208 #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
209 #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
210 #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
211
212 #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
213
214 int somaxconn = SOMAXCONN;
215 SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
216 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
217
218 /* Should we get a maximum also ??? */
219 static int sosendmaxchain = 65536;
220 static int sosendminchain = 16384;
221 static int sorecvmincopy = 16384;
222 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
223 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
224 SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
225 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
226
227 /*
228 * Set to enable jumbo clusters (if available) for large writes when
229 * the socket is marked with SOF_MULTIPAGES; see below.
230 */
231 int sosendjcl = 1;
232 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
233 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
234
235 /*
236 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
237 * writes on the socket for all protocols on any network interfaces,
238 * depending upon sosendjcl above. Be extra careful when setting this
 239  * to 1, because sending packets that cross physical pages down to
240 * broken drivers (those that falsely assume that the physical pages
241 * are contiguous) might lead to system panics or silent data corruption.
242 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
243 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
244 * capable. Set this to 1 only for testing/debugging purposes.
245 */
246 int sosendjcl_ignore_capab = 0;
247 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
248 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
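/*
 * Illustrative usage, for testing/debugging only (assumes the usual
 * kern.ipc sysctl path for the OID declared above):
 *
 *	sysctl -w kern.ipc.sosendjcl_ignore_capab=1
 */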
249
250 /*
251 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
252 * writes on the socket for all protocols on any network interfaces.
253 * Be extra careful when setting this to 1, because sending down packets with
 254  * clusters larger than 2 KB might lead to system panics or data corruption.
 255  * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 256  * on the outgoing interface.
257 * Set this to 1 for testing/debugging purposes only.
258 */
259 int sosendbigcl_ignore_capab = 0;
260 SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
261 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
262
263 int sodefunctlog = 0;
264 SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
265 &sodefunctlog, 0, "");
266
267 int sothrottlelog = 0;
268 SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
269 &sothrottlelog, 0, "");
270
271 int sorestrictrecv = 1;
272 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
273 &sorestrictrecv, 0, "Enable inbound interface restrictions");
274
275 int sorestrictsend = 1;
276 SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
277 &sorestrictsend, 0, "Enable outbound interface restrictions");
278
279 int soreserveheadroom = 1;
280 SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
281 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
282
283 extern struct inpcbinfo tcbinfo;
284
 285 /* TODO: these should be in a header file */
286 extern int get_inpcb_str_size(void);
287 extern int get_tcp_str_size(void);
288
289 static unsigned int sl_zone_size; /* size of sockaddr_list */
290 static struct zone *sl_zone; /* zone for sockaddr_list */
291
292 static unsigned int se_zone_size; /* size of sockaddr_entry */
293 static struct zone *se_zone; /* zone for sockaddr_entry */
294
295 vm_size_t so_cache_zone_element_size;
296
297 static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
298 user_ssize_t *);
299 static void cached_sock_alloc(struct socket **, int);
300 static void cached_sock_free(struct socket *);
301
302 /*
 303  * Maximum number of extended background idle sockets per process.
 304  * Set to zero to disable further setting of the option.
305 */
306
307 #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
308 #define SO_IDLE_BK_IDLE_TIME 600
309 #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
310
311 struct soextbkidlestat soextbkidlestat;
312
313 SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
314 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
315 "Maximum of extended background idle sockets per process");
316
317 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
318 &soextbkidlestat.so_xbkidle_time, 0,
319 "Time in seconds to keep extended background idle sockets");
320
321 SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
322 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
323 "High water mark for extended background idle sockets");
324
325 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
326 &soextbkidlestat, soextbkidlestat, "");
327
328 int so_set_extended_bk_idle(struct socket *, int);
329
330 /*
331 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
332 * setting the DSCP code on the packet based on the service class; see
333 * <rdar://problem/11277343> for details.
334 */
335 __private_extern__ u_int32_t sotcdb = SOTCDB_NO_DSCP;
336 SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
337 &sotcdb, 0, "");
338
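/*
 * socketinit: one-time initialization of the socket layer.  Sets up the
 * cached-socket zone and its mutex, the sockaddr_list/sockaddr_entry
 * zones, the extended background idle defaults, and the PCB, socket
 * filter and traffic class subsystems.
 */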
339 void
340 socketinit(void)
341 {
342 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
343 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
344
345 #ifdef __LP64__
346 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
347 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
348 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
349 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
350 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
351 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
352 #else
353 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
354 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
355 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
356 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
357 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
358 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
359 #endif
360
361 if (socketinit_done) {
362 printf("socketinit: already called...\n");
363 return;
364 }
365 socketinit_done = 1;
366
367 PE_parse_boot_argn("socket_debug", &socket_debug,
368 sizeof (socket_debug));
369
370 /*
371 * allocate lock group attribute and group for socket cache mutex
372 */
373 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
374 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
375 so_cache_mtx_grp_attr);
376
377 /*
378 * allocate the lock attribute for socket cache mutex
379 */
380 so_cache_mtx_attr = lck_attr_alloc_init();
381
382 /* cached sockets mutex */
383 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
384 if (so_cache_mtx == NULL) {
385 panic("%s: unable to allocate so_cache_mtx\n", __func__);
386 /* NOTREACHED */
387 }
388 STAILQ_INIT(&so_cache_head);
389
390 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
391 + get_inpcb_str_size() + 4 + get_tcp_str_size());
392
393 so_cache_zone = zinit(so_cache_zone_element_size,
394 (120000 * so_cache_zone_element_size), 8192, "socache zone");
395 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
396 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
397
398 sl_zone_size = sizeof (struct sockaddr_list);
399 if ((sl_zone = zinit(sl_zone_size, 1024 * sl_zone_size, 1024,
400 "sockaddr_list")) == NULL) {
401 panic("%s: unable to allocate sockaddr_list zone\n", __func__);
402 /* NOTREACHED */
403 }
404 zone_change(sl_zone, Z_CALLERACCT, FALSE);
405 zone_change(sl_zone, Z_EXPAND, TRUE);
406
407 se_zone_size = sizeof (struct sockaddr_entry);
408 if ((se_zone = zinit(se_zone_size, 1024 * se_zone_size, 1024,
409 "sockaddr_entry")) == NULL) {
410 panic("%s: unable to allocate sockaddr_entry zone\n", __func__);
411 /* NOTREACHED */
412 }
413 zone_change(se_zone, Z_CALLERACCT, FALSE);
414 zone_change(se_zone, Z_EXPAND, TRUE);
415
416 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
417 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
418 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
419 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
420
421 in_pcbinit();
422 sflt_init();
423 socket_tclass_init();
424 #if MULTIPATH
425 mp_pcbinit();
426 #endif /* MULTIPATH */
427 }
428
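/*
 * cached_sock_alloc: hand out a socket from the cache of previously
 * freed PF_INET/SOCK_STREAM sockets if one is available; otherwise
 * carve a new socket out of so_cache_zone, reserving trailing space
 * for the saved inpcb and its per-protocol control block.
 */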
429 static void
430 cached_sock_alloc(struct socket **so, int waitok)
431 {
432 caddr_t temp;
433 uintptr_t offset;
434
435 lck_mtx_lock(so_cache_mtx);
436
437 if (!STAILQ_EMPTY(&so_cache_head)) {
438 VERIFY(cached_sock_count > 0);
439
440 *so = STAILQ_FIRST(&so_cache_head);
441 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
442 STAILQ_NEXT((*so), so_cache_ent) = NULL;
443
444 cached_sock_count--;
445 lck_mtx_unlock(so_cache_mtx);
446
447 temp = (*so)->so_saved_pcb;
448 bzero((caddr_t)*so, sizeof (struct socket));
449
450 (*so)->so_saved_pcb = temp;
451 } else {
452
453 lck_mtx_unlock(so_cache_mtx);
454
455 if (waitok)
456 *so = (struct socket *)zalloc(so_cache_zone);
457 else
458 *so = (struct socket *)zalloc_noblock(so_cache_zone);
459
460 if (*so == NULL)
461 return;
462
463 bzero((caddr_t)*so, sizeof (struct socket));
464
465 /*
466 * Define offsets for extra structures into our
467 * single block of memory. Align extra structures
468 * on longword boundaries.
469 */
470
471 offset = (uintptr_t)*so;
472 offset += sizeof (struct socket);
473
474 offset = ALIGN(offset);
475
476 (*so)->so_saved_pcb = (caddr_t)offset;
477 offset += get_inpcb_str_size();
478
479 offset = ALIGN(offset);
480
481 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
482 (caddr_t)offset;
483 }
484
485 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
486 }
487
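/*
 * cached_sock_free: return a socket to the cache for reuse, or free it
 * back to so_cache_zone when the cache already holds
 * max_cached_sock_count entries.
 */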
488 static void
489 cached_sock_free(struct socket *so)
490 {
491
492 lck_mtx_lock(so_cache_mtx);
493
494 so_cache_time = net_uptime();
495 if (++cached_sock_count > max_cached_sock_count) {
496 --cached_sock_count;
497 lck_mtx_unlock(so_cache_mtx);
498 zfree(so_cache_zone, so);
499 } else {
500 if (so_cache_hw < cached_sock_count)
501 so_cache_hw = cached_sock_count;
502
503 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
504
505 so->cache_timestamp = so_cache_time;
506 lck_mtx_unlock(so_cache_mtx);
507 }
508 }
509
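/*
 * so_update_last_owner_locked: record the pid, unique pid, executable
 * UUID and originator UUID of the most recent process to use this
 * socket; skipped for sockets created via sock_socket (last_pid == 0).
 */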
510 void
511 so_update_last_owner_locked(struct socket *so, proc_t self)
512 {
513 if (so->last_pid != 0) {
514 /*
515 * last_pid and last_upid should remain zero for sockets
516 * created using sock_socket. The check above achieves that
517 */
518 if (self == PROC_NULL)
519 self = current_proc();
520
521 if (so->last_upid != proc_uniqueid(self) ||
522 so->last_pid != proc_pid(self)) {
523 so->last_upid = proc_uniqueid(self);
524 so->last_pid = proc_pid(self);
525 proc_getexecutableuuid(self, so->last_uuid,
526 sizeof (so->last_uuid));
527 }
528 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
529 }
530 }
531
532 void
533 so_update_policy(struct socket *so)
534 {
535 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
536 (void) inp_update_policy(sotoinpcb(so));
537 }
538
539 #if NECP
540 static void
541 so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
542 struct sockaddr *override_remote_addr)
543 {
544 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
545 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
546 override_remote_addr, 0);
547 }
548 #endif /* NECP */
549
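/*
 * so_cache_timer: reap cached sockets that have been idle longer than
 * SO_CACHE_TIME_LIMIT, freeing at most SO_CACHE_MAX_FREE_BATCH per run.
 * Returns TRUE when entries remain so the caller can reschedule.
 */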
550 boolean_t
551 so_cache_timer(void)
552 {
553 struct socket *p;
554 int n_freed = 0;
555 boolean_t rc = FALSE;
556
557 lck_mtx_lock(so_cache_mtx);
558 so_cache_timeouts++;
559 so_cache_time = net_uptime();
560
561 while (!STAILQ_EMPTY(&so_cache_head)) {
562 VERIFY(cached_sock_count > 0);
563 p = STAILQ_FIRST(&so_cache_head);
564 if ((so_cache_time - p->cache_timestamp) <
565 SO_CACHE_TIME_LIMIT)
566 break;
567
568 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
569 --cached_sock_count;
570
571 zfree(so_cache_zone, p);
572
573 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
574 so_cache_max_freed++;
575 break;
576 }
577 }
578
579 /* Schedule again if there is more to cleanup */
580 if (!STAILQ_EMPTY(&so_cache_head))
581 rc = TRUE;
582
583 lck_mtx_unlock(so_cache_mtx);
584 return (rc);
585 }
586
587 /*
588 * Get a socket structure from our zone, and initialize it.
589 * We don't implement `waitok' yet (see comments in uipc_domain.c).
590 * Note that it would probably be better to allocate socket
591 * and PCB at the same time, but I'm not convinced that all
592 * the protocols can be easily modified to do this.
593 */
594 struct socket *
595 soalloc(int waitok, int dom, int type)
596 {
597 struct socket *so;
598
599 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
600 cached_sock_alloc(&so, waitok);
601 } else {
602 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
603 M_WAITOK);
604 if (so != NULL)
605 bzero(so, sizeof (*so));
606 }
607 if (so != NULL) {
608 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
609 so->so_zone = socket_zone;
610 #if CONFIG_MACF_SOCKET
611 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
612 if (mac_socket_label_init(so, !waitok) != 0) {
613 sodealloc(so);
614 return (NULL);
615 }
616 #endif /* MAC_SOCKET */
617 }
618
619 return (so);
620 }
621
622 int
623 socreate_internal(int dom, struct socket **aso, int type, int proto,
624 struct proc *p, uint32_t flags, struct proc *ep)
625 {
626 struct protosw *prp;
627 struct socket *so;
628 int error = 0;
629
630 #if TCPDEBUG
631 extern int tcpconsdebug;
632 #endif
633
634 VERIFY(aso != NULL);
635 *aso = NULL;
636
637 if (proto != 0)
638 prp = pffindproto(dom, proto, type);
639 else
640 prp = pffindtype(dom, type);
641
642 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
643 if (pffinddomain(dom) == NULL)
644 return (EAFNOSUPPORT);
645 if (proto != 0) {
646 if (pffindprotonotype(dom, proto) != NULL)
647 return (EPROTOTYPE);
648 }
649 return (EPROTONOSUPPORT);
650 }
651 if (prp->pr_type != type)
652 return (EPROTOTYPE);
653 so = soalloc(1, dom, type);
654 if (so == NULL)
655 return (ENOBUFS);
656
657 if (flags & SOCF_ASYNC)
658 so->so_state |= SS_NBIO;
659 #if MULTIPATH
660 if (flags & SOCF_MP_SUBFLOW) {
661 /*
662 * A multipath subflow socket is used internally in the kernel,
 663 		 * therefore it does not have a file descriptor associated by
664 * default.
665 */
666 so->so_state |= SS_NOFDREF;
667 so->so_flags |= SOF_MP_SUBFLOW;
668 }
669 #endif /* MULTIPATH */
670
671 TAILQ_INIT(&so->so_incomp);
672 TAILQ_INIT(&so->so_comp);
673 so->so_type = type;
674 so->last_upid = proc_uniqueid(p);
675 so->last_pid = proc_pid(p);
676 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
677 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
678
679 if (ep != PROC_NULL && ep != p) {
680 so->e_upid = proc_uniqueid(ep);
681 so->e_pid = proc_pid(ep);
682 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
683 so->so_flags |= SOF_DELEGATED;
684 }
685
686 so->so_cred = kauth_cred_proc_ref(p);
687 if (!suser(kauth_cred_get(), NULL))
688 so->so_state |= SS_PRIV;
689
690 so->so_proto = prp;
691 so->so_rcv.sb_flags |= SB_RECV;
692 so->so_rcv.sb_so = so->so_snd.sb_so = so;
693 so->next_lock_lr = 0;
694 so->next_unlock_lr = 0;
695
696 #if CONFIG_MACF_SOCKET
697 mac_socket_label_associate(kauth_cred_get(), so);
698 #endif /* MAC_SOCKET */
699
700 /*
 701 	 * Attachment will create the per-pcb lock if necessary and
 702 	 * increase the refcount for creation; make sure this is done before
 703 	 * the socket is inserted in any lists.
704 */
705 so->so_usecount++;
706
707 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
708 if (error != 0) {
709 /*
710 * Warning:
711 * If so_pcb is not zero, the socket will be leaked,
 712 		 * so the protocol attachment handler must be coded carefully.
713 */
714 so->so_state |= SS_NOFDREF;
715 so->so_usecount--;
716 sofreelastref(so, 1); /* will deallocate the socket */
717 return (error);
718 }
719
720 atomic_add_32(&prp->pr_domain->dom_refs, 1);
721 TAILQ_INIT(&so->so_evlist);
722
723 /* Attach socket filters for this protocol */
724 sflt_initsock(so);
725 #if TCPDEBUG
726 if (tcpconsdebug == 2)
727 so->so_options |= SO_DEBUG;
728 #endif
729 so_set_default_traffic_class(so);
730
731 /*
732 * If this thread or task is marked to create backgrounded sockets,
733 * mark the socket as background.
734 */
735 if (proc_get_effective_thread_policy(current_thread(),
736 TASK_POLICY_NEW_SOCKETS_BG)) {
737 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
738 so->so_background_thread = current_thread();
739 }
740
741 switch (dom) {
742 /*
743 * Don't mark Unix domain, system or multipath sockets as
744 * eligible for defunct by default.
745 */
746 case PF_LOCAL:
747 case PF_SYSTEM:
748 case PF_MULTIPATH:
749 so->so_flags |= SOF_NODEFUNCT;
750 break;
751 default:
752 break;
753 }
754
755 /*
 756 	 * Entitlements can't be checked at socket creation time except when the
 757 	 * application requested a feature guarded by a privilege (cf. socket
758 * delegation).
759 * The priv(9) and the Sandboxing APIs are designed with the idea that
760 * a privilege check should only be triggered by a userland request.
761 * A privilege check at socket creation time is time consuming and
762 * could trigger many authorisation error messages from the security
763 * APIs.
764 */
765
766 *aso = so;
767
768 return (0);
769 }
770
771 /*
772 * Returns: 0 Success
773 * EAFNOSUPPORT
774 * EPROTOTYPE
775 * EPROTONOSUPPORT
776 * ENOBUFS
777 * <pru_attach>:ENOBUFS[AF_UNIX]
778 * <pru_attach>:ENOBUFS[TCP]
779 * <pru_attach>:ENOMEM[TCP]
780 * <pru_attach>:??? [other protocol families, IPSEC]
781 */
782 int
783 socreate(int dom, struct socket **aso, int type, int proto)
784 {
785 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
786 PROC_NULL));
787 }
788
789 int
790 socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
791 {
792 int error = 0;
793 struct proc *ep = PROC_NULL;
794
795 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
796 error = ESRCH;
797 goto done;
798 }
799
800 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
801
802 /*
803 * It might not be wise to hold the proc reference when calling
804 * socreate_internal since it calls soalloc with M_WAITOK
805 */
806 done:
807 if (ep != PROC_NULL)
808 proc_rele(ep);
809
810 return (error);
811 }
812
813 /*
814 * Returns: 0 Success
815 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
816 * <pru_bind>:EAFNOSUPPORT Address family not supported
817 * <pru_bind>:EADDRNOTAVAIL Address not available.
818 * <pru_bind>:EINVAL Invalid argument
819 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
820 * <pru_bind>:EACCES Permission denied
821 * <pru_bind>:EADDRINUSE Address in use
822 * <pru_bind>:EAGAIN Resource unavailable, try again
823 * <pru_bind>:EPERM Operation not permitted
824 * <pru_bind>:???
825 * <sf_bind>:???
826 *
827 * Notes: It's not possible to fully enumerate the return codes above,
828 * since socket filter authors and protocol family authors may
829 * not choose to limit their error returns to those listed, even
830 * though this may result in some software operating incorrectly.
831 *
832 * The error codes which are enumerated above are those known to
833 * be returned by the tcp_usr_bind function supplied.
834 */
835 int
836 sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
837 {
838 struct proc *p = current_proc();
839 int error = 0;
840
841 if (dolock)
842 socket_lock(so, 1);
843 VERIFY(so->so_usecount > 1);
844
845 so_update_last_owner_locked(so, p);
846 so_update_policy(so);
847
848 #if NECP
849 so_update_necp_policy(so, nam, NULL);
850 #endif /* NECP */
851
852 /*
853 * If this is a bind request on a socket that has been marked
854 * as inactive, reject it now before we go any further.
855 */
856 if (so->so_flags & SOF_DEFUNCT) {
857 error = EINVAL;
858 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
859 __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
860 SOCK_DOM(so), SOCK_TYPE(so), error));
861 goto out;
862 }
863
864 /* Socket filter */
865 error = sflt_bind(so, nam);
866
867 if (error == 0)
868 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
869 out:
870 if (dolock)
871 socket_unlock(so, 1);
872
873 if (error == EJUSTRETURN)
874 error = 0;
875
876 return (error);
877 }
878
879 void
880 sodealloc(struct socket *so)
881 {
882 kauth_cred_unref(&so->so_cred);
883
884 /* Remove any filters */
885 sflt_termsock(so);
886
887 #if CONTENT_FILTER
888 cfil_sock_detach(so);
889 #endif /* CONTENT_FILTER */
890
891 /* Delete the state allocated for msg queues on a socket */
892 if (so->so_flags & SOF_ENABLE_MSGS) {
893 FREE(so->so_msg_state, M_TEMP);
894 so->so_msg_state = NULL;
895 }
896 VERIFY(so->so_msg_state == NULL);
897
898 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
899
900 #if CONFIG_MACF_SOCKET
901 mac_socket_label_destroy(so);
902 #endif /* MAC_SOCKET */
903
904 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
905 cached_sock_free(so);
906 } else {
907 FREE_ZONE(so, sizeof (*so), so->so_zone);
908 }
909 }
910
911 /*
912 * Returns: 0 Success
913 * EINVAL
914 * EOPNOTSUPP
915 * <pru_listen>:EINVAL[AF_UNIX]
916 * <pru_listen>:EINVAL[TCP]
917 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
918 * <pru_listen>:EINVAL[TCP] Invalid argument
919 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
920 * <pru_listen>:EACCES[TCP] Permission denied
921 * <pru_listen>:EADDRINUSE[TCP] Address in use
922 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
923 * <pru_listen>:EPERM[TCP] Operation not permitted
924 * <sf_listen>:???
925 *
926 * Notes: Other <pru_listen> returns depend on the protocol family; all
927 * <sf_listen> returns depend on what the filter author causes
928 * their filter to return.
929 */
930 int
931 solisten(struct socket *so, int backlog)
932 {
933 struct proc *p = current_proc();
934 int error = 0;
935
936 socket_lock(so, 1);
937
938 so_update_last_owner_locked(so, p);
939 so_update_policy(so);
940
941 #if NECP
942 so_update_necp_policy(so, NULL, NULL);
943 #endif /* NECP */
944
945 if (so->so_proto == NULL) {
946 error = EINVAL;
947 goto out;
948 }
949 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
950 error = EOPNOTSUPP;
951 goto out;
952 }
953
954 /*
955 * If the listen request is made on a socket that is not fully
956 * disconnected, or on a socket that has been marked as inactive,
957 * reject the request now.
958 */
959 if ((so->so_state &
960 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
961 (so->so_flags & SOF_DEFUNCT)) {
962 error = EINVAL;
963 if (so->so_flags & SOF_DEFUNCT) {
964 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
965 "(%d)\n", __func__, proc_pid(p),
966 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
967 SOCK_DOM(so), SOCK_TYPE(so), error));
968 }
969 goto out;
970 }
971
972 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
973 error = EPERM;
974 goto out;
975 }
976
977 error = sflt_listen(so);
978 if (error == 0)
979 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
980
981 if (error) {
982 if (error == EJUSTRETURN)
983 error = 0;
984 goto out;
985 }
986
987 if (TAILQ_EMPTY(&so->so_comp))
988 so->so_options |= SO_ACCEPTCONN;
989 /*
990 * POSIX: The implementation may have an upper limit on the length of
 991 	 * the listen queue, either global or per accepting socket. If backlog
992 * exceeds this limit, the length of the listen queue is set to the
993 * limit.
994 *
995 * If listen() is called with a backlog argument value that is less
996 * than 0, the function behaves as if it had been called with a backlog
997 * argument value of 0.
998 *
999 * A backlog argument of 0 may allow the socket to accept connections,
1000 * in which case the length of the listen queue may be set to an
1001 * implementation-defined minimum value.
1002 */
1003 if (backlog <= 0 || backlog > somaxconn)
1004 backlog = somaxconn;
1005
1006 so->so_qlimit = backlog;
1007 out:
1008 socket_unlock(so, 1);
1009 return (error);
1010 }
1011
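/*
 * sofreelastref: tear down a socket whose last reference is being
 * dropped.  If the socket is not yet ready to be freed, only its
 * select/upcall state is cleared; otherwise it is removed from its
 * listening head's queues, its buffers are flushed, and the memory is
 * released when `dealloc' is set.
 */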
1012 void
1013 sofreelastref(struct socket *so, int dealloc)
1014 {
1015 struct socket *head = so->so_head;
1016
1017 /* Assume socket is locked */
1018
1019 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1020 selthreadclear(&so->so_snd.sb_sel);
1021 selthreadclear(&so->so_rcv.sb_sel);
1022 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1023 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1024 so->so_event = sonullevent;
1025 return;
1026 }
1027 if (head != NULL) {
1028 socket_lock(head, 1);
1029 if (so->so_state & SS_INCOMP) {
1030 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1031 head->so_incqlen--;
1032 } else if (so->so_state & SS_COMP) {
1033 /*
1034 * We must not decommission a socket that's
1035 * on the accept(2) queue. If we do, then
1036 * accept(2) may hang after select(2) indicated
1037 * that the listening socket was ready.
1038 */
1039 selthreadclear(&so->so_snd.sb_sel);
1040 selthreadclear(&so->so_rcv.sb_sel);
1041 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1042 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1043 so->so_event = sonullevent;
1044 socket_unlock(head, 1);
1045 return;
1046 } else {
1047 panic("sofree: not queued");
1048 }
1049 head->so_qlen--;
1050 so->so_state &= ~SS_INCOMP;
1051 so->so_head = NULL;
1052 socket_unlock(head, 1);
1053 }
1054 sowflush(so);
1055 sorflush(so);
1056
1057 #if FLOW_DIVERT
1058 if (so->so_flags & SOF_FLOW_DIVERT) {
1059 flow_divert_detach(so);
1060 }
1061 #endif /* FLOW_DIVERT */
1062
1063 /* 3932268: disable upcall */
1064 so->so_rcv.sb_flags &= ~SB_UPCALL;
1065 so->so_snd.sb_flags &= ~SB_UPCALL;
1066 so->so_event = sonullevent;
1067
1068 if (dealloc)
1069 sodealloc(so);
1070 }
1071
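/*
 * soclose_wait_locked: when SOF_UPCALLCLOSEWAIT is set, block the
 * closing thread until all outstanding socket upcalls have drained.
 */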
1072 void
1073 soclose_wait_locked(struct socket *so)
1074 {
1075 lck_mtx_t *mutex_held;
1076
1077 if (so->so_proto->pr_getlock != NULL)
1078 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1079 else
1080 mutex_held = so->so_proto->pr_domain->dom_mtx;
1081 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1082
1083 /*
1084 * Double check here and return if there's no outstanding upcall;
1085 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1086 */
1087 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1088 return;
1089 so->so_rcv.sb_flags &= ~SB_UPCALL;
1090 so->so_snd.sb_flags &= ~SB_UPCALL;
1091 so->so_flags |= SOF_CLOSEWAIT;
1092 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1093 "soclose_wait_locked", NULL);
1094 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1095 so->so_flags &= ~SOF_CLOSEWAIT;
1096 }
1097
1098 /*
1099 * Close a socket on last file table reference removal.
1100 * Initiate disconnect if connected.
1101 * Free socket when disconnect complete.
1102 */
1103 int
1104 soclose_locked(struct socket *so)
1105 {
1106 int error = 0;
1107 lck_mtx_t *mutex_held;
1108 struct timespec ts;
1109
1110 if (so->so_usecount == 0) {
1111 panic("soclose: so=%p refcount=0\n", so);
1112 /* NOTREACHED */
1113 }
1114
1115 sflt_notify(so, sock_evt_closing, NULL);
1116
1117 if (so->so_upcallusecount)
1118 soclose_wait_locked(so);
1119
1120 #if CONTENT_FILTER
1121 /*
1122 * We have to wait until the content filters are done
1123 */
1124 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1125 cfil_sock_close_wait(so);
1126 cfil_sock_is_closed(so);
1127 cfil_sock_detach(so);
1128 }
1129 #endif /* CONTENT_FILTER */
1130
1131 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1132 soresume(current_proc(), so, 1);
1133 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1134 }
1135
1136 if ((so->so_options & SO_ACCEPTCONN)) {
1137 struct socket *sp, *sonext;
1138 int socklock = 0;
1139
1140 /*
 1141 		 * We do not want new connections to be added
 1142 		 * to the connection queues.
1143 */
1144 so->so_options &= ~SO_ACCEPTCONN;
1145
1146 for (sp = TAILQ_FIRST(&so->so_incomp);
1147 sp != NULL; sp = sonext) {
1148 sonext = TAILQ_NEXT(sp, so_list);
1149
1150 /*
1151 * Radar 5350314
 1152 			 * Skip sockets thrown away by tcpdropdropblreq;
 1153 			 * they will get cleaned up by the garbage collection.
 1154 			 * Otherwise, remove the incomp socket from the queue
 1155 			 * and let soabort trigger the appropriate cleanup.
1156 */
1157 if (sp->so_flags & SOF_OVERFLOW)
1158 continue;
1159
1160 if (so->so_proto->pr_getlock != NULL) {
1161 /*
1162 * Lock ordering for consistency with the
1163 * rest of the stack, we lock the socket
1164 * first and then grabb the head.
1165 */
1166 socket_unlock(so, 0);
1167 socket_lock(sp, 1);
1168 socket_lock(so, 0);
1169 socklock = 1;
1170 }
1171
1172 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1173 so->so_incqlen--;
1174
1175 if (sp->so_state & SS_INCOMP) {
1176 sp->so_state &= ~SS_INCOMP;
1177 sp->so_head = NULL;
1178
1179 (void) soabort(sp);
1180 }
1181
1182 if (socklock)
1183 socket_unlock(sp, 1);
1184 }
1185
1186 while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
1187 /* Dequeue from so_comp since sofree() won't do it */
1188 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1189 so->so_qlen--;
1190
1191 if (so->so_proto->pr_getlock != NULL) {
1192 socket_unlock(so, 0);
1193 socket_lock(sp, 1);
1194 }
1195
1196 if (sp->so_state & SS_COMP) {
1197 sp->so_state &= ~SS_COMP;
1198 sp->so_head = NULL;
1199
1200 (void) soabort(sp);
1201 }
1202
1203 if (so->so_proto->pr_getlock != NULL) {
1204 socket_unlock(sp, 1);
1205 socket_lock(so, 0);
1206 }
1207 }
1208 }
1209 if (so->so_pcb == NULL) {
1210 /* 3915887: mark the socket as ready for dealloc */
1211 so->so_flags |= SOF_PCBCLEARING;
1212 goto discard;
1213 }
1214 if (so->so_state & SS_ISCONNECTED) {
1215 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1216 error = sodisconnectlocked(so);
1217 if (error)
1218 goto drop;
1219 }
1220 if (so->so_options & SO_LINGER) {
1221 if ((so->so_state & SS_ISDISCONNECTING) &&
1222 (so->so_state & SS_NBIO))
1223 goto drop;
1224 if (so->so_proto->pr_getlock != NULL)
1225 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1226 else
1227 mutex_held = so->so_proto->pr_domain->dom_mtx;
1228 while (so->so_state & SS_ISCONNECTED) {
1229 ts.tv_sec = (so->so_linger/100);
1230 ts.tv_nsec = (so->so_linger % 100) *
1231 NSEC_PER_USEC * 1000 * 10;
1232 error = msleep((caddr_t)&so->so_timeo,
1233 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1234 if (error) {
1235 /*
 1236 					 * It's OK when the timer fires;
 1237 					 * don't report an error.
1238 */
1239 if (error == EWOULDBLOCK)
1240 error = 0;
1241 break;
1242 }
1243 }
1244 }
1245 }
1246 drop:
1247 if (so->so_usecount == 0) {
1248 panic("soclose: usecount is zero so=%p\n", so);
1249 /* NOTREACHED */
1250 }
1251 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1252 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1253 if (error == 0)
1254 error = error2;
1255 }
1256 if (so->so_usecount <= 0) {
1257 panic("soclose: usecount is zero so=%p\n", so);
1258 /* NOTREACHED */
1259 }
1260 discard:
1261 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1262 (so->so_state & SS_NOFDREF)) {
1263 panic("soclose: NOFDREF");
1264 /* NOTREACHED */
1265 }
1266 so->so_state |= SS_NOFDREF;
1267
1268 if (so->so_flags & SOF_MP_SUBFLOW)
1269 so->so_flags &= ~SOF_MP_SUBFLOW;
1270
1271 if ((so->so_flags & SOF_KNOTE) != 0)
1272 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1273
1274 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1275 evsofree(so);
1276
1277 so->so_usecount--;
1278 sofree(so);
1279 return (error);
1280 }
1281
1282 int
1283 soclose(struct socket *so)
1284 {
1285 int error = 0;
1286 socket_lock(so, 1);
1287
1288 if (so->so_retaincnt == 0) {
1289 error = soclose_locked(so);
1290 } else {
1291 /*
 1292 		 * If the FD is going away but the socket is
 1293 		 * retained in the kernel, remove its reference.
1294 */
1295 so->so_usecount--;
1296 if (so->so_usecount < 2)
1297 panic("soclose: retaincnt non null and so=%p "
1298 "usecount=%d\n", so, so->so_usecount);
1299 }
1300 socket_unlock(so, 1);
1301 return (error);
1302 }
1303
1304 /*
1305 * Must be called at splnet...
1306 */
1307 /* Should already be locked */
1308 int
1309 soabort(struct socket *so)
1310 {
1311 int error;
1312
1313 #ifdef MORE_LOCKING_DEBUG
1314 lck_mtx_t *mutex_held;
1315
1316 if (so->so_proto->pr_getlock != NULL)
1317 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1318 else
1319 mutex_held = so->so_proto->pr_domain->dom_mtx;
1320 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
1321 #endif
1322
1323 if ((so->so_flags & SOF_ABORTED) == 0) {
1324 so->so_flags |= SOF_ABORTED;
1325 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1326 if (error) {
1327 sofree(so);
1328 return (error);
1329 }
1330 }
1331 return (0);
1332 }
1333
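/*
 * soacceptlock: complete acceptance of a connection that has already
 * been dequeued from the listening socket; clears SS_NOFDREF and asks
 * the protocol for the peer's address.
 */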
1334 int
1335 soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1336 {
1337 int error;
1338
1339 if (dolock)
1340 socket_lock(so, 1);
1341
1342 so_update_last_owner_locked(so, PROC_NULL);
1343 so_update_policy(so);
1344 #if NECP
1345 so_update_necp_policy(so, NULL, NULL);
1346 #endif /* NECP */
1347
1348 if ((so->so_state & SS_NOFDREF) == 0)
1349 panic("soaccept: !NOFDREF");
1350 so->so_state &= ~SS_NOFDREF;
1351 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1352
1353 if (dolock)
1354 socket_unlock(so, 1);
1355 return (error);
1356 }
1357
1358 int
1359 soaccept(struct socket *so, struct sockaddr **nam)
1360 {
1361 return (soacceptlock(so, nam, 1));
1362 }
1363
1364 int
1365 soacceptfilter(struct socket *so)
1366 {
1367 struct sockaddr *local = NULL, *remote = NULL;
1368 int error = 0;
1369 struct socket *head = so->so_head;
1370
1371 /*
1372 * Hold the lock even if this socket has not been made visible
1373 * to the filter(s). For sockets with global locks, this protects
1374 * against the head or peer going away
1375 */
1376 socket_lock(so, 1);
1377 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1378 sogetaddr_locked(so, &local, 0) != 0) {
1379 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1380 so->so_head = NULL;
1381 socket_unlock(so, 1);
1382 soclose(so);
1383 /* Out of resources; try it again next time */
1384 error = ECONNABORTED;
1385 goto done;
1386 }
1387
1388 error = sflt_accept(head, so, local, remote);
1389
1390 /*
1391 * If we get EJUSTRETURN from one of the filters, mark this socket
1392 * as inactive and return it anyway. This newly accepted socket
1393 * will be disconnected later before we hand it off to the caller.
1394 */
1395 if (error == EJUSTRETURN) {
1396 error = 0;
1397 (void) sosetdefunct(current_proc(), so,
1398 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1399 }
1400
1401 if (error != 0) {
1402 /*
 1403 		 * This may seem like a duplication of the above error
1404 * handling part when we return ECONNABORTED, except
1405 * the following is done while holding the lock since
1406 * the socket has been exposed to the filter(s) earlier.
1407 */
1408 so->so_state &= ~(SS_NOFDREF | SS_COMP);
1409 so->so_head = NULL;
1410 socket_unlock(so, 1);
1411 soclose(so);
1412 /* Propagate socket filter's error code to the caller */
1413 } else {
1414 socket_unlock(so, 1);
1415 }
1416 done:
1417 /* Callee checks for NULL pointer */
1418 sock_freeaddr(remote);
1419 sock_freeaddr(local);
1420 return (error);
1421 }
1422
1423 /*
1424 * Returns: 0 Success
1425 * EOPNOTSUPP Operation not supported on socket
1426 * EISCONN Socket is connected
1427 * <pru_connect>:EADDRNOTAVAIL Address not available.
1428 * <pru_connect>:EINVAL Invalid argument
1429 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1430 * <pru_connect>:EACCES Permission denied
1431 * <pru_connect>:EADDRINUSE Address in use
1432 * <pru_connect>:EAGAIN Resource unavailable, try again
1433 * <pru_connect>:EPERM Operation not permitted
1434 * <sf_connect_out>:??? [anything a filter writer might set]
1435 */
1436 int
1437 soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1438 {
1439 int error;
1440 struct proc *p = current_proc();
1441
1442 if (dolock)
1443 socket_lock(so, 1);
1444
1445 so_update_last_owner_locked(so, p);
1446 so_update_policy(so);
1447
1448 #if NECP
1449 so_update_necp_policy(so, NULL, nam);
1450 #endif /* NECP */
1451
1452 /*
1453 * If this is a listening socket or if this is a previously-accepted
1454 * socket that has been marked as inactive, reject the connect request.
1455 */
1456 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1457 error = EOPNOTSUPP;
1458 if (so->so_flags & SOF_DEFUNCT) {
1459 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1460 "(%d)\n", __func__, proc_pid(p),
1461 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1462 SOCK_DOM(so), SOCK_TYPE(so), error));
1463 }
1464 if (dolock)
1465 socket_unlock(so, 1);
1466 return (error);
1467 }
1468
1469 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1470 if (dolock)
1471 socket_unlock(so, 1);
1472 return (EPERM);
1473 }
1474
1475 /*
1476 * If protocol is connection-based, can only connect once.
1477 * Otherwise, if connected, try to disconnect first.
1478 * This allows user to disconnect by connecting to, e.g.,
1479 * a null address.
1480 */
1481 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1482 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1483 (error = sodisconnectlocked(so)))) {
1484 error = EISCONN;
1485 } else {
1486 /*
1487 * Run connect filter before calling protocol:
1488 * - non-blocking connect returns before completion;
1489 */
1490 error = sflt_connectout(so, nam);
1491 if (error != 0) {
1492 if (error == EJUSTRETURN)
1493 error = 0;
1494 } else {
1495 error = (*so->so_proto->pr_usrreqs->pru_connect)
1496 (so, nam, p);
1497 }
1498 }
1499 if (dolock)
1500 socket_unlock(so, 1);
1501 return (error);
1502 }
1503
1504 int
1505 soconnect(struct socket *so, struct sockaddr *nam)
1506 {
1507 return (soconnectlock(so, nam, 1));
1508 }
1509
1510 /*
1511 * Returns: 0 Success
1512 * <pru_connect2>:EINVAL[AF_UNIX]
1513 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1514 * <pru_connect2>:??? [other protocol families]
1515 *
1516 * Notes: <pru_connect2> is not supported by [TCP].
1517 */
1518 int
1519 soconnect2(struct socket *so1, struct socket *so2)
1520 {
1521 int error;
1522
1523 socket_lock(so1, 1);
1524 if (so2->so_proto->pr_lock)
1525 socket_lock(so2, 1);
1526
1527 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1528
1529 socket_unlock(so1, 1);
1530 if (so2->so_proto->pr_lock)
1531 socket_unlock(so2, 1);
1532 return (error);
1533 }
1534
1535 int
1536 soconnectxlocked(struct socket *so, struct sockaddr_list **src_sl,
1537 struct sockaddr_list **dst_sl, struct proc *p, uint32_t ifscope,
1538 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1539 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1540 {
1541 int error;
1542
1543 so_update_last_owner_locked(so, p);
1544 so_update_policy(so);
1545
1546 /*
1547 * If this is a listening socket or if this is a previously-accepted
1548 * socket that has been marked as inactive, reject the connect request.
1549 */
1550 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1551 error = EOPNOTSUPP;
1552 if (so->so_flags & SOF_DEFUNCT) {
1553 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] "
1554 "(%d)\n", __func__, proc_pid(p),
1555 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1556 SOCK_DOM(so), SOCK_TYPE(so), error));
1557 }
1558 return (error);
1559 }
1560
1561 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1562 return (EPERM);
1563
1564 /*
1565 * If protocol is connection-based, can only connect once
1566 * unless PR_MULTICONN is set. Otherwise, if connected,
1567 * try to disconnect first. This allows user to disconnect
1568 * by connecting to, e.g., a null address.
1569 */
1570 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1571 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1572 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1573 (error = sodisconnectlocked(so)) != 0)) {
1574 error = EISCONN;
1575 } else {
1576 /*
1577 * Run connect filter before calling protocol:
1578 * - non-blocking connect returns before completion;
1579 */
1580 error = sflt_connectxout(so, dst_sl);
1581 if (error != 0) {
1582 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1583 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1584 if (error == EJUSTRETURN)
1585 error = 0;
1586 } else {
1587 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1588 (so, src_sl, dst_sl, p, ifscope, aid, pcid,
1589 flags, arg, arglen, auio, bytes_written);
1590 }
1591 }
1592
1593 return (error);
1594 }
1595
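/*
 * sodisconnectlocked: initiate a disconnect on a connected socket;
 * the caller must already hold the socket lock.
 */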
1596 int
1597 sodisconnectlocked(struct socket *so)
1598 {
1599 int error;
1600
1601 if ((so->so_state & SS_ISCONNECTED) == 0) {
1602 error = ENOTCONN;
1603 goto bad;
1604 }
1605 if (so->so_state & SS_ISDISCONNECTING) {
1606 error = EALREADY;
1607 goto bad;
1608 }
1609
1610 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1611 if (error == 0)
1612 sflt_notify(so, sock_evt_disconnected, NULL);
1613
1614 bad:
1615 return (error);
1616 }
1617
1618 /* Locking version */
1619 int
1620 sodisconnect(struct socket *so)
1621 {
1622 int error;
1623
1624 socket_lock(so, 1);
1625 error = sodisconnectlocked(so);
1626 socket_unlock(so, 1);
1627 return (error);
1628 }
1629
1630 int
1631 sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1632 {
1633 int error;
1634
1635 /*
1636 * Call the protocol disconnectx handler; let it handle all
1637 * matters related to the connection state of this session.
1638 */
1639 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1640 if (error == 0) {
1641 /*
1642 * The event applies only for the session, not for
1643 * the disconnection of individual subflows.
1644 */
1645 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1646 sflt_notify(so, sock_evt_disconnected, NULL);
1647 }
1648 return (error);
1649 }
1650
1651 int
1652 sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1653 {
1654 int error;
1655
1656 socket_lock(so, 1);
1657 error = sodisconnectxlocked(so, aid, cid);
1658 socket_unlock(so, 1);
1659 return (error);
1660 }
1661
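/*
 * sopeelofflocked: hand the peel-off request to the protocol, which
 * detaches the association identified by `aid' into its own socket
 * (used by the multipath stack).
 */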
1662 int
1663 sopeelofflocked(struct socket *so, sae_associd_t aid, struct socket **psop)
1664 {
1665 return ((*so->so_proto->pr_usrreqs->pru_peeloff)(so, aid, psop));
1666 }
1667
1668 #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1669
1670 /*
1671 * sosendcheck will lock the socket buffer if it isn't locked and
1672 * verify that there is space for the data being inserted.
1673 *
1674 * Returns: 0 Success
1675 * EPIPE
1676 * sblock:EWOULDBLOCK
1677 * sblock:EINTR
1678 * sbwait:EBADF
1679 * sbwait:EINTR
1680 * [so_error]:???
1681 */
1682 int
1683 sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1684 int32_t clen, int32_t atomic, int flags, int *sblocked,
1685 struct mbuf *control)
1686 {
1687 int error = 0;
1688 int32_t space;
1689 int assumelock = 0;
1690
1691 restart:
1692 if (*sblocked == 0) {
1693 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1694 so->so_send_filt_thread != 0 &&
1695 so->so_send_filt_thread == current_thread()) {
1696 /*
1697 * We're being called recursively from a filter,
1698 * allow this to continue. Radar 4150520.
1699 * Don't set sblocked because we don't want
1700 * to perform an unlock later.
1701 */
1702 assumelock = 1;
1703 } else {
1704 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1705 if (error) {
1706 if (so->so_flags & SOF_DEFUNCT)
1707 goto defunct;
1708 return (error);
1709 }
1710 *sblocked = 1;
1711 }
1712 }
1713
1714 /*
1715 * If a send attempt is made on a socket that has been marked
1716 * as inactive (disconnected), reject the request.
1717 */
1718 if (so->so_flags & SOF_DEFUNCT) {
1719 defunct:
1720 error = EPIPE;
1721 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
1722 __func__, proc_selfpid(),
1723 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1724 SOCK_DOM(so), SOCK_TYPE(so), error));
1725 return (error);
1726 }
1727
1728 if (so->so_state & SS_CANTSENDMORE) {
1729 #if CONTENT_FILTER
1730 /*
 1731 		 * Can re-inject data of half-closed connections
1732 */
1733 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1734 so->so_snd.sb_cfil_thread == current_thread() &&
1735 cfil_sock_data_pending(&so->so_snd) != 0)
1736 CFIL_LOG(LOG_INFO,
1737 "so %llx ignore SS_CANTSENDMORE",
1738 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1739 else
1740 #endif /* CONTENT_FILTER */
1741 return (EPIPE);
1742 }
1743 if (so->so_error) {
1744 error = so->so_error;
1745 so->so_error = 0;
1746 return (error);
1747 }
1748
1749 if ((so->so_state & SS_ISCONNECTED) == 0) {
1750 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1751 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1752 (resid != 0 || clen == 0) &&
1753 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1754 #if MPTCP
1755 /*
1756 * MPTCP Fast Join sends data before the
1757 * socket is truly connected.
1758 */
1759 if ((so->so_flags & (SOF_MP_SUBFLOW |
1760 SOF_MPTCP_FASTJOIN)) !=
1761 (SOF_MP_SUBFLOW | SOF_MPTCP_FASTJOIN))
1762 #endif /* MPTCP */
1763 return (ENOTCONN);
1764 }
1765 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1766 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1767 ENOTCONN : EDESTADDRREQ);
1768 }
1769 }
1770
1771 if (so->so_flags & SOF_ENABLE_MSGS)
1772 space = msgq_sbspace(so, control);
1773 else
1774 space = sbspace(&so->so_snd);
1775
1776 if (flags & MSG_OOB)
1777 space += 1024;
1778 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1779 clen > so->so_snd.sb_hiwat)
1780 return (EMSGSIZE);
1781
1782 if ((space < resid + clen &&
1783 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1784 space < clen)) ||
1785 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1786 /*
1787 * don't block the connectx call when there's more data
1788 * than can be copied.
1789 */
1790 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1791 if (space == 0) {
1792 return (EWOULDBLOCK);
1793 }
1794 if (space < (int32_t)so->so_snd.sb_lowat) {
1795 return (0);
1796 }
1797 }
1798 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1799 assumelock) {
1800 return (EWOULDBLOCK);
1801 }
1802 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1803 *sblocked = 0;
1804 error = sbwait(&so->so_snd);
1805 if (error) {
1806 if (so->so_flags & SOF_DEFUNCT)
1807 goto defunct;
1808 return (error);
1809 }
1810 goto restart;
1811 }
1812 return (0);
1813 }
1814
1815 /*
1816 * Send on a socket.
1817 * If send must go all at once and message is larger than
1818 * send buffering, then hard error.
1819 * Lock against other senders.
1820 * If must go all at once and not enough room now, then
1821 * inform user that this would block and do nothing.
1822 * Otherwise, if nonblocking, send as much as possible.
1823 * The data to be sent is described by "uio" if nonzero,
1824 * otherwise by the mbuf chain "top" (which must be null
1825 * if uio is not). Data provided in mbuf chain must be small
1826 * enough to send all at once.
1827 *
1828 * Returns nonzero on error, timeout or signal; callers
1829 * must check for short counts if EINTR/ERESTART are returned.
1830 * Data and control buffers are freed on return.
1831 * Experiment:
1832 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1833 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1834 * point at the mbuf chain being constructed and go from there.
1835 *
1836 * Returns: 0 Success
1837 * EOPNOTSUPP
1838 * EINVAL
1839 * ENOBUFS
1840 * uiomove:EFAULT
1841 * sosendcheck:EPIPE
1842 * sosendcheck:EWOULDBLOCK
1843 * sosendcheck:EINTR
1844 * sosendcheck:EBADF
1845 * sosendcheck:EINTR
1846 * sosendcheck:??? [value from so_error]
1847 * <pru_send>:ECONNRESET[TCP]
1848 * <pru_send>:EINVAL[TCP]
1849 * <pru_send>:ENOBUFS[TCP]
1850 * <pru_send>:EADDRINUSE[TCP]
1851 * <pru_send>:EADDRNOTAVAIL[TCP]
1852 * <pru_send>:EAFNOSUPPORT[TCP]
1853 * <pru_send>:EACCES[TCP]
1854 * <pru_send>:EAGAIN[TCP]
1855 * <pru_send>:EPERM[TCP]
1856 * <pru_send>:EMSGSIZE[TCP]
1857 * <pru_send>:EHOSTUNREACH[TCP]
1858 * <pru_send>:ENETUNREACH[TCP]
1859 * <pru_send>:ENETDOWN[TCP]
1860 * <pru_send>:ENOMEM[TCP]
1861 * <pru_send>:ENOBUFS[TCP]
1862 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
1863 * <pru_send>:EINVAL[AF_UNIX]
1864 * <pru_send>:EOPNOTSUPP[AF_UNIX]
1865 * <pru_send>:EPIPE[AF_UNIX]
1866 * <pru_send>:ENOTCONN[AF_UNIX]
1867 * <pru_send>:EISCONN[AF_UNIX]
1868 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
1869 * <sf_data_out>:??? [whatever a filter author chooses]
1870 *
1871 * Notes: Other <pru_send> returns depend on the protocol family; all
1872 * <sf_data_out> returns depend on what the filter author causes
1873 * their filter to return.
1874 */
1875 int
1876 sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1877 struct mbuf *top, struct mbuf *control, int flags)
1878 {
1879 struct mbuf **mp;
1880 struct mbuf *m, *freelist = NULL;
1881 user_ssize_t space, len, resid, orig_resid;
1882 int clen = 0, error, dontroute, mlen, sendflags;
1883 int atomic = sosendallatonce(so) || top;
1884 int sblocked = 0;
1885 struct proc *p = current_proc();
1886 struct mbuf *control_copy = NULL;
1887 uint16_t headroom = 0;
1888 boolean_t en_tracing = FALSE;
1889
1890 if (uio != NULL)
1891 resid = uio_resid(uio);
1892 else
1893 resid = top->m_pkthdr.len;
1894
1895 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
1896 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
1897
1898 socket_lock(so, 1);
1899
1900 /*
1901 * Trace if tracing is enabled, and only for network (vs. unix)
1902 * sockets on non-loopback interfaces.
1903 */
1904 if (ENTR_SHOULDTRACE &&
1905 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1906 struct inpcb *inp = sotoinpcb(so);
1907 if (inp->inp_last_outifp != NULL &&
1908 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1909 en_tracing = TRUE;
1910 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1911 VM_KERNEL_ADDRPERM(so),
1912 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1913 (int64_t)resid);
1914 orig_resid = resid;
1915 }
1916 }
1917
1918 /*
1919 * Re-injection should not affect process accounting
1920 */
1921 if ((flags & MSG_SKIPCFIL) == 0) {
1922 so_update_last_owner_locked(so, p);
1923 so_update_policy(so);
1924
1925 #if NECP
1926 so_update_necp_policy(so, NULL, addr);
1927 #endif /* NECP */
1928 }
1929
1930 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
1931 error = EOPNOTSUPP;
1932 socket_unlock(so, 1);
1933 goto out;
1934 }
1935
1936 /*
1937 * In theory resid should be unsigned.
1938 * However, space must be signed, as it might be less than 0
1939 * if we over-committed, and we must use a signed comparison
1940 * of space and resid. On the other hand, a negative resid
1941 * causes us to loop sending 0-length segments to the protocol.
1942 *
1943 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
1944 * But it will be used by sockets doing message delivery.
1945 *
1946 * Note: We limit resid to be a positive int value as we use
1947 * imin() to set bytes_to_copy -- radr://14558484
1948 */
1949 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
1950 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
1951 error = EINVAL;
1952 socket_unlock(so, 1);
1953 goto out;
1954 }
1955
1956 dontroute = (flags & MSG_DONTROUTE) &&
1957 (so->so_options & SO_DONTROUTE) == 0 &&
1958 (so->so_proto->pr_flags & PR_ATOMIC);
1959 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1960
1961 if (control != NULL)
1962 clen = control->m_len;
1963
1964 if (soreserveheadroom != 0)
1965 headroom = so->so_pktheadroom;
1966
1967 do {
1968 error = sosendcheck(so, addr, resid, clen, atomic, flags,
1969 &sblocked, control);
1970 if (error)
1971 goto release;
1972
1973 mp = &top;
1974 if (so->so_flags & SOF_ENABLE_MSGS)
1975 space = msgq_sbspace(so, control);
1976 else
1977 space = sbspace(&so->so_snd) - clen;
1978 space += ((flags & MSG_OOB) ? 1024 : 0);
1979
1980 do {
1981 if (uio == NULL) {
1982 /*
1983 * Data is prepackaged in "top".
1984 */
1985 resid = 0;
1986 if (flags & MSG_EOR)
1987 top->m_flags |= M_EOR;
1988 } else {
1989 int chainlength;
1990 int bytes_to_copy;
1991 boolean_t jumbocl;
1992 boolean_t bigcl;
1993 int bytes_to_alloc;
1994
1995 bytes_to_copy = imin(resid, space);
1996
1997 bytes_to_alloc = bytes_to_copy;
1998 if (top == NULL)
1999 bytes_to_alloc += headroom;
2000
2001 if (sosendminchain > 0)
2002 chainlength = 0;
2003 else
2004 chainlength = sosendmaxchain;
2005
2006 /*
2007 * Use big 4 KB clusters when the outgoing interface
2008 * does not prefer 2 KB clusters.
2009 */
2010 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2011 sosendbigcl_ignore_capab;
2012
2013 /*
2014 * Attempt to use larger than system page-size
2015 * clusters for large writes only if there is
2016 * a jumbo cluster pool and if the socket is
2017 * marked accordingly.
2018 */
2019 jumbocl = sosendjcl && njcl > 0 &&
2020 ((so->so_flags & SOF_MULTIPAGES) ||
2021 sosendjcl_ignore_capab) &&
2022 bigcl;
2023
2024 socket_unlock(so, 0);
2025
2026 do {
2027 int num_needed;
2028 int hdrs_needed = (top == NULL) ? 1 : 0;
2029
2030 /*
2031 * Try to maintain a local cache of the
2032 * mbuf clusters needed to complete this
2033 * write; the list is further limited to
2034 * the number currently needed to fill
2035 * the socket. This mechanism allows a
2036 * large number of mbufs/clusters to be
2037 * grabbed under a single mbuf lock.
2038 * If we can't get any clusters, then
2039 * fall back to trying for mbufs. If we
2040 * fail early (or miscalculate the
2041 * number needed), make sure to release
2042 * any clusters we haven't yet
2043 * consumed.
2044 */
2045 if (freelist == NULL &&
2046 bytes_to_alloc > MBIGCLBYTES &&
2047 jumbocl) {
2048 num_needed =
2049 bytes_to_alloc / M16KCLBYTES;
2050
2051 if ((bytes_to_alloc -
2052 (num_needed * M16KCLBYTES))
2053 >= MINCLSIZE)
2054 num_needed++;
2055
2056 freelist =
2057 m_getpackets_internal(
2058 (unsigned int *)&num_needed,
2059 hdrs_needed, M_WAIT, 0,
2060 M16KCLBYTES);
2061 /*
2062 * Fall back to 4K cluster size
2063 * if allocation failed
2064 */
2065 }
2066
2067 if (freelist == NULL &&
2068 bytes_to_alloc > MCLBYTES &&
2069 bigcl) {
2070 num_needed =
2071 bytes_to_alloc / MBIGCLBYTES;
2072
2073 if ((bytes_to_alloc -
2074 (num_needed * MBIGCLBYTES)) >=
2075 MINCLSIZE)
2076 num_needed++;
2077
2078 freelist =
2079 m_getpackets_internal(
2080 (unsigned int *)&num_needed,
2081 hdrs_needed, M_WAIT, 0,
2082 MBIGCLBYTES);
2083 /*
2084 * Fall back to cluster size
2085 * if allocation failed
2086 */
2087 }
2088
2089 /*
2090 * Allocate a cluster as we want to
2091 * avoid splitting the data into more
2092 * than one segment; using MINCLSIZE
2093 * would lead us to allocate two mbufs.
2094 */
2095 if (soreserveheadroom != 0 &&
2096 freelist == NULL &&
2097 ((top == NULL &&
2098 bytes_to_alloc > _MHLEN) ||
2099 bytes_to_alloc > _MLEN)) {
2100 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2101 MCLBYTES;
2102 freelist =
2103 m_getpackets_internal(
2104 (unsigned int *)&num_needed,
2105 hdrs_needed, M_WAIT, 0,
2106 MCLBYTES);
2107 /*
2108 * Fall back to a single mbuf
2109 * if allocation failed
2110 */
2111 } else if (freelist == NULL &&
2112 bytes_to_alloc > MINCLSIZE) {
2113 num_needed =
2114 bytes_to_alloc / MCLBYTES;
2115
2116 if ((bytes_to_alloc -
2117 (num_needed * MCLBYTES)) >=
2118 MINCLSIZE)
2119 num_needed++;
2120
2121 freelist =
2122 m_getpackets_internal(
2123 (unsigned int *)&num_needed,
2124 hdrs_needed, M_WAIT, 0,
2125 MCLBYTES);
2126 /*
2127 * Fall back to a single mbuf
2128 * if allocation failed
2129 */
2130 }
2131 /*
2132 * For datagram protocols, leave
2133 * headroom for protocol headers
2134 * in the first cluster of the chain
2135 */
2136 if (freelist != NULL && atomic &&
2137 top == NULL && headroom > 0) {
2138 freelist->m_data += headroom;
2139 }
2140
2141 /*
2142 * Fall back to regular mbufs without
2143 * reserving the socket headroom
2144 */
2145 if (freelist == NULL) {
2146 if (top == NULL)
2147 MGETHDR(freelist,
2148 M_WAIT, MT_DATA);
2149 else
2150 MGET(freelist,
2151 M_WAIT, MT_DATA);
2152
2153 if (freelist == NULL) {
2154 error = ENOBUFS;
2155 socket_lock(so, 0);
2156 goto release;
2157 }
2158 /*
2159 * For datagram protocols,
2160 * leave room for protocol
2161 * headers in first mbuf.
2162 */
2163 if (atomic && top == NULL &&
2164 bytes_to_copy < MHLEN) {
2165 MH_ALIGN(freelist,
2166 bytes_to_copy);
2167 }
2168 }
2169 m = freelist;
2170 freelist = m->m_next;
2171 m->m_next = NULL;
2172
2173 if ((m->m_flags & M_EXT))
2174 mlen = m->m_ext.ext_size -
2175 m_leadingspace(m);
2176 else if ((m->m_flags & M_PKTHDR))
2177 mlen =
2178 MHLEN - m_leadingspace(m);
2179 else
2180 mlen = MLEN - m_leadingspace(m);
2181 len = imin(mlen, bytes_to_copy);
2182
2183 chainlength += len;
2184
2185 space -= len;
2186
2187 error = uiomove(mtod(m, caddr_t),
2188 len, uio);
2189
2190 resid = uio_resid(uio);
2191
2192 m->m_len = len;
2193 *mp = m;
2194 top->m_pkthdr.len += len;
2195 if (error)
2196 break;
2197 mp = &m->m_next;
2198 if (resid <= 0) {
2199 if (flags & MSG_EOR)
2200 top->m_flags |= M_EOR;
2201 break;
2202 }
2203 bytes_to_copy = min(resid, space);
2204
2205 } while (space > 0 &&
2206 (chainlength < sosendmaxchain || atomic ||
2207 resid < MINCLSIZE));
2208
2209 socket_lock(so, 0);
2210
2211 if (error)
2212 goto release;
2213 }
2214
2215 if (flags & (MSG_HOLD|MSG_SEND)) {
2216 /* Enqueue for later, go away if HOLD */
2217 struct mbuf *mb1;
2218 if (so->so_temp && (flags & MSG_FLUSH)) {
2219 m_freem(so->so_temp);
2220 so->so_temp = NULL;
2221 }
2222 if (so->so_temp)
2223 so->so_tail->m_next = top;
2224 else
2225 so->so_temp = top;
2226 mb1 = top;
2227 while (mb1->m_next)
2228 mb1 = mb1->m_next;
2229 so->so_tail = mb1;
2230 if (flags & MSG_HOLD) {
2231 top = NULL;
2232 goto release;
2233 }
2234 top = so->so_temp;
2235 }
2236 if (dontroute)
2237 so->so_options |= SO_DONTROUTE;
2238
2239 /*
2240 * Compute flags here, for pru_send and NKEs.
2241 *
2242 * If the user set MSG_EOF, the protocol
2243 * understands this flag, and there is nothing left
2244 * to send, then use PRU_SEND_EOF instead of PRU_SEND.
2245 */
2246 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2247 ((flags & MSG_EOF) &&
2248 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2249 (resid <= 0)) ? PRUS_EOF :
2250 /* If there is more to send set PRUS_MORETOCOME */
2251 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2252
2253 if ((flags & MSG_SKIPCFIL) == 0) {
2254 /*
2255 * Socket filter processing
2256 */
2257 error = sflt_data_out(so, addr, &top,
2258 &control, (sendflags & MSG_OOB) ?
2259 sock_data_filt_flag_oob : 0);
2260 if (error) {
2261 if (error == EJUSTRETURN) {
2262 error = 0;
2263 clen = 0;
2264 control = NULL;
2265 top = NULL;
2266 }
2267 goto release;
2268 }
2269 #if CONTENT_FILTER
2270 /*
2271 * Content filter processing
2272 */
2273 error = cfil_sock_data_out(so, addr, top,
2274 control, (sendflags & MSG_OOB) ?
2275 sock_data_filt_flag_oob : 0);
2276 if (error) {
2277 if (error == EJUSTRETURN) {
2278 error = 0;
2279 clen = 0;
2280 control = NULL;
2281 top = NULL;
2282 }
2283 goto release;
2284 }
2285 #endif /* CONTENT_FILTER */
2286 }
2287 if (so->so_flags & SOF_ENABLE_MSGS) {
2288 /*
2289 * Make a copy of control mbuf,
2290 * so that msg priority can be
2291 * passed to subsequent mbufs.
2292 */
2293 control_copy = m_dup(control, M_NOWAIT);
2294 }
2295 error = (*so->so_proto->pr_usrreqs->pru_send)
2296 (so, sendflags, top, addr, control, p);
2297
2298 if (flags & MSG_SEND)
2299 so->so_temp = NULL;
2300
2301 if (dontroute)
2302 so->so_options &= ~SO_DONTROUTE;
2303
2304 clen = 0;
2305 control = control_copy;
2306 control_copy = NULL;
2307 top = NULL;
2308 mp = &top;
2309 if (error)
2310 goto release;
2311 } while (resid && space > 0);
2312 } while (resid);
2313
2314 release:
2315 if (sblocked)
2316 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2317 else
2318 socket_unlock(so, 1);
2319 out:
2320 if (top != NULL)
2321 m_freem(top);
2322 if (control != NULL)
2323 m_freem(control);
2324 if (freelist != NULL)
2325 m_freem_list(freelist);
2326 if (control_copy != NULL)
2327 m_freem(control_copy);
2328
2329 /*
2330 * One write has been done. This was enough. Get back to "normal"
2331 * behavior.
2332 */
2333 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2334 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2335
2336 if (en_tracing) {
2337 /* resid passed here is the bytes left in uio */
2338 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2339 VM_KERNEL_ADDRPERM(so),
2340 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2341 (int64_t)(orig_resid - resid));
2342 }
2343 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2344 so->so_snd.sb_cc, space, error);
2345
2346 return (error);
2347 }
2348
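/*
 * A hedged userspace sketch of two behaviors documented in the header
 * comment of sosend(): MSG_OOB takes the PRUS_OOB branch, and an atomic
 * (datagram) send larger than the send buffer high-water mark fails with
 * EMSGSIZE in sosendcheck(). The descriptors and the buffer size are
 * illustrative assumptions: tcp_fd is a connected SOCK_STREAM socket and
 * udp_fd a connected SOCK_DGRAM socket.
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static void
sosend_contract_demo(int tcp_fd, int udp_fd)
{
	static char big[64 * 1024];

	/* One out-of-band byte on the stream socket. */
	if (send(tcp_fd, "!", 1, MSG_OOB) < 0)
		perror("send MSG_OOB");

	/* A datagram exceeding sb_hiwat is rejected up front. */
	memset(big, 'x', sizeof (big));
	if (send(udp_fd, big, sizeof (big), 0) < 0 && errno == EMSGSIZE)
		printf("oversized datagram rejected with EMSGSIZE\n");
}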
2349 /*
2350 * Supported only for connected sockets (no address) without ancillary
2351 * data (control mbuf), and only for atomic protocols.
2352 */
2353 int
2354 sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2355 {
2356 struct mbuf *m, *freelist = NULL;
2357 user_ssize_t len, resid;
2358 int error, dontroute, mlen;
2359 int atomic = sosendallatonce(so);
2360 int sblocked = 0;
2361 struct proc *p = current_proc();
2362 u_int uiofirst = 0;
2363 u_int uiolast = 0;
2364 struct mbuf *top = NULL;
2365 uint16_t headroom = 0;
2366 boolean_t bigcl;
2367
2368 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2369 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2370
2371 if (so->so_type != SOCK_DGRAM) {
2372 error = EINVAL;
2373 goto out;
2374 }
2375 if (atomic == 0) {
2376 error = EINVAL;
2377 goto out;
2378 }
2379 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2380 error = EPROTONOSUPPORT;
2381 goto out;
2382 }
2383 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2384 error = EINVAL;
2385 goto out;
2386 }
2387 resid = uio_array_resid(uioarray, uiocnt);
2388
2389 /*
2390 * In theory resid should be unsigned.
2391 * However, space must be signed, as it might be less than 0
2392 * if we over-committed, and we must use a signed comparison
2393 * of space and resid. On the other hand, a negative resid
2394 * causes us to loop sending 0-length segments to the protocol.
2395 *
2396 * Note: We limit resid to be a positive int value as we use
2397 * imin() to set bytes_to_copy -- radr://14558484
2398 */
2399 if (resid < 0 || resid > INT_MAX) {
2400 error = EINVAL;
2401 goto out;
2402 }
2403
2404 socket_lock(so, 1);
2405 so_update_last_owner_locked(so, p);
2406 so_update_policy(so);
2407
2408 #if NECP
2409 so_update_necp_policy(so, NULL, NULL);
2410 #endif /* NECP */
2411
2412 dontroute = (flags & MSG_DONTROUTE) &&
2413 (so->so_options & SO_DONTROUTE) == 0 &&
2414 (so->so_proto->pr_flags & PR_ATOMIC);
2415 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2416
2417 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2418 &sblocked, NULL);
2419 if (error)
2420 goto release;
2421
2422 /*
2423 * Use big 4 KB clusters when the outgoing interface does not prefer
2424 * 2 KB clusters
2425 */
2426 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2427
2428 if (soreserveheadroom != 0)
2429 headroom = so->so_pktheadroom;
2430
2431 do {
2432 int i;
2433 int num_needed = 0;
2434 int chainlength;
2435 size_t maxpktlen = 0;
2436 int bytes_to_alloc;
2437
2438 if (sosendminchain > 0)
2439 chainlength = 0;
2440 else
2441 chainlength = sosendmaxchain;
2442
2443 socket_unlock(so, 0);
2444
2445 /*
2446 * Find a set of uio that fit in a reasonable number
2447 * of mbuf packets
2448 */
2449 for (i = uiofirst; i < uiocnt; i++) {
2450 struct uio *auio = uioarray[i];
2451
2452 len = uio_resid(auio);
2453
2454 /* Do nothing for empty messages */
2455 if (len == 0)
2456 continue;
2457
2458 num_needed += 1;
2459 uiolast += 1;
2460
2461 if (len > maxpktlen)
2462 maxpktlen = len;
2463
2464 chainlength += len;
2465 if (chainlength > sosendmaxchain)
2466 break;
2467 }
2468 /*
2469 * Nothing left to send
2470 */
2471 if (num_needed == 0) {
2472 socket_lock(so, 0);
2473 break;
2474 }
2475 /*
2476 * Allocate a buffer large enough to include headroom space
2477 * for the network and link headers.
2478 *
2479 */
2480 bytes_to_alloc = maxpktlen + headroom;
2481
2482 /*
2483 * Allocate a single contiguous buffer of the smallest available
2484 * size when possible
2485 */
2486 if (bytes_to_alloc > MCLBYTES &&
2487 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2488 freelist = m_getpackets_internal(
2489 (unsigned int *)&num_needed,
2490 num_needed, M_WAIT, 1,
2491 MBIGCLBYTES);
2492 } else if (bytes_to_alloc > _MHLEN &&
2493 bytes_to_alloc <= MCLBYTES) {
2494 freelist = m_getpackets_internal(
2495 (unsigned int *)&num_needed,
2496 num_needed, M_WAIT, 1,
2497 MCLBYTES);
2498 } else {
2499 freelist = m_allocpacket_internal(
2500 (unsigned int *)&num_needed,
2501 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2502 }
2503
2504 if (freelist == NULL) {
2505 socket_lock(so, 0);
2506 error = ENOMEM;
2507 goto release;
2508 }
2509 /*
2510 * Copy each uio of the set into its own mbuf packet
2511 */
2512 for (i = uiofirst, m = freelist;
2513 i < uiolast && m != NULL;
2514 i++) {
2515 int bytes_to_copy;
2516 struct mbuf *n;
2517 struct uio *auio = uioarray[i];
2518
2519 bytes_to_copy = uio_resid(auio);
2520
2521 /* Do nothing for empty messages */
2522 if (bytes_to_copy == 0)
2523 continue;
2524 /*
2525 * Leave headroom for protocol headers
2526 * in the first mbuf of the chain
2527 */
2528 m->m_data += headroom;
2529
2530 for (n = m; n != NULL; n = n->m_next) {
2531 if ((m->m_flags & M_EXT))
2532 mlen = m->m_ext.ext_size -
2533 m_leadingspace(m);
2534 else if ((m->m_flags & M_PKTHDR))
2535 mlen =
2536 MHLEN - m_leadingspace(m);
2537 else
2538 mlen = MLEN - m_leadingspace(m);
2539 len = imin(mlen, bytes_to_copy);
2540
2541 /*
2542 * Note: uiomove() decrements the iovec
2543 * length
2544 */
2545 error = uiomove(mtod(n, caddr_t),
2546 len, auio);
2547 if (error != 0)
2548 break;
2549 n->m_len = len;
2550 m->m_pkthdr.len += len;
2551
2552 VERIFY(m->m_pkthdr.len <= maxpktlen);
2553
2554 bytes_to_copy -= len;
2555 resid -= len;
2556 }
2557 if (m->m_pkthdr.len == 0) {
2558 printf(
2559 "%s:%d so %llx pkt %llx type %u len null\n",
2560 __func__, __LINE__,
2561 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2562 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2563 m->m_type);
2564 }
2565 if (error != 0)
2566 break;
2567 m = m->m_nextpkt;
2568 }
2569
2570 socket_lock(so, 0);
2571
2572 if (error)
2573 goto release;
2574 top = freelist;
2575 freelist = NULL;
2576
2577 if (dontroute)
2578 so->so_options |= SO_DONTROUTE;
2579
2580 if ((flags & MSG_SKIPCFIL) == 0) {
2581 struct mbuf **prevnextp = NULL;
2582
2583 for (i = uiofirst, m = top;
2584 i < uiolast && m != NULL;
2585 i++) {
2586 struct mbuf *nextpkt = m->m_nextpkt;
2587
2588 /*
2589 * Socket filter processing
2590 */
2591 error = sflt_data_out(so, NULL, &m,
2592 NULL, 0);
2593 if (error != 0 && error != EJUSTRETURN)
2594 goto release;
2595
2596 #if CONTENT_FILTER
2597 if (error == 0) {
2598 /*
2599 * Content filter processing
2600 */
2601 error = cfil_sock_data_out(so, NULL, m,
2602 NULL, 0);
2603 if (error != 0 && error != EJUSTRETURN)
2604 goto release;
2605 }
2606 #endif /* CONTENT_FILTER */
2607 /*
2608 * Remove packet from the list when
2609 * swallowed by a filter
2610 */
2611 if (error == EJUSTRETURN) {
2612 error = 0;
2613 if (prevnextp != NULL)
2614 *prevnextp = nextpkt;
2615 else
2616 top = nextpkt;
2617 }
2618
2619 m = nextpkt;
2620 if (m != NULL)
2621 prevnextp = &m->m_nextpkt;
2622 }
2623 }
2624 if (top != NULL)
2625 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2626 (so, 0, top, NULL, NULL, p);
2627
2628 if (dontroute)
2629 so->so_options &= ~SO_DONTROUTE;
2630
2631 top = NULL;
2632 uiofirst = uiolast;
2633 } while (resid > 0 && error == 0);
2634 release:
2635 if (sblocked)
2636 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2637 else
2638 socket_unlock(so, 1);
2639 out:
2640 if (top != NULL)
2641 m_freem(top);
2642 if (freelist != NULL)
2643 m_freem_list(freelist);
2644
2645 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2646 so->so_snd.sb_cc, 0, error);
2647
2648 return (error);
2649 }
2650
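/*
 * sosend_list() is driven by a batched send interface (on Darwin, the
 * private sendmsg_x() system call) so that several datagrams can be
 * queued to the protocol in one pass. As a rough application-level
 * approximation, each uio in the array corresponds to one independent
 * datagram, as in this hypothetical helper (the name and error handling
 * are assumptions, not the actual entry point):
 */
#include <sys/socket.h>
#include <sys/uio.h>

static int
send_datagram_batch(int fd, const struct iovec *iov, int cnt)
{
	int i;

	/* fd is assumed to be a connected SOCK_DGRAM socket. */
	for (i = 0; i < cnt; i++) {
		if (send(fd, iov[i].iov_base, iov[i].iov_len, 0) < 0)
			return (i > 0 ? i : -1);
	}
	return (i);
}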
2651 /*
2652 * May return ERESTART when packet is dropped by MAC policy check
2653 */
2654 static int
2655 soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2656 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2657 {
2658 int error = 0;
2659 struct mbuf *m = *mp;
2660 struct mbuf *nextrecord = *nextrecordp;
2661
2662 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2663 #if CONFIG_MACF_SOCKET_SUBSET
2664 /*
2665 * Call the MAC framework for policy checking if we're in
2666 * the user process context and the socket isn't connected.
2667 */
2668 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2669 struct mbuf *m0 = m;
2670 /*
2671 * Dequeue this record (temporarily) from the receive
2672 * list since we're about to drop the socket's lock
2673 * where a new record may arrive and be appended to
2674 * the list. Upon MAC policy failure, the record
2675 * will be freed. Otherwise, we'll add it back to
2676 * the head of the list. We cannot rely on SB_LOCK
2677 * because append operation uses the socket's lock.
2678 */
2679 do {
2680 m->m_nextpkt = NULL;
2681 sbfree(&so->so_rcv, m);
2682 m = m->m_next;
2683 } while (m != NULL);
2684 m = m0;
2685 so->so_rcv.sb_mb = nextrecord;
2686 SB_EMPTY_FIXUP(&so->so_rcv);
2687 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2688 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2689 socket_unlock(so, 0);
2690
2691 if (mac_socket_check_received(proc_ucred(p), so,
2692 mtod(m, struct sockaddr *)) != 0) {
2693 /*
2694 * MAC policy failure; free this record and
2695 * process the next record (or block until
2696 * one is available). We have adjusted sb_cc
2697 * and sb_mbcnt above so there is no need to
2698 * call sbfree() again.
2699 */
2700 m_freem(m);
2701 /*
2702 * Clear SB_LOCK but don't unlock the socket.
2703 * Process the next record or wait for one.
2704 */
2705 socket_lock(so, 0);
2706 sbunlock(&so->so_rcv, TRUE); /* stay locked */
2707 error = ERESTART;
2708 goto done;
2709 }
2710 socket_lock(so, 0);
2711 /*
2712 * If the socket has been defunct'd, drop it.
2713 */
2714 if (so->so_flags & SOF_DEFUNCT) {
2715 m_freem(m);
2716 error = ENOTCONN;
2717 goto done;
2718 }
2719 /*
2720 * Re-adjust the socket receive list and re-enqueue
2721 * the record in front of any packets which may have
2722 * been appended while we dropped the lock.
2723 */
2724 for (m = m0; m->m_next != NULL; m = m->m_next)
2725 sballoc(&so->so_rcv, m);
2726 sballoc(&so->so_rcv, m);
2727 if (so->so_rcv.sb_mb == NULL) {
2728 so->so_rcv.sb_lastrecord = m0;
2729 so->so_rcv.sb_mbtail = m;
2730 }
2731 m = m0;
2732 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2733 so->so_rcv.sb_mb = m;
2734 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2735 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2736 }
2737 #endif /* CONFIG_MACF_SOCKET_SUBSET */
2738 if (psa != NULL) {
2739 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
2740 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2741 error = EWOULDBLOCK;
2742 goto done;
2743 }
2744 }
2745 if (flags & MSG_PEEK) {
2746 m = m->m_next;
2747 } else {
2748 sbfree(&so->so_rcv, m);
2749 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
2750 panic("%s: about to create invalid socketbuf",
2751 __func__);
2752 /* NOTREACHED */
2753 }
2754 MFREE(m, so->so_rcv.sb_mb);
2755 m = so->so_rcv.sb_mb;
2756 if (m != NULL) {
2757 m->m_nextpkt = nextrecord;
2758 } else {
2759 so->so_rcv.sb_mb = nextrecord;
2760 SB_EMPTY_FIXUP(&so->so_rcv);
2761 }
2762 }
2763 done:
2764 *mp = m;
2765 *nextrecordp = nextrecord;
2766
2767 return (error);
2768 }
2769
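/*
 * The MT_SONAME mbuf consumed by soreceive_addr() above is what a
 * datagram receiver sees as the source address. A minimal userspace
 * sketch (fd is assumed to be a bound SOCK_DGRAM socket; the helper
 * name is illustrative):
 */
#include <stdio.h>
#include <sys/socket.h>

static void
recv_with_source(int fd)
{
	char buf[2048];
	struct sockaddr_storage from;
	socklen_t fromlen = sizeof (from);

	ssize_t n = recvfrom(fd, buf, sizeof (buf), 0,
	    (struct sockaddr *)&from, &fromlen);
	if (n >= 0)
		printf("got %zd bytes, address family %d\n", n, from.ss_family);
}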
2770 /*
2771 * Process one or more MT_CONTROL mbufs present before any data mbufs
2772 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2773 * just copy the data; if !MSG_PEEK, we call into the protocol to
2774 * perform externalization.
2775 */
2776 static int
2777 soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
2778 struct mbuf **mp, struct mbuf **nextrecordp)
2779 {
2780 int error = 0;
2781 struct mbuf *cm = NULL, *cmn;
2782 struct mbuf **cme = &cm;
2783 struct sockbuf *sb_rcv = &so->so_rcv;
2784 struct mbuf **msgpcm = NULL;
2785 struct mbuf *m = *mp;
2786 struct mbuf *nextrecord = *nextrecordp;
2787 struct protosw *pr = so->so_proto;
2788
2789 /*
2790 * Externalizing the control messages would require us to
2791 * drop the socket's lock below. Once we re-acquire the
2792 * lock, the mbuf chain might change. In order to preserve
2793 * consistency, we unlink all control messages from the
2794 * first mbuf chain in one shot and link them separately
2795 * onto a different chain.
2796 */
2797 do {
2798 if (flags & MSG_PEEK) {
2799 if (controlp != NULL) {
2800 if (*controlp == NULL) {
2801 msgpcm = controlp;
2802 }
2803 *controlp = m_copy(m, 0, m->m_len);
2804
2805 /*
2806 * If we failed to allocate an mbuf,
2807 * release any previously allocated
2808 * mbufs for control data, and return
2809 * an error. Keep the original mbufs
2810 * in the socket since the MSG_PEEK
2811 * flag is set.
2812 */
2813 if (*controlp == NULL) {
2814 m_freem(*msgpcm);
2815 error = ENOBUFS;
2816 goto done;
2817 }
2818 controlp = &(*controlp)->m_next;
2819 }
2820 m = m->m_next;
2821 } else {
2822 m->m_nextpkt = NULL;
2823 sbfree(sb_rcv, m);
2824 sb_rcv->sb_mb = m->m_next;
2825 m->m_next = NULL;
2826 *cme = m;
2827 cme = &(*cme)->m_next;
2828 m = sb_rcv->sb_mb;
2829 }
2830 } while (m != NULL && m->m_type == MT_CONTROL);
2831
2832 if (!(flags & MSG_PEEK)) {
2833 if (sb_rcv->sb_mb != NULL) {
2834 sb_rcv->sb_mb->m_nextpkt = nextrecord;
2835 } else {
2836 sb_rcv->sb_mb = nextrecord;
2837 SB_EMPTY_FIXUP(sb_rcv);
2838 }
2839 if (nextrecord == NULL)
2840 sb_rcv->sb_lastrecord = m;
2841 }
2842
2843 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
2844 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
2845
2846 while (cm != NULL) {
2847 int cmsg_type;
2848
2849 cmn = cm->m_next;
2850 cm->m_next = NULL;
2851 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
2852
2853 /*
2854 * Call the protocol to externalize SCM_RIGHTS message
2855 * and return the modified message to the caller upon
2856 * success. Otherwise, all other control messages are
2857 * returned unmodified to the caller. Note that we
2858 * only get into this loop if MSG_PEEK is not set.
2859 */
2860 if (pr->pr_domain->dom_externalize != NULL &&
2861 cmsg_type == SCM_RIGHTS) {
2862 /*
2863 * Release socket lock: see 3903171. This
2864 * would also allow more records to be appended
2865 * to the socket buffer. We still have SB_LOCK
2866 * set on it, so we can be sure that the head
2867 * of the mbuf chain won't change.
2868 */
2869 socket_unlock(so, 0);
2870 error = (*pr->pr_domain->dom_externalize)(cm);
2871 socket_lock(so, 0);
2872 } else {
2873 error = 0;
2874 }
2875
2876 if (controlp != NULL && error == 0) {
2877 *controlp = cm;
2878 controlp = &(*controlp)->m_next;
2879 } else {
2880 (void) m_free(cm);
2881 }
2882 cm = cmn;
2883 }
2884 /*
2885 * Update the value of nextrecord in case we received new
2886 * records when the socket was unlocked above for
2887 * externalizing SCM_RIGHTS.
2888 */
2889 if (m != NULL)
2890 nextrecord = sb_rcv->sb_mb->m_nextpkt;
2891 else
2892 nextrecord = sb_rcv->sb_mb;
2893
2894 done:
2895 *mp = m;
2896 *nextrecordp = nextrecord;
2897
2898 return (error);
2899 }
2900
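/*
 * The SCM_RIGHTS case handled by soreceive_ctl() above is what lets a
 * process receive a file descriptor over an AF_UNIX socket. A minimal
 * userspace sketch of the receiving side (sock is assumed to be a
 * connected AF_UNIX stream socket; the helper name is illustrative):
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

static int
recv_fd(int sock)
{
	char data;
	char cbuf[CMSG_SPACE(sizeof (int))];
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	struct msghdr msg;
	struct cmsghdr *cmsg;
	int fd = -1;

	memset(&msg, 0, sizeof (msg));
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof (cbuf);

	if (recvmsg(sock, &msg, 0) < 0)
		return (-1);

	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg != NULL;
	    cmsg = CMSG_NXTHDR(&msg, cmsg)) {
		if (cmsg->cmsg_level == SOL_SOCKET &&
		    cmsg->cmsg_type == SCM_RIGHTS) {
			memcpy(&fd, CMSG_DATA(cmsg), sizeof (fd));
			break;
		}
	}
	return (fd);
}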
2901 /*
2902 * Implement receive operations on a socket.
2903 * We depend on the way that records are added to the sockbuf
2904 * by sbappend*. In particular, each record (mbufs linked through m_next)
2905 * must begin with an address if the protocol so specifies,
2906 * followed by an optional mbuf or mbufs containing ancillary data,
2907 * and then zero or more mbufs of data.
2908 * In order to avoid blocking network interrupts for the entire time here,
2909 * we splx() while doing the actual copy to user space.
2910 * Although the sockbuf is locked, new data may still be appended,
2911 * and thus we must maintain consistency of the sockbuf during that time.
2912 *
2913 * The caller may receive the data as a single mbuf chain by supplying
2914 * an mbuf **mp0 for use in returning the chain. The uio is then used
2915 * only for the count in uio_resid.
2916 *
2917 * Returns: 0 Success
2918 * ENOBUFS
2919 * ENOTCONN
2920 * EWOULDBLOCK
2921 * uiomove:EFAULT
2922 * sblock:EWOULDBLOCK
2923 * sblock:EINTR
2924 * sbwait:EBADF
2925 * sbwait:EINTR
2926 * sodelayed_copy:EFAULT
2927 * <pru_rcvoob>:EINVAL[TCP]
2928 * <pru_rcvoob>:EWOULDBLOCK[TCP]
2929 * <pru_rcvoob>:???
2930 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
2931 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
2932 * <pr_domain->dom_externalize>:???
2933 *
2934 * Notes: Additional return values from calls through <pru_rcvoob> and
2935 * <pr_domain->dom_externalize> depend on protocols other than
2936 * TCP or AF_UNIX, which are documented above.
2937 */
2938 int
2939 soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
2940 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2941 {
2942 struct mbuf *m, **mp, *ml = NULL;
2943 struct mbuf *nextrecord, *free_list;
2944 int flags, error, offset;
2945 user_ssize_t len;
2946 struct protosw *pr = so->so_proto;
2947 int moff, type = 0;
2948 user_ssize_t orig_resid = uio_resid(uio);
2949 user_ssize_t delayed_copy_len;
2950 int can_delay;
2951 int need_event;
2952 struct proc *p = current_proc();
2953 boolean_t en_tracing = FALSE;
2954
2955 /*
2956 * Sanity check on the length passed by caller as we are making 'int'
2957 * comparisons
2958 */
2959 if (orig_resid < 0 || orig_resid > INT_MAX)
2960 return (EINVAL);
2961
2962 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
2963 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
2964 so->so_rcv.sb_hiwat);
2965
2966 socket_lock(so, 1);
2967 so_update_last_owner_locked(so, p);
2968 so_update_policy(so);
2969
2970 #ifdef MORE_LOCKING_DEBUG
2971 if (so->so_usecount == 1) {
2972 panic("%s: so=%x no other reference on socket\n", __func__, so);
2973 /* NOTREACHED */
2974 }
2975 #endif
2976 mp = mp0;
2977 if (psa != NULL)
2978 *psa = NULL;
2979 if (controlp != NULL)
2980 *controlp = NULL;
2981 if (flagsp != NULL)
2982 flags = *flagsp &~ MSG_EOR;
2983 else
2984 flags = 0;
2985
2986 /*
2987 * If a recv attempt is made on a previously-accepted socket
2988 * that has been marked as inactive (disconnected), reject
2989 * the request.
2990 */
2991 if (so->so_flags & SOF_DEFUNCT) {
2992 struct sockbuf *sb = &so->so_rcv;
2993
2994 error = ENOTCONN;
2995 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
2996 __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2997 SOCK_DOM(so), SOCK_TYPE(so), error));
2998 /*
2999 * This socket should have been disconnected and flushed
3000 * prior to being returned from sodefunct(); there should
3001 * be no data on its receive list, so panic otherwise.
3002 */
3003 if (so->so_state & SS_DEFUNCT)
3004 sb_empty_assert(sb, __func__);
3005 socket_unlock(so, 1);
3006 return (error);
3007 }
3008
3009 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3010 pr->pr_usrreqs->pru_preconnect) {
3011 /*
3012 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but never
3013 * call write() after that. *If* the app then calls read(), we do
3014 * not want to block that read indefinitely. Thus, we trigger a
3015 * connect so that the session gets initiated.
3016 */
3017 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3018
3019 if (error) {
3020 socket_unlock(so, 1);
3021 return (error);
3022 }
3023 }
3024
3025 if (ENTR_SHOULDTRACE &&
3026 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3027 /*
3028 * enable energy tracing for inet sockets that go over
3029 * non-loopback interfaces only.
3030 */
3031 struct inpcb *inp = sotoinpcb(so);
3032 if (inp->inp_last_outifp != NULL &&
3033 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3034 en_tracing = TRUE;
3035 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3036 VM_KERNEL_ADDRPERM(so),
3037 ((so->so_state & SS_NBIO) ?
3038 kEnTrFlagNonBlocking : 0),
3039 (int64_t)orig_resid);
3040 }
3041 }
3042
3043 /*
3044 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3045 * regardless of the flags argument. Here is the case where
3046 * out-of-band data is not inline.
3047 */
3048 if ((flags & MSG_OOB) ||
3049 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3050 (so->so_options & SO_OOBINLINE) == 0 &&
3051 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3052 m = m_get(M_WAIT, MT_DATA);
3053 if (m == NULL) {
3054 socket_unlock(so, 1);
3055 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3056 ENOBUFS, 0, 0, 0, 0);
3057 return (ENOBUFS);
3058 }
3059 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3060 if (error)
3061 goto bad;
3062 socket_unlock(so, 0);
3063 do {
3064 error = uiomove(mtod(m, caddr_t),
3065 imin(uio_resid(uio), m->m_len), uio);
3066 m = m_free(m);
3067 } while (uio_resid(uio) && error == 0 && m != NULL);
3068 socket_lock(so, 0);
3069 bad:
3070 if (m != NULL)
3071 m_freem(m);
3072
3073 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3074 if (error == EWOULDBLOCK || error == EINVAL) {
3075 /*
3076 * Let's try to get normal data:
3077 * EWOULDBLOCK: out-of-band data not
3078 * received yet. EINVAL: out-of-band data
3079 * already read.
3080 */
3081 error = 0;
3082 goto nooob;
3083 } else if (error == 0 && flagsp != NULL) {
3084 *flagsp |= MSG_OOB;
3085 }
3086 }
3087 socket_unlock(so, 1);
3088 if (en_tracing) {
3089 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3090 VM_KERNEL_ADDRPERM(so), 0,
3091 (int64_t)(orig_resid - uio_resid(uio)));
3092 }
3093 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3094 0, 0, 0, 0);
3095
3096 return (error);
3097 }
3098 nooob:
3099 if (mp != NULL)
3100 *mp = NULL;
3101
3102 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3103 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3104 }
3105
3106 free_list = NULL;
3107 delayed_copy_len = 0;
3108 restart:
3109 #ifdef MORE_LOCKING_DEBUG
3110 if (so->so_usecount <= 1)
3111 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3112 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3113 #endif
3114 /*
3115 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3116 * and if so just return to the caller. This could happen when
3117 * soreceive() is called by a socket upcall function during the
3118 * time the socket is freed. The socket buffer would have been
3119 * locked across the upcall, therefore we cannot put this thread
3120 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3121 * we may livelock), because the lock on the socket buffer will
3122 * only be released when the upcall routine returns to its caller.
3123 * Because the socket has been officially closed, there can be
3124 * no further read on it.
3125 *
3126 * A multipath subflow socket would have its SS_NOFDREF set by
3127 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3128 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3129 */
3130 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3131 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3132 socket_unlock(so, 1);
3133 return (0);
3134 }
3135
3136 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3137 if (error) {
3138 socket_unlock(so, 1);
3139 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3140 0, 0, 0, 0);
3141 if (en_tracing) {
3142 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3143 VM_KERNEL_ADDRPERM(so), 0,
3144 (int64_t)(orig_resid - uio_resid(uio)));
3145 }
3146 return (error);
3147 }
3148
3149 m = so->so_rcv.sb_mb;
3150 /*
3151 * If we have less data than requested, block awaiting more
3152 * (subject to any timeout) if:
3153 * 1. the current count is less than the low water mark, or
3154 * 2. MSG_WAITALL is set, and it is possible to do the entire
3155 * receive operation at once if we block (resid <= hiwat).
3156 * 3. MSG_DONTWAIT is not set
3157 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3158 * we have to do the receive in sections, and thus risk returning
3159 * a short count if a timeout or signal occurs after we start.
3160 */
3161 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3162 so->so_rcv.sb_cc < uio_resid(uio)) &&
3163 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3164 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3165 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3166 /*
3167 * Panic if we notice inconsistencies in the socket's
3168 * receive list; both sb_mb and sb_cc should correctly
3169 * reflect the contents of the list, otherwise we may
3170 * end up with false positives during select() or poll()
3171 * which could put the application in a bad state.
3172 */
3173 SB_MB_CHECK(&so->so_rcv);
3174
3175 if (so->so_error) {
3176 if (m != NULL)
3177 goto dontblock;
3178 error = so->so_error;
3179 if ((flags & MSG_PEEK) == 0)
3180 so->so_error = 0;
3181 goto release;
3182 }
3183 if (so->so_state & SS_CANTRCVMORE) {
3184 #if CONTENT_FILTER
3185 /*
3186 * Deal with half closed connections
3187 */
3188 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3189 cfil_sock_data_pending(&so->so_rcv) != 0)
3190 CFIL_LOG(LOG_INFO,
3191 "so %llx ignore SS_CANTRCVMORE",
3192 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3193 else
3194 #endif /* CONTENT_FILTER */
3195 if (m != NULL)
3196 goto dontblock;
3197 else
3198 goto release;
3199 }
3200 for (; m != NULL; m = m->m_next)
3201 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3202 m = so->so_rcv.sb_mb;
3203 goto dontblock;
3204 }
3205 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3206 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3207 error = ENOTCONN;
3208 goto release;
3209 }
3210 if (uio_resid(uio) == 0)
3211 goto release;
3212
3213 if ((so->so_state & SS_NBIO) ||
3214 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3215 error = EWOULDBLOCK;
3216 goto release;
3217 }
3218 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3219 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3220 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3221 #if EVEN_MORE_LOCKING_DEBUG
3222 if (socket_debug)
3223 printf("Waiting for socket data\n");
3224 #endif
3225
3226 error = sbwait(&so->so_rcv);
3227 #if EVEN_MORE_LOCKING_DEBUG
3228 if (socket_debug)
3229 printf("SORECEIVE - sbwait returned %d\n", error);
3230 #endif
3231 if (so->so_usecount < 1) {
3232 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3233 __func__, so, so->so_usecount);
3234 /* NOTREACHED */
3235 }
3236 if (error) {
3237 socket_unlock(so, 1);
3238 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3239 0, 0, 0, 0);
3240 if (en_tracing) {
3241 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3242 VM_KERNEL_ADDRPERM(so), 0,
3243 (int64_t)(orig_resid - uio_resid(uio)));
3244 }
3245 return (error);
3246 }
3247 goto restart;
3248 }
3249 dontblock:
3250 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3251 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3252 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3253 nextrecord = m->m_nextpkt;
3254
3255 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3256 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3257 mp0 == NULL);
3258 if (error == ERESTART)
3259 goto restart;
3260 else if (error != 0)
3261 goto release;
3262 orig_resid = 0;
3263 }
3264
3265 /*
3266 * Process one or more MT_CONTROL mbufs present before any data mbufs
3267 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3268 * just copy the data; if !MSG_PEEK, we call into the protocol to
3269 * perform externalization.
3270 */
3271 if (m != NULL && m->m_type == MT_CONTROL) {
3272 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3273 if (error != 0)
3274 goto release;
3275 orig_resid = 0;
3276 }
3277
3278 /*
3279 * If the socket is a TCP socket with message delivery
3280 * enabled, then create a control msg to deliver the
3281 * relative TCP sequence number for this data. Waiting
3282 * until this point will protect against failures to
3283 * allocate an mbuf for control msgs.
3284 */
3285 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3286 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3287 struct mbuf *seq_cm;
3288
3289 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3290 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3291 if (seq_cm == NULL) {
3292 /* unable to allocate a control mbuf */
3293 error = ENOBUFS;
3294 goto release;
3295 }
3296 *controlp = seq_cm;
3297 controlp = &seq_cm->m_next;
3298 }
3299
3300 if (m != NULL) {
3301 if (!(flags & MSG_PEEK)) {
3302 /*
3303 * We get here because m points to an mbuf following
3304 * any MT_SONAME or MT_CONTROL mbufs which have been
3305 * processed above. In any case, m should be pointing
3306 * to the head of the mbuf chain, and the nextrecord
3307 * should be either NULL or equal to m->m_nextpkt.
3308 * See comments above about SB_LOCK.
3309 */
3310 if (m != so->so_rcv.sb_mb ||
3311 m->m_nextpkt != nextrecord) {
3312 panic("%s: post-control !sync so=%p m=%p "
3313 "nextrecord=%p\n", __func__, so, m,
3314 nextrecord);
3315 /* NOTREACHED */
3316 }
3317 if (nextrecord == NULL)
3318 so->so_rcv.sb_lastrecord = m;
3319 }
3320 type = m->m_type;
3321 if (type == MT_OOBDATA)
3322 flags |= MSG_OOB;
3323 } else {
3324 if (!(flags & MSG_PEEK)) {
3325 SB_EMPTY_FIXUP(&so->so_rcv);
3326 }
3327 }
3328 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3329 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3330
3331 moff = 0;
3332 offset = 0;
3333
3334 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3335 can_delay = 1;
3336 else
3337 can_delay = 0;
3338
3339 need_event = 0;
3340
3341 while (m != NULL &&
3342 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3343 if (m->m_type == MT_OOBDATA) {
3344 if (type != MT_OOBDATA)
3345 break;
3346 } else if (type == MT_OOBDATA) {
3347 break;
3348 }
3349 /*
3350 * Make sure to always set the MSG_OOB flag when getting
3351 * out-of-band data inline.
3352 */
3353 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3354 (so->so_options & SO_OOBINLINE) != 0 &&
3355 (so->so_state & SS_RCVATMARK) != 0) {
3356 flags |= MSG_OOB;
3357 }
3358 so->so_state &= ~SS_RCVATMARK;
3359 len = uio_resid(uio) - delayed_copy_len;
3360 if (so->so_oobmark && len > so->so_oobmark - offset)
3361 len = so->so_oobmark - offset;
3362 if (len > m->m_len - moff)
3363 len = m->m_len - moff;
3364 /*
3365 * If mp is set, just pass back the mbufs.
3366 * Otherwise copy them out via the uio, then free.
3367 * Sockbuf must be consistent here (points to current mbuf,
3368 * it points to next record) when we drop priority;
3369 * we must note any additions to the sockbuf when we
3370 * block interrupts again.
3371 */
3372 if (mp == NULL) {
3373 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3374 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3375 if (can_delay && len == m->m_len) {
3376 /*
3377 * Only delay the copy if we're consuming the
3378 * mbuf, we're NOT in MSG_PEEK mode, and we
3379 * have enough data to make it worthwhile to
3380 * drop and retake the lock... can_delay
3381 * reflects the state of the two latter
3382 * constraints; moff should always be zero
3383 * in these cases.
3384 */
3385 delayed_copy_len += len;
3386 } else {
3387 if (delayed_copy_len) {
3388 error = sodelayed_copy(so, uio,
3389 &free_list, &delayed_copy_len);
3390
3391 if (error) {
3392 goto release;
3393 }
3394 /*
3395 * We can only get here if MSG_PEEK is
3396 * not set; therefore, m should point at
3397 * the head of the rcv queue. If it
3398 * doesn't, something drastically
3399 * changed while we were out from behind
3400 * the lock in sodelayed_copy, perhaps
3401 * a RST on the stream. In any event,
3402 * the stream has been interrupted; it's
3403 * probably best just to return whatever
3404 * data we've moved and let the caller
3405 * sort it out...
3406 */
3407 if (m != so->so_rcv.sb_mb) {
3408 break;
3409 }
3410 }
3411 socket_unlock(so, 0);
3412 error = uiomove(mtod(m, caddr_t) + moff,
3413 (int)len, uio);
3414 socket_lock(so, 0);
3415
3416 if (error)
3417 goto release;
3418 }
3419 } else {
3420 uio_setresid(uio, (uio_resid(uio) - len));
3421 }
3422 if (len == m->m_len - moff) {
3423 if (m->m_flags & M_EOR)
3424 flags |= MSG_EOR;
3425 if (flags & MSG_PEEK) {
3426 m = m->m_next;
3427 moff = 0;
3428 } else {
3429 nextrecord = m->m_nextpkt;
3430 sbfree(&so->so_rcv, m);
3431 m->m_nextpkt = NULL;
3432
3433 /*
3434 * If this packet is an unordered packet
3435 * (indicated by M_UNORDERED_DATA flag), remove
3436 * the additional bytes added to the
3437 * receive socket buffer size.
3438 */
3439 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3440 m->m_len &&
3441 (m->m_flags & M_UNORDERED_DATA) &&
3442 sbreserve(&so->so_rcv,
3443 so->so_rcv.sb_hiwat - m->m_len)) {
3444 if (so->so_msg_state->msg_uno_bytes >
3445 m->m_len) {
3446 so->so_msg_state->
3447 msg_uno_bytes -= m->m_len;
3448 } else {
3449 so->so_msg_state->
3450 msg_uno_bytes = 0;
3451 }
3452 m->m_flags &= ~M_UNORDERED_DATA;
3453 }
3454
3455 if (mp != NULL) {
3456 *mp = m;
3457 mp = &m->m_next;
3458 so->so_rcv.sb_mb = m = m->m_next;
3459 *mp = NULL;
3460 } else {
3461 if (free_list == NULL)
3462 free_list = m;
3463 else
3464 ml->m_next = m;
3465 ml = m;
3466 so->so_rcv.sb_mb = m = m->m_next;
3467 ml->m_next = NULL;
3468 }
3469 if (m != NULL) {
3470 m->m_nextpkt = nextrecord;
3471 if (nextrecord == NULL)
3472 so->so_rcv.sb_lastrecord = m;
3473 } else {
3474 so->so_rcv.sb_mb = nextrecord;
3475 SB_EMPTY_FIXUP(&so->so_rcv);
3476 }
3477 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3478 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3479 }
3480 } else {
3481 if (flags & MSG_PEEK) {
3482 moff += len;
3483 } else {
3484 if (mp != NULL) {
3485 int copy_flag;
3486
3487 if (flags & MSG_DONTWAIT)
3488 copy_flag = M_DONTWAIT;
3489 else
3490 copy_flag = M_WAIT;
3491 *mp = m_copym(m, 0, len, copy_flag);
3492 /*
3493 * Failed to allocate an mbuf?
3494 * Adjust uio_resid back, it was
3495 * adjusted down by len bytes which
3496 * we didn't copy over.
3497 */
3498 if (*mp == NULL) {
3499 uio_setresid(uio,
3500 (uio_resid(uio) + len));
3501 break;
3502 }
3503 }
3504 m->m_data += len;
3505 m->m_len -= len;
3506 so->so_rcv.sb_cc -= len;
3507 }
3508 }
3509 if (so->so_oobmark) {
3510 if ((flags & MSG_PEEK) == 0) {
3511 so->so_oobmark -= len;
3512 if (so->so_oobmark == 0) {
3513 so->so_state |= SS_RCVATMARK;
3514 /*
3515 * delay posting the actual event until
3516 * after any delayed copy processing
3517 * has finished
3518 */
3519 need_event = 1;
3520 break;
3521 }
3522 } else {
3523 offset += len;
3524 if (offset == so->so_oobmark)
3525 break;
3526 }
3527 }
3528 if (flags & MSG_EOR)
3529 break;
3530 /*
3531 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3532 * (for non-atomic socket), we must not quit until
3533 * "uio->uio_resid == 0" or an error termination.
3534 * If a signal/timeout occurs, return with a short
3535 * count but without error. Keep sockbuf locked
3536 * against other readers.
3537 */
3538 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3539 (uio_resid(uio) - delayed_copy_len) > 0 &&
3540 !sosendallatonce(so) && !nextrecord) {
3541 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3542 #if CONTENT_FILTER
3543 && cfil_sock_data_pending(&so->so_rcv) == 0
3544 #endif /* CONTENT_FILTER */
3545 ))
3546 goto release;
3547
3548 /*
3549 * Depending on the protocol (e.g. TCP), the following
3550 * might cause the socket lock to be dropped and later
3551 * be reacquired, and more data could have arrived and
3552 * have been appended to the receive socket buffer by
3553 * the time it returns. Therefore, we only sleep in
3554 * sbwait() below if and only if the socket buffer is
3555 * empty, in order to avoid a false sleep.
3556 */
3557 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3558 (((struct inpcb *)so->so_pcb)->inp_state !=
3559 INPCB_STATE_DEAD))
3560 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3561
3562 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3563 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3564
3565 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3566 error = 0;
3567 goto release;
3568 }
3569 /*
3570 * We have to wait until after we get back from the sbwait
3571 * to do the copy, because we will drop the lock if we
3572 * have enough data that has been delayed. By dropping
3573 * the lock we open up a window allowing the netisr
3574 * thread to process the incoming packets and to change
3575 * the state of this socket. We're issuing the sbwait
3576 * because the socket is empty and we're expecting the
3577 * netisr thread to wake us up when more packets arrive;
3578 * if we allow that processing to happen and then sbwait,
3579 * we could stall forever with packets sitting in the
3580 * socket if no further packets arrive from the remote
3581 * side.
3582 *
3583 * We want to copy before we've collected all the data
3584 * needed to satisfy this request, to allow the copy to
3585 * overlap the incoming packet processing on an MP system.
3586 */
3587 if (delayed_copy_len > sorecvmincopy &&
3588 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3589 error = sodelayed_copy(so, uio,
3590 &free_list, &delayed_copy_len);
3591
3592 if (error)
3593 goto release;
3594 }
3595 m = so->so_rcv.sb_mb;
3596 if (m != NULL) {
3597 nextrecord = m->m_nextpkt;
3598 }
3599 SB_MB_CHECK(&so->so_rcv);
3600 }
3601 }
3602 #ifdef MORE_LOCKING_DEBUG
3603 if (so->so_usecount <= 1) {
3604 panic("%s: after big while so=%p ref=%d on socket\n",
3605 __func__, so, so->so_usecount);
3606 /* NOTREACHED */
3607 }
3608 #endif
3609
3610 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3611 if (so->so_options & SO_DONTTRUNC) {
3612 flags |= MSG_RCVMORE;
3613 } else {
3614 flags |= MSG_TRUNC;
3615 if ((flags & MSG_PEEK) == 0)
3616 (void) sbdroprecord(&so->so_rcv);
3617 }
3618 }
3619
3620 /*
3621 * pru_rcvd below (for TCP) may cause more data to be received
3622 * if the socket lock is dropped prior to sending the ACK; some
3623 * legacy OpenTransport applications don't handle this well
3624 * (if it receives less data than requested while MSG_HAVEMORE
3625 * is set), and so we set the flag now based on what we know
3626 * prior to calling pru_rcvd.
3627 */
3628 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3629 flags |= MSG_HAVEMORE;
3630
3631 if ((flags & MSG_PEEK) == 0) {
3632 if (m == NULL) {
3633 so->so_rcv.sb_mb = nextrecord;
3634 /*
3635 * First part is an inline SB_EMPTY_FIXUP(). Second
3636 * part makes sure sb_lastrecord is up-to-date if
3637 * there is still data in the socket buffer.
3638 */
3639 if (so->so_rcv.sb_mb == NULL) {
3640 so->so_rcv.sb_mbtail = NULL;
3641 so->so_rcv.sb_lastrecord = NULL;
3642 } else if (nextrecord->m_nextpkt == NULL) {
3643 so->so_rcv.sb_lastrecord = nextrecord;
3644 }
3645 SB_MB_CHECK(&so->so_rcv);
3646 }
3647 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3648 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3649 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3650 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3651 }
3652
3653 if (delayed_copy_len) {
3654 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3655 if (error)
3656 goto release;
3657 }
3658 if (free_list != NULL) {
3659 m_freem_list(free_list);
3660 free_list = NULL;
3661 }
3662 if (need_event)
3663 postevent(so, 0, EV_OOB);
3664
3665 if (orig_resid == uio_resid(uio) && orig_resid &&
3666 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3667 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3668 goto restart;
3669 }
3670
3671 if (flagsp != NULL)
3672 *flagsp |= flags;
3673 release:
3674 #ifdef MORE_LOCKING_DEBUG
3675 if (so->so_usecount <= 1) {
3676 panic("%s: release so=%p ref=%d on socket\n", __func__,
3677 so, so->so_usecount);
3678 /* NOTREACHED */
3679 }
3680 #endif
3681 if (delayed_copy_len)
3682 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3683
3684 if (free_list != NULL)
3685 m_freem_list(free_list);
3686
3687 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3688
3689 if (en_tracing) {
3690 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3691 VM_KERNEL_ADDRPERM(so),
3692 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3693 (int64_t)(orig_resid - uio_resid(uio)));
3694 }
3695 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3696 so->so_rcv.sb_cc, 0, error);
3697
3698 return (error);
3699 }
3700
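/*
 * A minimal userspace sketch of two flags documented above: MSG_PEEK
 * leaves the record queued (soreceive() walks the chain without freeing
 * it), so a later read sees the same bytes, while MSG_WAITALL keeps
 * soreceive() looping in sbwait() until the full count arrives, the
 * connection closes, or an error/signal occurs. fd is assumed to be a
 * connected stream socket; the helper name is illustrative.
 */
#include <sys/types.h>
#include <sys/socket.h>

static ssize_t
peek_then_read_all(int fd, void *buf, size_t len)
{
	ssize_t peeked = recv(fd, buf, len, MSG_PEEK);
	if (peeked <= 0)
		return (peeked);
	return (recv(fd, buf, len, MSG_WAITALL));
}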
3701 /*
3702 * Returns: 0 Success
3703 * uiomove:EFAULT
3704 */
3705 static int
3706 sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3707 user_ssize_t *resid)
3708 {
3709 int error = 0;
3710 struct mbuf *m;
3711
3712 m = *free_list;
3713
3714 socket_unlock(so, 0);
3715
3716 while (m != NULL && error == 0) {
3717 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3718 m = m->m_next;
3719 }
3720 m_freem_list(*free_list);
3721
3722 *free_list = NULL;
3723 *resid = 0;
3724
3725 socket_lock(so, 0);
3726
3727 return (error);
3728 }
3729
3730 static int
3731 sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3732 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3733 {
3734 #pragma unused(so)
3735 int error = 0;
3736 struct mbuf *ml, *m;
3737 int i = 0;
3738 struct uio *auio;
3739
3740 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3741 ml = ml->m_nextpkt, i++) {
3742 auio = msgarray[i].uio;
3743 for (m = ml; m != NULL; m = m->m_next) {
3744 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3745 if (error != 0)
3746 goto out;
3747 }
3748 }
3749 out:
3750 m_freem_list(*free_list);
3751
3752 *free_list = NULL;
3753 *resid = 0;
3754
3755 return (error);
3756 }
3757
3758 int
3759 soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3760 int *flagsp)
3761 {
3762 struct mbuf *m;
3763 struct mbuf *nextrecord;
3764 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3765 int error;
3766 user_ssize_t len, pktlen, delayed_copy_len = 0;
3767 struct protosw *pr = so->so_proto;
3768 user_ssize_t resid;
3769 struct proc *p = current_proc();
3770 struct uio *auio = NULL;
3771 int npkts = 0;
3772 int sblocked = 0;
3773 struct sockaddr **psa = NULL;
3774 struct mbuf **controlp = NULL;
3775 int can_delay;
3776 int flags;
3777 struct mbuf *free_others = NULL;
3778
3779 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3780 so, uiocnt,
3781 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3782
3783 /*
3784 * Sanity checks:
3785 * - Only supports don't wait flags
3786 * - Only support datagram sockets (could be extended to raw)
3787 * - Must be atomic
3788 * - Protocol must support packet chains
3789 * - The uio array is NULL (should we panic?)
3790 */
3791 if (flagsp != NULL)
3792 flags = *flagsp;
3793 else
3794 flags = 0;
3795 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3796 MSG_NBIO)) {
3797 printf("%s invalid flags 0x%x\n", __func__, flags);
3798 error = EINVAL;
3799 goto out;
3800 }
3801 if (so->so_type != SOCK_DGRAM) {
3802 error = EINVAL;
3803 goto out;
3804 }
3805 if (sosendallatonce(so) == 0) {
3806 error = EINVAL;
3807 goto out;
3808 }
3809 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
3810 error = EPROTONOSUPPORT;
3811 goto out;
3812 }
3813 if (msgarray == NULL) {
3814 printf("%s uioarray is NULL\n", __func__);
3815 error = EINVAL;
3816 goto out;
3817 }
3818 if (uiocnt == 0) {
3819 printf("%s uiocnt is 0\n", __func__);
3820 error = EINVAL;
3821 goto out;
3822 }
3823 /*
3824 * Sanity check on the length passed by caller as we are making 'int'
3825 * comparisons
3826 */
3827 resid = recv_msg_array_resid(msgarray, uiocnt);
3828 if (resid < 0 || resid > INT_MAX) {
3829 error = EINVAL;
3830 goto out;
3831 }
3832
3833 if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
3834 can_delay = 1;
3835 else
3836 can_delay = 0;
3837
3838 socket_lock(so, 1);
3839 so_update_last_owner_locked(so, p);
3840 so_update_policy(so);
3841
3842 #if NECP
3843 so_update_necp_policy(so, NULL, NULL);
3844 #endif /* NECP */
3845
3846 /*
3847 * If a recv attempt is made on a previously-accepted socket
3848 * that has been marked as inactive (disconnected), reject
3849 * the request.
3850 */
3851 if (so->so_flags & SOF_DEFUNCT) {
3852 struct sockbuf *sb = &so->so_rcv;
3853
3854 error = ENOTCONN;
3855 SODEFUNCTLOG(("%s[%d]: defunct so 0x%llx [%d,%d] (%d)\n",
3856 __func__, proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3857 SOCK_DOM(so), SOCK_TYPE(so), error));
3858 /*
3859 * This socket should have been disconnected and flushed
3860 * prior to being returned from sodefunct(); there should
3861 * be no data on its receive list, so panic otherwise.
3862 */
3863 if (so->so_state & SS_DEFUNCT)
3864 sb_empty_assert(sb, __func__);
3865 goto release;
3866 }
3867
3868 next:
3869 /*
3870 * The uio may be empty
3871 */
3872 if (npkts >= uiocnt) {
3873 error = 0;
3874 goto release;
3875 }
3876 restart:
3877 /*
3878 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3879 * and if so just return to the caller. This could happen when
3880 * soreceive() is called by a socket upcall function during the
3881 * time the socket is freed. The socket buffer would have been
3882 * locked across the upcall, therefore we cannot put this thread
3883 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3884 * we may livelock), because the lock on the socket buffer will
3885 * only be released when the upcall routine returns to its caller.
3886 * Because the socket has been officially closed, there can be
3887 * no further read on it.
3888 */
3889 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3890 (SS_NOFDREF | SS_CANTRCVMORE)) {
3891 error = 0;
3892 goto release;
3893 }
3894
3895 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3896 if (error) {
3897 goto release;
3898 }
3899 sblocked = 1;
3900
3901 m = so->so_rcv.sb_mb;
3902 /*
3903 * Block awaiting more datagrams if needed
3904 */
3905 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3906 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3907 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
3908 /*
3909 * Panic if we notice inconsistencies in the socket's
3910 * receive list; both sb_mb and sb_cc should correctly
3911 * reflect the contents of the list, otherwise we may
3912 * end up with false positives during select() or poll()
3913 * which could put the application in a bad state.
3914 */
3915 SB_MB_CHECK(&so->so_rcv);
3916
3917 if (so->so_error) {
3918 error = so->so_error;
3919 if ((flags & MSG_PEEK) == 0)
3920 so->so_error = 0;
3921 goto release;
3922 }
3923 if (so->so_state & SS_CANTRCVMORE) {
3924 goto release;
3925 }
3926 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3927 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3928 error = ENOTCONN;
3929 goto release;
3930 }
3931 if ((so->so_state & SS_NBIO) ||
3932 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3933 error = EWOULDBLOCK;
3934 goto release;
3935 }
3936 /*
3937 * Do not block if we got some data
3938 */
3939 if (free_list != NULL) {
3940 error = 0;
3941 goto release;
3942 }
3943
3944 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3945 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3946
3947 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3948 sblocked = 0;
3949
3950 error = sbwait(&so->so_rcv);
3951 if (error) {
3952 goto release;
3953 }
3954 goto restart;
3955 }
3956
3957 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3958 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3959 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3960
3961 /*
3962 * Consume the current uio index as we have a datagram
3963 */
3964 auio = msgarray[npkts].uio;
3965 resid = uio_resid(auio);
3966 msgarray[npkts].which |= SOCK_MSG_DATA;
3967 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
3968 &msgarray[npkts].psa : NULL;
3969 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
3970 &msgarray[npkts].controlp : NULL;
3971 npkts += 1;
3972 nextrecord = m->m_nextpkt;
3973
3974 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3975 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
3976 if (error == ERESTART)
3977 goto restart;
3978 else if (error != 0)
3979 goto release;
3980 }
3981
3982 if (m != NULL && m->m_type == MT_CONTROL) {
3983 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3984 if (error != 0)
3985 goto release;
3986 }
3987
3988 if (m->m_pkthdr.len == 0) {
3989 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
3990 __func__, __LINE__,
3991 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3992 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
3993 m->m_type);
3994 }
3995
3996 /*
3997 * Loop to copy the mbufs of the current record
3998 * Support zero length packets
3999 */
4000 ml = NULL;
4001 pktlen = 0;
4002 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4003 if (m->m_len == 0)
4004 panic("%p m_len zero", m);
4005 if (m->m_type == 0)
4006 panic("%p m_type zero", m);
4007 /*
4008 * Clip to the residual length
4009 */
4010 if (len > m->m_len)
4011 len = m->m_len;
4012 pktlen += len;
4013 /*
4014 * Copy the mbufs via the uio or delay the copy.
4015 * The sockbuf must be consistent here (sb_mb points to the
4016 * current mbuf, m_nextpkt to the next record) when we drop
4017 * priority; we must note any additions to the sockbuf when
4018 * we block interrupts again.
4019 */
4020 if (len > 0 && can_delay == 0) {
4021 socket_unlock(so, 0);
4022 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4023 socket_lock(so, 0);
4024 if (error)
4025 goto release;
4026 } else {
4027 delayed_copy_len += len;
4028 }
4029
4030 if (len == m->m_len) {
4031 /*
4032 * m was entirely copied
4033 */
4034 sbfree(&so->so_rcv, m);
4035 nextrecord = m->m_nextpkt;
4036 m->m_nextpkt = NULL;
4037
4038 /*
4039 * Set the first packet to the head of the free list
4040 */
4041 if (free_list == NULL)
4042 free_list = m;
4043 /*
4044 * Link current packet to tail of free list
4045 */
4046 if (ml == NULL) {
4047 if (free_tail != NULL)
4048 free_tail->m_nextpkt = m;
4049 free_tail = m;
4050 }
4051 /*
4052 * Link current mbuf to last mbuf of current packet
4053 */
4054 if (ml != NULL)
4055 ml->m_next = m;
4056 ml = m;
4057
4058 /*
4059 * Move next buf to head of socket buffer
4060 */
4061 so->so_rcv.sb_mb = m = ml->m_next;
4062 ml->m_next = NULL;
4063
4064 if (m != NULL) {
4065 m->m_nextpkt = nextrecord;
4066 if (nextrecord == NULL)
4067 so->so_rcv.sb_lastrecord = m;
4068 } else {
4069 so->so_rcv.sb_mb = nextrecord;
4070 SB_EMPTY_FIXUP(&so->so_rcv);
4071 }
4072 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4073 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4074 } else {
4075 /*
4076 * Stop the loop on partial copy
4077 */
4078 break;
4079 }
4080 }
4081 #ifdef MORE_LOCKING_DEBUG
4082 if (so->so_usecount <= 1) {
4083 panic("%s: after big while so=%llx ref=%d on socket\n",
4084 __func__,
4085 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4086 /* NOTREACHED */
4087 }
4088 #endif
4089 /*
4090 * Tell the caller we made a partial copy
4091 */
4092 if (m != NULL) {
4093 if (so->so_options & SO_DONTTRUNC) {
4094 /*
4095 * Copyout first the freelist then the partial mbuf
4096 */
4097 socket_unlock(so, 0);
4098 if (delayed_copy_len)
4099 error = sodelayed_copy_list(so, msgarray,
4100 uiocnt, &free_list, &delayed_copy_len);
4101
4102 if (error == 0) {
4103 error = uiomove(mtod(m, caddr_t), (int)len,
4104 auio);
4105 }
4106 socket_lock(so, 0);
4107 if (error)
4108 goto release;
4109
4110 m->m_data += len;
4111 m->m_len -= len;
4112 so->so_rcv.sb_cc -= len;
4113 flags |= MSG_RCVMORE;
4114 } else {
4115 (void) sbdroprecord(&so->so_rcv);
4116 nextrecord = so->so_rcv.sb_mb;
4117 m = NULL;
4118 flags |= MSG_TRUNC;
4119 }
4120 }
4121
4122 if (m == NULL) {
4123 so->so_rcv.sb_mb = nextrecord;
4124 /*
4125 * First part is an inline SB_EMPTY_FIXUP(). Second
4126 * part makes sure sb_lastrecord is up-to-date if
4127 * there is still data in the socket buffer.
4128 */
4129 if (so->so_rcv.sb_mb == NULL) {
4130 so->so_rcv.sb_mbtail = NULL;
4131 so->so_rcv.sb_lastrecord = NULL;
4132 } else if (nextrecord->m_nextpkt == NULL) {
4133 so->so_rcv.sb_lastrecord = nextrecord;
4134 }
4135 SB_MB_CHECK(&so->so_rcv);
4136 }
4137 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4138 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4139
4140 /*
4141 * We can continue to the next packet as long as:
4142 * - We haven't exhausted the uio array
4143 * - There was no error
4144 * - A packet was not truncated
4145 * - We can still receive more data
4146 */
4147 if (npkts < uiocnt && error == 0 &&
4148 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4149 (so->so_state & SS_CANTRCVMORE) == 0) {
4150 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4151 sblocked = 0;
4152
4153 goto next;
4154 }
4155 if (flagsp != NULL)
4156 *flagsp |= flags;
4157
4158 release:
4159 /*
4160 * pru_rcvd may cause more data to be received if the socket lock
4161 * is dropped so we set MSG_HAVEMORE now based on what we know.
4162 * That way the caller won't be surprised if it receives less data
4163 * than requested.
4164 */
4165 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4166 flags |= MSG_HAVEMORE;
4167
4168 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4169 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4170
4171 if (sblocked)
4172 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4173 else
4174 socket_unlock(so, 1);
4175
4176 if (delayed_copy_len)
4177 error = sodelayed_copy_list(so, msgarray, uiocnt,
4178 &free_list, &delayed_copy_len);
4179 out:
4180 /*
4181 * Amortize the cost of freeing the mbufs
4182 */
4183 if (free_list != NULL)
4184 m_freem_list(free_list);
4185 if (free_others != NULL)
4186 m_freem_list(free_others);
4187
4188 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4189 0, 0, 0, 0);
4190 return (error);
4191 }
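
/*
 * Illustrative sketch, not part of the build: soreceive_list() above
 * consumes so_rcv one record (datagram) at a time; records are chained
 * through m_nextpkt and the mbufs of a record through m_next, optionally
 * led by MT_SONAME/MT_CONTROL mbufs.  The helper below is hypothetical
 * (example_dump_rcv_records is not a real XNU function) and assumes the
 * socket is locked and the receive sockbuf is held.
 */
#if 0
static void
example_dump_rcv_records(struct socket *so)
{
	struct mbuf *rec, *m;
	int n = 0;

	for (rec = so->so_rcv.sb_mb; rec != NULL; rec = rec->m_nextpkt) {
		long bytes = 0;

		for (m = rec; m != NULL; m = m->m_next) {
			/* skip address/control mbufs, count only data */
			if (m->m_type == MT_DATA ||
			    m->m_type == MT_HEADER ||
			    m->m_type == MT_OOBDATA)
				bytes += m->m_len;
		}
		printf("record %d: %ld data bytes\n", n++, bytes);
	}
}
#endif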
4192
4193 /*
4194 * Returns: 0 Success
4195 * EINVAL
4196 * ENOTCONN
4197 * <pru_shutdown>:EINVAL
4198 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4199 * <pru_shutdown>:ENOBUFS[TCP]
4200 * <pru_shutdown>:EMSGSIZE[TCP]
4201 * <pru_shutdown>:EHOSTUNREACH[TCP]
4202 * <pru_shutdown>:ENETUNREACH[TCP]
4203 * <pru_shutdown>:ENETDOWN[TCP]
4204 * <pru_shutdown>:ENOMEM[TCP]
4205 * <pru_shutdown>:EACCES[TCP]
4206 * <pru_shutdown>:EMSGSIZE[TCP]
4207 * <pru_shutdown>:ENOBUFS[TCP]
4208 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4209 * <pru_shutdown>:??? [other protocol families]
4210 */
4211 int
4212 soshutdown(struct socket *so, int how)
4213 {
4214 int error;
4215
4216 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4217
4218 switch (how) {
4219 case SHUT_RD:
4220 case SHUT_WR:
4221 case SHUT_RDWR:
4222 socket_lock(so, 1);
4223 if ((so->so_state &
4224 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4225 error = ENOTCONN;
4226 } else {
4227 error = soshutdownlock(so, how);
4228 }
4229 socket_unlock(so, 1);
4230 break;
4231 default:
4232 error = EINVAL;
4233 break;
4234 }
4235
4236 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4237
4238 return (error);
4239 }
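
/*
 * Illustrative user-land sketch, not part of the build: it exercises the
 * three "how" values accepted by soshutdown() above and the ENOTCONN case
 * for sockets that were never connected.  The descriptor name "fd" and the
 * error handling are illustrative assumptions.
 */
#if 0
#include <sys/socket.h>
#include <errno.h>
#include <stdio.h>

static void
example_shutdown_usage(int fd)
{
	/* stop further sends (a FIN is sent on a connected TCP socket) */
	if (shutdown(fd, SHUT_WR) == -1 && errno == ENOTCONN)
		printf("socket was not connected\n");

	/* stop further receives only */
	(void) shutdown(fd, SHUT_RD);

	/* or both directions at once; any other "how" yields EINVAL */
	(void) shutdown(fd, SHUT_RDWR);
}
#endif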
4240
4241 int
4242 soshutdownlock_final(struct socket *so, int how)
4243 {
4244 struct protosw *pr = so->so_proto;
4245 int error = 0;
4246
4247 sflt_notify(so, sock_evt_shutdown, &how);
4248
4249 if (how != SHUT_WR) {
4250 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4251 /* read already shut down */
4252 error = ENOTCONN;
4253 goto done;
4254 }
4255 sorflush(so);
4256 postevent(so, 0, EV_RCLOSED);
4257 }
4258 if (how != SHUT_RD) {
4259 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4260 /* write already shut down */
4261 error = ENOTCONN;
4262 goto done;
4263 }
4264 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4265 postevent(so, 0, EV_WCLOSED);
4266 }
4267 done:
4268 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4269 return (error);
4270 }
4271
4272 int
4273 soshutdownlock(struct socket *so, int how)
4274 {
4275 int error = 0;
4276
4277 #if CONTENT_FILTER
4278 /*
4279 * A content filter may delay the actual shutdown until it
4280 * has processed the pending data
4281 */
4282 if (so->so_flags & SOF_CONTENT_FILTER) {
4283 error = cfil_sock_shutdown(so, &how);
4284 if (error == EJUSTRETURN) {
4285 error = 0;
4286 goto done;
4287 } else if (error != 0) {
4288 goto done;
4289 }
4290 }
4291 #endif /* CONTENT_FILTER */
4292
4293 error = soshutdownlock_final(so, how);
4294
4295 done:
4296 return (error);
4297 }
4298
4299 void
4300 sowflush(struct socket *so)
4301 {
4302 struct sockbuf *sb = &so->so_snd;
4303 #ifdef notyet
4304 lck_mtx_t *mutex_held;
4305 /*
4306 * XXX: This code is currently commented out, because we may get here
4307 * as part of sofreelastref(), and at that time, pr_getlock() may no
4308 * longer be able to return us the lock; this will be fixed in future.
4309 */
4310 if (so->so_proto->pr_getlock != NULL)
4311 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4312 else
4313 mutex_held = so->so_proto->pr_domain->dom_mtx;
4314
4315 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4316 #endif /* notyet */
4317
4318 /*
4319 * Obtain lock on the socket buffer (SB_LOCK). This is required
4320 * to prevent the socket buffer from being unexpectedly altered
4321 * while it is used by another thread in socket send/receive.
4322 *
4323 * sblock() must not fail here, hence the assertion.
4324 */
4325 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4326 VERIFY(sb->sb_flags & SB_LOCK);
4327
4328 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4329 sb->sb_flags |= SB_DROP;
4330 sb->sb_upcall = NULL;
4331 sb->sb_upcallarg = NULL;
4332
4333 sbunlock(sb, TRUE); /* keep socket locked */
4334
4335 selthreadclear(&sb->sb_sel);
4336 sbrelease(sb);
4337 }
4338
4339 void
4340 sorflush(struct socket *so)
4341 {
4342 struct sockbuf *sb = &so->so_rcv;
4343 struct protosw *pr = so->so_proto;
4344 struct sockbuf asb;
4345 #ifdef notyet
4346 lck_mtx_t *mutex_held;
4347 /*
4348 * XXX: This code is currently commented out, because we may get here
4349 * as part of sofreelastref(), and at that time, pr_getlock() may no
4350 * longer be able to return us the lock; this will be fixed in future.
4351 */
4352 if (so->so_proto->pr_getlock != NULL)
4353 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4354 else
4355 mutex_held = so->so_proto->pr_domain->dom_mtx;
4356
4357 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
4358 #endif /* notyet */
4359
4360 sflt_notify(so, sock_evt_flush_read, NULL);
4361
4362 socantrcvmore(so);
4363
4364 /*
4365 * Obtain lock on the socket buffer (SB_LOCK). This is required
4366 * to prevent the socket buffer from being unexpectedly altered
4367 * while it is used by another thread in socket send/receive.
4368 *
4369 * sblock() must not fail here, hence the assertion.
4370 */
4371 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4372 VERIFY(sb->sb_flags & SB_LOCK);
4373
4374 /*
4375 * Copy only the relevant fields from "sb" to "asb" which we
4376 * need for sbrelease() to function. In particular, skip
4377 * sb_sel as it contains the wait queue linkage, which would
4378 * wreak havoc if we were to issue selthreadclear() on "asb".
4379 * Make sure to not carry over SB_LOCK in "asb", as we need
4380 * to acquire it later as part of sbrelease().
4381 */
4382 bzero(&asb, sizeof (asb));
4383 asb.sb_cc = sb->sb_cc;
4384 asb.sb_hiwat = sb->sb_hiwat;
4385 asb.sb_mbcnt = sb->sb_mbcnt;
4386 asb.sb_mbmax = sb->sb_mbmax;
4387 asb.sb_ctl = sb->sb_ctl;
4388 asb.sb_lowat = sb->sb_lowat;
4389 asb.sb_mb = sb->sb_mb;
4390 asb.sb_mbtail = sb->sb_mbtail;
4391 asb.sb_lastrecord = sb->sb_lastrecord;
4392 asb.sb_so = sb->sb_so;
4393 asb.sb_flags = sb->sb_flags;
4394 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4395 asb.sb_flags |= SB_DROP;
4396
4397 /*
4398 * Ideally we'd bzero() these and preserve the ones we need;
4399 * but to do that we'd need to shuffle things around in the
4400 * sockbuf, and we can't do it now because there are KEXTS
4401 * that are directly referring to the socket structure.
4402 *
4403 * Setting SB_DROP acts as a barrier to prevent further appends.
4404 * Clearing SB_SEL is done for selthreadclear() below.
4405 */
4406 sb->sb_cc = 0;
4407 sb->sb_hiwat = 0;
4408 sb->sb_mbcnt = 0;
4409 sb->sb_mbmax = 0;
4410 sb->sb_ctl = 0;
4411 sb->sb_lowat = 0;
4412 sb->sb_mb = NULL;
4413 sb->sb_mbtail = NULL;
4414 sb->sb_lastrecord = NULL;
4415 sb->sb_timeo.tv_sec = 0;
4416 sb->sb_timeo.tv_usec = 0;
4417 sb->sb_upcall = NULL;
4418 sb->sb_upcallarg = NULL;
4419 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4420 sb->sb_flags |= SB_DROP;
4421
4422 sbunlock(sb, TRUE); /* keep socket locked */
4423
4424 /*
4425 * Note that selthreadclear() is called on the original "sb" and
4426 * not the local "asb" because of the way wait queue linkage is
4427 * implemented. Given that selwakeup() may be triggered, SB_SEL
4428 * should no longer be set (cleared above.)
4429 */
4430 selthreadclear(&sb->sb_sel);
4431
4432 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4433 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4434
4435 sbrelease(&asb);
4436 }
4437
4438 /*
4439 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4440 * an additional variant to handle the case where the option value needs
4441 * to be some kind of integer, but not a specific size.
4442 * In addition to their use here, these functions are also called by the
4443 * protocol-level pr_ctloutput() routines.
4444 *
4445 * Returns: 0 Success
4446 * EINVAL
4447 * copyin:EFAULT
4448 */
4449 int
4450 sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4451 {
4452 size_t valsize;
4453
4454 /*
4455 * If the user gives us more than we wanted, we ignore it,
4456 * but if we don't get the minimum length the caller
4457 * wants, we return EINVAL. On success, sopt->sopt_valsize
4458 * is set to however much we actually retrieved.
4459 */
4460 if ((valsize = sopt->sopt_valsize) < minlen)
4461 return (EINVAL);
4462 if (valsize > len)
4463 sopt->sopt_valsize = valsize = len;
4464
4465 if (sopt->sopt_p != kernproc)
4466 return (copyin(sopt->sopt_val, buf, valsize));
4467
4468 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4469 return (0);
4470 }
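
/*
 * Illustrative sketch, not part of the build: this is the usual way a
 * protocol pr_ctloutput() handler consumes an integer-sized option with
 * sooptcopyin().  The option name SO_EXAMPLE_OPTION and the handler name
 * are hypothetical; only sooptcopyin() and its error contract come from
 * the code above.
 */
#if 0
static int
example_pr_ctloutput(struct socket *so, struct sockopt *sopt)
{
#pragma unused(so)
	int error, optval;

	if (sopt->sopt_dir != SOPT_SET)
		return (ENOPROTOOPT);

	switch (sopt->sopt_name) {
	case SO_EXAMPLE_OPTION:		/* hypothetical option */
		/* copy exactly sizeof (int); short input yields EINVAL */
		error = sooptcopyin(sopt, &optval, sizeof (optval),
		    sizeof (optval));
		if (error != 0)
			break;
		/* optval now holds the value supplied by the caller */
		error = 0;
		break;
	default:
		error = ENOPROTOOPT;
		break;
	}
	return (error);
}
#endif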
4471
4472 /*
4473 * sooptcopyin_timeval
4474 * Copy in a timeval value into tv_p, and take into account whether
4475 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4476 * code here so that we can verify the 64-bit tv_sec value before we lose
4477 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4478 */
4479 static int
4480 sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4481 {
4482 int error;
4483
4484 if (proc_is64bit(sopt->sopt_p)) {
4485 struct user64_timeval tv64;
4486
4487 if (sopt->sopt_valsize < sizeof (tv64))
4488 return (EINVAL);
4489
4490 sopt->sopt_valsize = sizeof (tv64);
4491 if (sopt->sopt_p != kernproc) {
4492 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4493 if (error != 0)
4494 return (error);
4495 } else {
4496 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4497 sizeof (tv64));
4498 }
4499 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4500 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4501 return (EDOM);
4502
4503 tv_p->tv_sec = tv64.tv_sec;
4504 tv_p->tv_usec = tv64.tv_usec;
4505 } else {
4506 struct user32_timeval tv32;
4507
4508 if (sopt->sopt_valsize < sizeof (tv32))
4509 return (EINVAL);
4510
4511 sopt->sopt_valsize = sizeof (tv32);
4512 if (sopt->sopt_p != kernproc) {
4513 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4514 if (error != 0) {
4515 return (error);
4516 }
4517 } else {
4518 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4519 sizeof (tv32));
4520 }
4521 #ifndef __LP64__
4522 /*
4523 * K64todo "comparison is always false due to
4524 * limited range of data type"
4525 */
4526 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4527 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4528 return (EDOM);
4529 #endif
4530 tv_p->tv_sec = tv32.tv_sec;
4531 tv_p->tv_usec = tv32.tv_usec;
4532 }
4533 return (0);
4534 }
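
/*
 * Illustrative user-land sketch, not part of the build: sooptcopyin_timeval()
 * above is what ends up validating the timeval passed to SO_RCVTIMEO and
 * SO_SNDTIMEO -- tv_sec must be non-negative and tv_usec must be in
 * [0, 1000000), otherwise setsockopt() fails with EDOM.  The five-second
 * value is just an example.
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int
example_set_recv_timeout(int fd)
{
	struct timeval tv;

	tv.tv_sec = 5;		/* five seconds */
	tv.tv_usec = 0;		/* must be < 1000000 or EDOM is returned */

	return (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof (tv)));
}
#endif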
4535
4536 /*
4537 * Returns: 0 Success
4538 * EINVAL
4539 * ENOPROTOOPT
4540 * ENOBUFS
4541 * EDOM
4542 * sooptcopyin:EINVAL
4543 * sooptcopyin:EFAULT
4544 * sooptcopyin_timeval:EINVAL
4545 * sooptcopyin_timeval:EFAULT
4546 * sooptcopyin_timeval:EDOM
4547 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4548 * <pr_ctloutput>:???
4549 * sflt_attach_private:??? [whatever a filter author chooses]
4550 * <sf_setoption>:??? [whatever a filter author chooses]
4551 *
4552 * Notes: Other <pr_ctloutput> returns depend on the protocol family; all
4553 * <sf_setoption> returns depend on what the filter author causes
4554 * their filter to return.
4555 */
4556 int
4557 sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4558 {
4559 int error, optval;
4560 struct linger l;
4561 struct timeval tv;
4562 #if CONFIG_MACF_SOCKET
4563 struct mac extmac;
4564 #endif /* MAC_SOCKET */
4565
4566 if (sopt->sopt_dir != SOPT_SET)
4567 sopt->sopt_dir = SOPT_SET;
4568
4569 if (dolock)
4570 socket_lock(so, 1);
4571
4572 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4573 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4574 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4575 /* the socket has been shutdown, no more sockopt's */
4576 error = EINVAL;
4577 goto out;
4578 }
4579
4580 error = sflt_setsockopt(so, sopt);
4581 if (error != 0) {
4582 if (error == EJUSTRETURN)
4583 error = 0;
4584 goto out;
4585 }
4586
4587 if (sopt->sopt_level != SOL_SOCKET) {
4588 if (so->so_proto != NULL &&
4589 so->so_proto->pr_ctloutput != NULL) {
4590 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4591 goto out;
4592 }
4593 error = ENOPROTOOPT;
4594 } else {
4595 /*
4596 * Allow socket-level (SOL_SOCKET) options to be filtered by
4597 * the protocol layer, if needed. A zero value returned from
4598 * the handler means use default socket-level processing as
4599 * done by the rest of this routine. Otherwise, any other
4600 * return value indicates that the option is unsupported.
4601 */
4602 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4603 pru_socheckopt(so, sopt)) != 0)
4604 goto out;
4605
4606 error = 0;
4607 switch (sopt->sopt_name) {
4608 case SO_LINGER:
4609 case SO_LINGER_SEC:
4610 error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4611 if (error != 0)
4612 goto out;
4613
4614 so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4615 l.l_linger : l.l_linger * hz;
4616 if (l.l_onoff != 0)
4617 so->so_options |= SO_LINGER;
4618 else
4619 so->so_options &= ~SO_LINGER;
4620 break;
4621
4622 case SO_DEBUG:
4623 case SO_KEEPALIVE:
4624 case SO_DONTROUTE:
4625 case SO_USELOOPBACK:
4626 case SO_BROADCAST:
4627 case SO_REUSEADDR:
4628 case SO_REUSEPORT:
4629 case SO_OOBINLINE:
4630 case SO_TIMESTAMP:
4631 case SO_TIMESTAMP_MONOTONIC:
4632 case SO_DONTTRUNC:
4633 case SO_WANTMORE:
4634 case SO_WANTOOBFLAG:
4635 case SO_NOWAKEFROMSLEEP:
4636 error = sooptcopyin(sopt, &optval, sizeof (optval),
4637 sizeof (optval));
4638 if (error != 0)
4639 goto out;
4640 if (optval)
4641 so->so_options |= sopt->sopt_name;
4642 else
4643 so->so_options &= ~sopt->sopt_name;
4644 break;
4645
4646 case SO_SNDBUF:
4647 case SO_RCVBUF:
4648 case SO_SNDLOWAT:
4649 case SO_RCVLOWAT:
4650 error = sooptcopyin(sopt, &optval, sizeof (optval),
4651 sizeof (optval));
4652 if (error != 0)
4653 goto out;
4654
4655 /*
4656 * Values < 1 make no sense for any of these
4657 * options, so disallow them.
4658 */
4659 if (optval < 1) {
4660 error = EINVAL;
4661 goto out;
4662 }
4663
4664 switch (sopt->sopt_name) {
4665 case SO_SNDBUF:
4666 case SO_RCVBUF: {
4667 struct sockbuf *sb =
4668 (sopt->sopt_name == SO_SNDBUF) ?
4669 &so->so_snd : &so->so_rcv;
4670 if (sbreserve(sb, (u_int32_t)optval) == 0) {
4671 error = ENOBUFS;
4672 goto out;
4673 }
4674 sb->sb_flags |= SB_USRSIZE;
4675 sb->sb_flags &= ~SB_AUTOSIZE;
4676 sb->sb_idealsize = (u_int32_t)optval;
4677 break;
4678 }
4679 /*
4680 * Make sure the low-water is never greater than
4681 * the high-water.
4682 */
4683 case SO_SNDLOWAT: {
4684 int space = sbspace(&so->so_snd);
4685 u_int32_t hiwat = so->so_snd.sb_hiwat;
4686
4687 if (so->so_snd.sb_flags & SB_UNIX) {
4688 struct unpcb *unp =
4689 (struct unpcb *)(so->so_pcb);
4690 if (unp != NULL &&
4691 unp->unp_conn != NULL) {
4692 hiwat += unp->unp_conn->unp_cc;
4693 }
4694 }
4695
4696 so->so_snd.sb_lowat =
4697 (optval > hiwat) ?
4698 hiwat : optval;
4699
4700 if (space >= so->so_snd.sb_lowat) {
4701 sowwakeup(so);
4702 }
4703 break;
4704 }
4705 case SO_RCVLOWAT: {
4706 int64_t data_len;
4707 so->so_rcv.sb_lowat =
4708 (optval > so->so_rcv.sb_hiwat) ?
4709 so->so_rcv.sb_hiwat : optval;
4710 data_len = so->so_rcv.sb_cc
4711 - so->so_rcv.sb_ctl;
4712 if (data_len >= so->so_rcv.sb_lowat)
4713 sorwakeup(so);
4714 break;
4715 }
4716 }
4717 break;
4718
4719 case SO_SNDTIMEO:
4720 case SO_RCVTIMEO:
4721 error = sooptcopyin_timeval(sopt, &tv);
4722 if (error != 0)
4723 goto out;
4724
4725 switch (sopt->sopt_name) {
4726 case SO_SNDTIMEO:
4727 so->so_snd.sb_timeo = tv;
4728 break;
4729 case SO_RCVTIMEO:
4730 so->so_rcv.sb_timeo = tv;
4731 break;
4732 }
4733 break;
4734
4735 case SO_NKE: {
4736 struct so_nke nke;
4737
4738 error = sooptcopyin(sopt, &nke, sizeof (nke),
4739 sizeof (nke));
4740 if (error != 0)
4741 goto out;
4742
4743 error = sflt_attach_internal(so, nke.nke_handle);
4744 break;
4745 }
4746
4747 case SO_NOSIGPIPE:
4748 error = sooptcopyin(sopt, &optval, sizeof (optval),
4749 sizeof (optval));
4750 if (error != 0)
4751 goto out;
4752 if (optval != 0)
4753 so->so_flags |= SOF_NOSIGPIPE;
4754 else
4755 so->so_flags &= ~SOF_NOSIGPIPE;
4756 break;
4757
4758 case SO_NOADDRERR:
4759 error = sooptcopyin(sopt, &optval, sizeof (optval),
4760 sizeof (optval));
4761 if (error != 0)
4762 goto out;
4763 if (optval != 0)
4764 so->so_flags |= SOF_NOADDRAVAIL;
4765 else
4766 so->so_flags &= ~SOF_NOADDRAVAIL;
4767 break;
4768
4769 case SO_REUSESHAREUID:
4770 error = sooptcopyin(sopt, &optval, sizeof (optval),
4771 sizeof (optval));
4772 if (error != 0)
4773 goto out;
4774 if (optval != 0)
4775 so->so_flags |= SOF_REUSESHAREUID;
4776 else
4777 so->so_flags &= ~SOF_REUSESHAREUID;
4778 break;
4779
4780 case SO_NOTIFYCONFLICT:
4781 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
4782 error = EPERM;
4783 goto out;
4784 }
4785 error = sooptcopyin(sopt, &optval, sizeof (optval),
4786 sizeof (optval));
4787 if (error != 0)
4788 goto out;
4789 if (optval != 0)
4790 so->so_flags |= SOF_NOTIFYCONFLICT;
4791 else
4792 so->so_flags &= ~SOF_NOTIFYCONFLICT;
4793 break;
4794
4795 case SO_RESTRICTIONS:
4796 error = sooptcopyin(sopt, &optval, sizeof (optval),
4797 sizeof (optval));
4798 if (error != 0)
4799 goto out;
4800
4801 error = so_set_restrictions(so, optval);
4802 break;
4803
4804 case SO_AWDL_UNRESTRICTED:
4805 if (SOCK_DOM(so) != PF_INET &&
4806 SOCK_DOM(so) != PF_INET6) {
4807 error = EOPNOTSUPP;
4808 goto out;
4809 }
4810 error = sooptcopyin(sopt, &optval, sizeof(optval),
4811 sizeof(optval));
4812 if (error != 0)
4813 goto out;
4814 if (optval != 0) {
4815 kauth_cred_t cred = NULL;
4816 proc_t ep = PROC_NULL;
4817
4818 if (so->so_flags & SOF_DELEGATED) {
4819 ep = proc_find(so->e_pid);
4820 if (ep)
4821 cred = kauth_cred_proc_ref(ep);
4822 }
4823 error = priv_check_cred(
4824 cred ? cred : so->so_cred,
4825 PRIV_NET_RESTRICTED_AWDL, 0);
4826 if (error == 0)
4827 inp_set_awdl_unrestricted(
4828 sotoinpcb(so));
4829 if (cred)
4830 kauth_cred_unref(&cred);
4831 if (ep != PROC_NULL)
4832 proc_rele(ep);
4833 } else
4834 inp_clear_awdl_unrestricted(sotoinpcb(so));
4835 break;
4836
4837 case SO_LABEL:
4838 #if CONFIG_MACF_SOCKET
4839 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
4840 sizeof (extmac))) != 0)
4841 goto out;
4842
4843 error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
4844 so, &extmac);
4845 #else
4846 error = EOPNOTSUPP;
4847 #endif /* MAC_SOCKET */
4848 break;
4849
4850 case SO_UPCALLCLOSEWAIT:
4851 error = sooptcopyin(sopt, &optval, sizeof (optval),
4852 sizeof (optval));
4853 if (error != 0)
4854 goto out;
4855 if (optval != 0)
4856 so->so_flags |= SOF_UPCALLCLOSEWAIT;
4857 else
4858 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
4859 break;
4860
4861 case SO_RANDOMPORT:
4862 error = sooptcopyin(sopt, &optval, sizeof (optval),
4863 sizeof (optval));
4864 if (error != 0)
4865 goto out;
4866 if (optval != 0)
4867 so->so_flags |= SOF_BINDRANDOMPORT;
4868 else
4869 so->so_flags &= ~SOF_BINDRANDOMPORT;
4870 break;
4871
4872 case SO_NP_EXTENSIONS: {
4873 struct so_np_extensions sonpx;
4874
4875 error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
4876 sizeof (sonpx));
4877 if (error != 0)
4878 goto out;
4879 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
4880 error = EINVAL;
4881 goto out;
4882 }
4883 /*
4884 * Only one bit defined for now
4885 */
4886 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
4887 if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
4888 so->so_flags |= SOF_NPX_SETOPTSHUT;
4889 else
4890 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
4891 }
4892 break;
4893 }
4894
4895 case SO_TRAFFIC_CLASS: {
4896 error = sooptcopyin(sopt, &optval, sizeof (optval),
4897 sizeof (optval));
4898 if (error != 0)
4899 goto out;
4900 error = so_set_traffic_class(so, optval);
4901 if (error != 0)
4902 goto out;
4903 break;
4904 }
4905
4906 case SO_RECV_TRAFFIC_CLASS: {
4907 error = sooptcopyin(sopt, &optval, sizeof (optval),
4908 sizeof (optval));
4909 if (error != 0)
4910 goto out;
4911 if (optval == 0)
4912 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
4913 else
4914 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
4915 break;
4916 }
4917
4918 case SO_TRAFFIC_CLASS_DBG: {
4919 struct so_tcdbg so_tcdbg;
4920
4921 error = sooptcopyin(sopt, &so_tcdbg,
4922 sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
4923 if (error != 0)
4924 goto out;
4925 error = so_set_tcdbg(so, &so_tcdbg);
4926 if (error != 0)
4927 goto out;
4928 break;
4929 }
4930
4931 case SO_PRIVILEGED_TRAFFIC_CLASS:
4932 error = priv_check_cred(kauth_cred_get(),
4933 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
4934 if (error != 0)
4935 goto out;
4936 error = sooptcopyin(sopt, &optval, sizeof (optval),
4937 sizeof (optval));
4938 if (error != 0)
4939 goto out;
4940 if (optval == 0)
4941 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
4942 else
4943 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
4944 break;
4945
4946 case SO_DEFUNCTOK:
4947 error = sooptcopyin(sopt, &optval, sizeof (optval),
4948 sizeof (optval));
4949 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
4950 if (error == 0)
4951 error = EBADF;
4952 goto out;
4953 }
4954 /*
4955 * Any process can set SO_DEFUNCTOK (clear
4956 * SOF_NODEFUNCT), but only root can clear
4957 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
4958 */
4959 if (optval == 0 &&
4960 kauth_cred_issuser(kauth_cred_get()) == 0) {
4961 error = EPERM;
4962 goto out;
4963 }
4964 if (optval)
4965 so->so_flags &= ~SOF_NODEFUNCT;
4966 else
4967 so->so_flags |= SOF_NODEFUNCT;
4968
4969 if (SOCK_DOM(so) == PF_INET ||
4970 SOCK_DOM(so) == PF_INET6) {
4971 char s[MAX_IPv6_STR_LEN];
4972 char d[MAX_IPv6_STR_LEN];
4973 struct inpcb *inp = sotoinpcb(so);
4974
4975 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%s %s:%d -> "
4976 "%s:%d] is now marked as %seligible for "
4977 "defunct\n", __func__, proc_selfpid(),
4978 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4979 (SOCK_TYPE(so) == SOCK_STREAM) ?
4980 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
4981 ((SOCK_DOM(so) == PF_INET) ?
4982 (void *)&inp->inp_laddr.s_addr :
4983 (void *)&inp->in6p_laddr), s, sizeof (s)),
4984 ntohs(inp->in6p_lport),
4985 inet_ntop(SOCK_DOM(so),
4986 (SOCK_DOM(so) == PF_INET) ?
4987 (void *)&inp->inp_faddr.s_addr :
4988 (void *)&inp->in6p_faddr, d, sizeof (d)),
4989 ntohs(inp->in6p_fport),
4990 (so->so_flags & SOF_NODEFUNCT) ?
4991 "not " : ""));
4992 } else {
4993 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] is "
4994 "now marked as %seligible for defunct\n",
4995 __func__, proc_selfpid(),
4996 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4997 SOCK_DOM(so), SOCK_TYPE(so),
4998 (so->so_flags & SOF_NODEFUNCT) ?
4999 "not " : ""));
5000 }
5001 break;
5002
5003 case SO_ISDEFUNCT:
5004 /* This option is not settable */
5005 error = EINVAL;
5006 break;
5007
5008 case SO_OPPORTUNISTIC:
5009 error = sooptcopyin(sopt, &optval, sizeof (optval),
5010 sizeof (optval));
5011 if (error == 0)
5012 error = so_set_opportunistic(so, optval);
5013 break;
5014
5015 case SO_FLUSH:
5016 /* This option is handled by lower layer(s) */
5017 error = 0;
5018 break;
5019
5020 case SO_RECV_ANYIF:
5021 error = sooptcopyin(sopt, &optval, sizeof (optval),
5022 sizeof (optval));
5023 if (error == 0)
5024 error = so_set_recv_anyif(so, optval);
5025 break;
5026
5027 case SO_TRAFFIC_MGT_BACKGROUND: {
5028 /* This option is handled by lower layer(s) */
5029 error = 0;
5030 break;
5031 }
5032
5033 #if FLOW_DIVERT
5034 case SO_FLOW_DIVERT_TOKEN:
5035 error = flow_divert_token_set(so, sopt);
5036 break;
5037 #endif /* FLOW_DIVERT */
5038
5039
5040 case SO_DELEGATED:
5041 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5042 sizeof (optval))) != 0)
5043 break;
5044
5045 error = so_set_effective_pid(so, optval, sopt->sopt_p);
5046 break;
5047
5048 case SO_DELEGATED_UUID: {
5049 uuid_t euuid;
5050
5051 if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5052 sizeof (euuid))) != 0)
5053 break;
5054
5055 error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5056 break;
5057 }
5058
5059 #if NECP
5060 case SO_NECP_ATTRIBUTES:
5061 error = necp_set_socket_attributes(so, sopt);
5062 break;
5063 #endif /* NECP */
5064
5065 #if MPTCP
5066 case SO_MPTCP_FASTJOIN:
5067 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5068 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5069 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5070 error = ENOPROTOOPT;
5071 break;
5072 }
5073
5074 error = sooptcopyin(sopt, &optval, sizeof (optval),
5075 sizeof (optval));
5076 if (error != 0)
5077 goto out;
5078 if (optval == 0)
5079 so->so_flags &= ~SOF_MPTCP_FASTJOIN;
5080 else
5081 so->so_flags |= SOF_MPTCP_FASTJOIN;
5082 break;
5083 #endif /* MPTCP */
5084
5085 case SO_EXTENDED_BK_IDLE:
5086 error = sooptcopyin(sopt, &optval, sizeof (optval),
5087 sizeof (optval));
5088 if (error == 0)
5089 error = so_set_extended_bk_idle(so, optval);
5090 break;
5091
5092 case SO_MARK_CELLFALLBACK:
5093 error = sooptcopyin(sopt, &optval, sizeof(optval),
5094 sizeof(optval));
5095 if (error != 0)
5096 goto out;
5097 if (optval < 0) {
5098 error = EINVAL;
5099 goto out;
5100 }
5101 if (optval == 0)
5102 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5103 else
5104 so->so_flags1 |= SOF1_CELLFALLBACK;
5105 break;
5106 default:
5107 error = ENOPROTOOPT;
5108 break;
5109 }
5110 if (error == 0 && so->so_proto != NULL &&
5111 so->so_proto->pr_ctloutput != NULL) {
5112 (void) so->so_proto->pr_ctloutput(so, sopt);
5113 }
5114 }
5115 out:
5116 if (dolock)
5117 socket_unlock(so, 1);
5118 return (error);
5119 }
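
/*
 * Illustrative user-land sketch, not part of the build: every socket-level
 * setsockopt() lands in sosetoptlock() above; the Darwin-specific
 * SO_NOSIGPIPE case simply toggles SOF_NOSIGPIPE so a write on a broken
 * connection returns EPIPE instead of raising SIGPIPE.  The helper name is
 * an illustrative assumption.
 */
#if 0
#include <sys/socket.h>

static int
example_disable_sigpipe(int fd)
{
	int on = 1;

	/* maps to the SO_NOSIGPIPE case in sosetoptlock() */
	return (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof (on)));
}
#endif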
5120
5121 /* Helper routines for getsockopt */
5122 int
5123 sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5124 {
5125 int error;
5126 size_t valsize;
5127
5128 error = 0;
5129
5130 /*
5131 * Documented get behavior is that we always return a value,
5132 * possibly truncated to fit in the user's buffer.
5133 * Traditional behavior is that we always tell the user
5134 * precisely how much we copied, rather than something useful
5135 * like the total amount we had available for her.
5136 * Note that this interface is not idempotent; the entire answer must
5137 * be generated ahead of time.
5138 */
5139 valsize = min(len, sopt->sopt_valsize);
5140 sopt->sopt_valsize = valsize;
5141 if (sopt->sopt_val != USER_ADDR_NULL) {
5142 if (sopt->sopt_p != kernproc)
5143 error = copyout(buf, sopt->sopt_val, valsize);
5144 else
5145 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5146 }
5147 return (error);
5148 }
5149
5150 static int
5151 sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5152 {
5153 int error;
5154 size_t len;
5155 struct user64_timeval tv64;
5156 struct user32_timeval tv32;
5157 const void * val;
5158 size_t valsize;
5159
5160 error = 0;
5161 if (proc_is64bit(sopt->sopt_p)) {
5162 len = sizeof (tv64);
5163 tv64.tv_sec = tv_p->tv_sec;
5164 tv64.tv_usec = tv_p->tv_usec;
5165 val = &tv64;
5166 } else {
5167 len = sizeof (tv32);
5168 tv32.tv_sec = tv_p->tv_sec;
5169 tv32.tv_usec = tv_p->tv_usec;
5170 val = &tv32;
5171 }
5172 valsize = min(len, sopt->sopt_valsize);
5173 sopt->sopt_valsize = valsize;
5174 if (sopt->sopt_val != USER_ADDR_NULL) {
5175 if (sopt->sopt_p != kernproc)
5176 error = copyout(val, sopt->sopt_val, valsize);
5177 else
5178 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5179 }
5180 return (error);
5181 }
5182
5183 /*
5184 * Return: 0 Success
5185 * ENOPROTOOPT
5186 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5187 * <pr_ctloutput>:???
5188 * <sf_getoption>:???
5189 */
5190 int
5191 sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5192 {
5193 int error, optval;
5194 struct linger l;
5195 struct timeval tv;
5196 #if CONFIG_MACF_SOCKET
5197 struct mac extmac;
5198 #endif /* MAC_SOCKET */
5199
5200 if (sopt->sopt_dir != SOPT_GET)
5201 sopt->sopt_dir = SOPT_GET;
5202
5203 if (dolock)
5204 socket_lock(so, 1);
5205
5206 error = sflt_getsockopt(so, sopt);
5207 if (error != 0) {
5208 if (error == EJUSTRETURN)
5209 error = 0;
5210 goto out;
5211 }
5212
5213 if (sopt->sopt_level != SOL_SOCKET) {
5214 if (so->so_proto != NULL &&
5215 so->so_proto->pr_ctloutput != NULL) {
5216 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5217 goto out;
5218 }
5219 error = ENOPROTOOPT;
5220 } else {
5221 /*
5222 * Allow socket-level (SOL_SOCKET) options to be filtered by
5223 * the protocol layer, if needed. A zero value returned from
5224 * the handler means use default socket-level processing as
5225 * done by the rest of this routine. Otherwise, any other
5226 * return value indicates that the option is unsupported.
5227 */
5228 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5229 pru_socheckopt(so, sopt)) != 0)
5230 goto out;
5231
5232 error = 0;
5233 switch (sopt->sopt_name) {
5234 case SO_LINGER:
5235 case SO_LINGER_SEC:
5236 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5237 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5238 so->so_linger : so->so_linger / hz;
5239 error = sooptcopyout(sopt, &l, sizeof (l));
5240 break;
5241
5242 case SO_USELOOPBACK:
5243 case SO_DONTROUTE:
5244 case SO_DEBUG:
5245 case SO_KEEPALIVE:
5246 case SO_REUSEADDR:
5247 case SO_REUSEPORT:
5248 case SO_BROADCAST:
5249 case SO_OOBINLINE:
5250 case SO_TIMESTAMP:
5251 case SO_TIMESTAMP_MONOTONIC:
5252 case SO_DONTTRUNC:
5253 case SO_WANTMORE:
5254 case SO_WANTOOBFLAG:
5255 case SO_NOWAKEFROMSLEEP:
5256 optval = so->so_options & sopt->sopt_name;
5257 integer:
5258 error = sooptcopyout(sopt, &optval, sizeof (optval));
5259 break;
5260
5261 case SO_TYPE:
5262 optval = so->so_type;
5263 goto integer;
5264
5265 case SO_NREAD:
5266 if (so->so_proto->pr_flags & PR_ATOMIC) {
5267 int pkt_total;
5268 struct mbuf *m1;
5269
5270 pkt_total = 0;
5271 m1 = so->so_rcv.sb_mb;
5272 while (m1 != NULL) {
5273 if (m1->m_type == MT_DATA ||
5274 m1->m_type == MT_HEADER ||
5275 m1->m_type == MT_OOBDATA)
5276 pkt_total += m1->m_len;
5277 m1 = m1->m_next;
5278 }
5279 optval = pkt_total;
5280 } else {
5281 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5282 }
5283 goto integer;
5284
5285 case SO_NUMRCVPKT:
5286 if (so->so_proto->pr_flags & PR_ATOMIC) {
5287 int cnt = 0;
5288 struct mbuf *m1;
5289
5290 m1 = so->so_rcv.sb_mb;
5291 while (m1 != NULL) {
5292 if (m1->m_type == MT_DATA ||
5293 m1->m_type == MT_HEADER ||
5294 m1->m_type == MT_OOBDATA)
5295 cnt += 1;
5296 m1 = m1->m_nextpkt;
5297 }
5298 optval = cnt;
5299 goto integer;
5300 } else {
5301 error = EINVAL;
5302 break;
5303 }
5304
5305 case SO_NWRITE:
5306 optval = so->so_snd.sb_cc;
5307 goto integer;
5308
5309 case SO_ERROR:
5310 optval = so->so_error;
5311 so->so_error = 0;
5312 goto integer;
5313
5314 case SO_SNDBUF: {
5315 u_int32_t hiwat = so->so_snd.sb_hiwat;
5316
5317 if (so->so_snd.sb_flags & SB_UNIX) {
5318 struct unpcb *unp =
5319 (struct unpcb *)(so->so_pcb);
5320 if (unp != NULL && unp->unp_conn != NULL) {
5321 hiwat += unp->unp_conn->unp_cc;
5322 }
5323 }
5324
5325 optval = hiwat;
5326 goto integer;
5327 }
5328 case SO_RCVBUF:
5329 optval = so->so_rcv.sb_hiwat;
5330 goto integer;
5331
5332 case SO_SNDLOWAT:
5333 optval = so->so_snd.sb_lowat;
5334 goto integer;
5335
5336 case SO_RCVLOWAT:
5337 optval = so->so_rcv.sb_lowat;
5338 goto integer;
5339
5340 case SO_SNDTIMEO:
5341 case SO_RCVTIMEO:
5342 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5343 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5344
5345 error = sooptcopyout_timeval(sopt, &tv);
5346 break;
5347
5348 case SO_NOSIGPIPE:
5349 optval = (so->so_flags & SOF_NOSIGPIPE);
5350 goto integer;
5351
5352 case SO_NOADDRERR:
5353 optval = (so->so_flags & SOF_NOADDRAVAIL);
5354 goto integer;
5355
5356 case SO_REUSESHAREUID:
5357 optval = (so->so_flags & SOF_REUSESHAREUID);
5358 goto integer;
5359
5360
5361 case SO_NOTIFYCONFLICT:
5362 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5363 goto integer;
5364
5365 case SO_RESTRICTIONS:
5366 optval = so_get_restrictions(so);
5367 goto integer;
5368
5369 case SO_AWDL_UNRESTRICTED:
5370 if (SOCK_DOM(so) == PF_INET ||
5371 SOCK_DOM(so) == PF_INET6) {
5372 optval = inp_get_awdl_unrestricted(
5373 sotoinpcb(so));
5374 goto integer;
5375 } else
5376 error = EOPNOTSUPP;
5377 break;
5378
5379 case SO_LABEL:
5380 #if CONFIG_MACF_SOCKET
5381 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5382 sizeof (extmac))) != 0 ||
5383 (error = mac_socket_label_get(proc_ucred(
5384 sopt->sopt_p), so, &extmac)) != 0)
5385 break;
5386
5387 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5388 #else
5389 error = EOPNOTSUPP;
5390 #endif /* MAC_SOCKET */
5391 break;
5392
5393 case SO_PEERLABEL:
5394 #if CONFIG_MACF_SOCKET
5395 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5396 sizeof (extmac))) != 0 ||
5397 (error = mac_socketpeer_label_get(proc_ucred(
5398 sopt->sopt_p), so, &extmac)) != 0)
5399 break;
5400
5401 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5402 #else
5403 error = EOPNOTSUPP;
5404 #endif /* MAC_SOCKET */
5405 break;
5406
5407 #ifdef __APPLE_API_PRIVATE
5408 case SO_UPCALLCLOSEWAIT:
5409 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5410 goto integer;
5411 #endif
5412 case SO_RANDOMPORT:
5413 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5414 goto integer;
5415
5416 case SO_NP_EXTENSIONS: {
5417 struct so_np_extensions sonpx;
5418
5419 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5420 SONPX_SETOPTSHUT : 0;
5421 sonpx.npx_mask = SONPX_MASK_VALID;
5422
5423 error = sooptcopyout(sopt, &sonpx,
5424 sizeof (struct so_np_extensions));
5425 break;
5426 }
5427
5428 case SO_TRAFFIC_CLASS:
5429 optval = so->so_traffic_class;
5430 goto integer;
5431
5432 case SO_RECV_TRAFFIC_CLASS:
5433 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5434 goto integer;
5435
5436 case SO_TRAFFIC_CLASS_STATS:
5437 error = sooptcopyout(sopt, &so->so_tc_stats,
5438 sizeof (so->so_tc_stats));
5439 break;
5440
5441 case SO_TRAFFIC_CLASS_DBG:
5442 error = sogetopt_tcdbg(so, sopt);
5443 break;
5444
5445 case SO_PRIVILEGED_TRAFFIC_CLASS:
5446 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5447 goto integer;
5448
5449 case SO_DEFUNCTOK:
5450 optval = !(so->so_flags & SOF_NODEFUNCT);
5451 goto integer;
5452
5453 case SO_ISDEFUNCT:
5454 optval = (so->so_flags & SOF_DEFUNCT);
5455 goto integer;
5456
5457 case SO_OPPORTUNISTIC:
5458 optval = so_get_opportunistic(so);
5459 goto integer;
5460
5461 case SO_FLUSH:
5462 /* This option is not gettable */
5463 error = EINVAL;
5464 break;
5465
5466 case SO_RECV_ANYIF:
5467 optval = so_get_recv_anyif(so);
5468 goto integer;
5469
5470 case SO_TRAFFIC_MGT_BACKGROUND:
5471 /* This option is handled by lower layer(s) */
5472 if (so->so_proto != NULL &&
5473 so->so_proto->pr_ctloutput != NULL) {
5474 (void) so->so_proto->pr_ctloutput(so, sopt);
5475 }
5476 break;
5477
5478 #if FLOW_DIVERT
5479 case SO_FLOW_DIVERT_TOKEN:
5480 error = flow_divert_token_get(so, sopt);
5481 break;
5482 #endif /* FLOW_DIVERT */
5483
5484 #if NECP
5485 case SO_NECP_ATTRIBUTES:
5486 error = necp_get_socket_attributes(so, sopt);
5487 break;
5488 #endif /* NECP */
5489
5490 #if CONTENT_FILTER
5491 case SO_CFIL_SOCK_ID: {
5492 cfil_sock_id_t sock_id;
5493
5494 sock_id = cfil_sock_id_from_socket(so);
5495
5496 error = sooptcopyout(sopt, &sock_id,
5497 sizeof(cfil_sock_id_t));
5498 break;
5499 }
5500 #endif /* CONTENT_FILTER */
5501
5502 #if MPTCP
5503 case SO_MPTCP_FASTJOIN:
5504 if (!((so->so_flags & SOF_MP_SUBFLOW) ||
5505 ((SOCK_CHECK_DOM(so, PF_MULTIPATH)) &&
5506 (SOCK_CHECK_PROTO(so, IPPROTO_TCP))))) {
5507 error = ENOPROTOOPT;
5508 break;
5509 }
5510 optval = (so->so_flags & SOF_MPTCP_FASTJOIN);
5511 /* Fixed along with rdar://19391339 */
5512 goto integer;
5513 #endif /* MPTCP */
5514
5515 case SO_EXTENDED_BK_IDLE:
5516 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5517 goto integer;
5518 case SO_MARK_CELLFALLBACK:
5519 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
5520 ? 1 : 0;
5521 goto integer;
5522 default:
5523 error = ENOPROTOOPT;
5524 break;
5525 }
5526 }
5527 out:
5528 if (dolock)
5529 socket_unlock(so, 1);
5530 return (error);
5531 }
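
/*
 * Illustrative user-land sketch, not part of the build: it exercises the
 * SO_NREAD case of sogetoptlock() above.  For atomic (datagram) protocols
 * the kernel sums the data mbufs of the first record, i.e. the size of the
 * next datagram; for stream protocols it reports sb_cc minus control bytes.
 * The helper name is an illustrative assumption.
 */
#if 0
#include <sys/socket.h>

static int
example_bytes_readable(int fd, int *bytes)
{
	socklen_t len = sizeof (*bytes);

	/* maps to the SO_NREAD case in sogetoptlock() */
	return (getsockopt(fd, SOL_SOCKET, SO_NREAD, bytes, &len));
}
#endif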
5532
5533 /*
5534 * The size limit on our soopt_getm() is different from the one on FreeBSD.
5535 * We limit the size of options to MCLBYTES. This will have to change
5536 * if we need to define options that need more space than MCLBYTES.
5537 */
5538 int
5539 soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5540 {
5541 struct mbuf *m, *m_prev;
5542 int sopt_size = sopt->sopt_valsize;
5543 int how;
5544
5545 if (sopt_size <= 0 || sopt_size > MCLBYTES)
5546 return (EMSGSIZE);
5547
5548 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5549 MGET(m, how, MT_DATA);
5550 if (m == NULL)
5551 return (ENOBUFS);
5552 if (sopt_size > MLEN) {
5553 MCLGET(m, how);
5554 if ((m->m_flags & M_EXT) == 0) {
5555 m_free(m);
5556 return (ENOBUFS);
5557 }
5558 m->m_len = min(MCLBYTES, sopt_size);
5559 } else {
5560 m->m_len = min(MLEN, sopt_size);
5561 }
5562 sopt_size -= m->m_len;
5563 *mp = m;
5564 m_prev = m;
5565
5566 while (sopt_size > 0) {
5567 MGET(m, how, MT_DATA);
5568 if (m == NULL) {
5569 m_freem(*mp);
5570 return (ENOBUFS);
5571 }
5572 if (sopt_size > MLEN) {
5573 MCLGET(m, how);
5574 if ((m->m_flags & M_EXT) == 0) {
5575 m_freem(*mp);
5576 m_freem(m);
5577 return (ENOBUFS);
5578 }
5579 m->m_len = min(MCLBYTES, sopt_size);
5580 } else {
5581 m->m_len = min(MLEN, sopt_size);
5582 }
5583 sopt_size -= m->m_len;
5584 m_prev->m_next = m;
5585 m_prev = m;
5586 }
5587 return (0);
5588 }
5589
5590 /* copyin sopt data into mbuf chain */
5591 int
5592 soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5593 {
5594 struct mbuf *m0 = m;
5595
5596 if (sopt->sopt_val == USER_ADDR_NULL)
5597 return (0);
5598 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5599 if (sopt->sopt_p != kernproc) {
5600 int error;
5601
5602 error = copyin(sopt->sopt_val, mtod(m, char *),
5603 m->m_len);
5604 if (error != 0) {
5605 m_freem(m0);
5606 return (error);
5607 }
5608 } else {
5609 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5610 mtod(m, char *), m->m_len);
5611 }
5612 sopt->sopt_valsize -= m->m_len;
5613 sopt->sopt_val += m->m_len;
5614 m = m->m_next;
5615 }
5616 /* should have been allocated with enough space at ip6_sooptmcopyin() */
5617 if (m != NULL) {
5618 panic("soopt_mcopyin");
5619 /* NOTREACHED */
5620 }
5621 return (0);
5622 }
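
/*
 * Illustrative sketch, not part of the build: the usual pairing of
 * soopt_getm() and soopt_mcopyin() is to size an mbuf chain to
 * sopt_valsize and then fill it from the caller's buffer, as
 * ip6_sooptmcopyin() does.  The helper below is hypothetical; note that
 * soopt_mcopyin() frees the chain itself on failure, per the code above.
 */
#if 0
static int
example_sopt_to_mbuf(struct sockopt *sopt, struct mbuf **mp)
{
	int error;

	/* allocate a chain large enough for sopt->sopt_valsize bytes */
	error = soopt_getm(sopt, mp);
	if (error != 0)
		return (error);

	/* copy the option data into the chain (chain is freed on error) */
	error = soopt_mcopyin(sopt, *mp);
	if (error != 0)
		*mp = NULL;
	return (error);
}
#endif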
5623
5624 /* copyout mbuf chain data into soopt */
5625 int
5626 soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5627 {
5628 struct mbuf *m0 = m;
5629 size_t valsize = 0;
5630
5631 if (sopt->sopt_val == USER_ADDR_NULL)
5632 return (0);
5633 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5634 if (sopt->sopt_p != kernproc) {
5635 int error;
5636
5637 error = copyout(mtod(m, char *), sopt->sopt_val,
5638 m->m_len);
5639 if (error != 0) {
5640 m_freem(m0);
5641 return (error);
5642 }
5643 } else {
5644 bcopy(mtod(m, char *),
5645 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5646 }
5647 sopt->sopt_valsize -= m->m_len;
5648 sopt->sopt_val += m->m_len;
5649 valsize += m->m_len;
5650 m = m->m_next;
5651 }
5652 if (m != NULL) {
5653 /* a large enough soopt buffer should have been provided by user-land */
5654 m_freem(m0);
5655 return (EINVAL);
5656 }
5657 sopt->sopt_valsize = valsize;
5658 return (0);
5659 }
5660
5661 void
5662 sohasoutofband(struct socket *so)
5663 {
5664 if (so->so_pgid < 0)
5665 gsignal(-so->so_pgid, SIGURG);
5666 else if (so->so_pgid > 0)
5667 proc_signal(so->so_pgid, SIGURG);
5668 selwakeup(&so->so_rcv.sb_sel);
5669 }
5670
5671 int
5672 sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5673 {
5674 #pragma unused(cred)
5675 struct proc *p = current_proc();
5676 int revents = 0;
5677
5678 socket_lock(so, 1);
5679 so_update_last_owner_locked(so, PROC_NULL);
5680 so_update_policy(so);
5681
5682 if (events & (POLLIN | POLLRDNORM))
5683 if (soreadable(so))
5684 revents |= events & (POLLIN | POLLRDNORM);
5685
5686 if (events & (POLLOUT | POLLWRNORM))
5687 if (sowriteable(so))
5688 revents |= events & (POLLOUT | POLLWRNORM);
5689
5690 if (events & (POLLPRI | POLLRDBAND))
5691 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
5692 revents |= events & (POLLPRI | POLLRDBAND);
5693
5694 if (revents == 0) {
5695 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
5696 /*
5697 * Darwin sets the flag first,
5698 * BSD calls selrecord first
5699 */
5700 so->so_rcv.sb_flags |= SB_SEL;
5701 selrecord(p, &so->so_rcv.sb_sel, wql);
5702 }
5703
5704 if (events & (POLLOUT | POLLWRNORM)) {
5705 /*
5706 * Darwin sets the flag first,
5707 * BSD calls selrecord first
5708 */
5709 so->so_snd.sb_flags |= SB_SEL;
5710 selrecord(p, &so->so_snd.sb_sel, wql);
5711 }
5712 }
5713
5714 socket_unlock(so, 1);
5715 return (revents);
5716 }
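
/*
 * Illustrative user-land sketch, not part of the build: sopoll() above maps
 * POLLIN/POLLRDNORM to soreadable(), POLLOUT/POLLWRNORM to sowriteable(),
 * and POLLPRI/POLLRDBAND to the out-of-band mark.  The one-second timeout
 * is an arbitrary example value.
 */
#if 0
#include <poll.h>

static int
example_wait_readable(int fd)
{
	struct pollfd pfd;

	pfd.fd = fd;
	pfd.events = POLLIN | POLLPRI;	/* data or out-of-band mark */
	pfd.revents = 0;

	/* returns > 0 when sopoll() reports a matching event */
	return (poll(&pfd, 1, 1000));
}
#endif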
5717
5718 int
5719 soo_kqfilter(struct fileproc *fp, struct knote *kn, vfs_context_t ctx)
5720 {
5721 #pragma unused(fp)
5722 #if !CONFIG_MACF_SOCKET
5723 #pragma unused(ctx)
5724 #endif /* MAC_SOCKET */
5725 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5726 struct klist *skl;
5727
5728 socket_lock(so, 1);
5729 so_update_last_owner_locked(so, PROC_NULL);
5730 so_update_policy(so);
5731
5732 #if CONFIG_MACF_SOCKET
5733 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
5734 kn, so) != 0) {
5735 socket_unlock(so, 1);
5736 return (1);
5737 }
5738 #endif /* MAC_SOCKET */
5739
5740 switch (kn->kn_filter) {
5741 case EVFILT_READ:
5742 kn->kn_fop = &soread_filtops;
5743 /*
5744 * If the caller explicitly asked for OOB results (e.g. poll()),
5745 * save that off in the hookid field and reserve the kn_flags
5746 * EV_OOBAND bit for output only.
5747 */
5748 if (kn->kn_flags & EV_OOBAND) {
5749 kn->kn_flags &= ~EV_OOBAND;
5750 kn->kn_hookid = EV_OOBAND;
5751 } else {
5752 kn->kn_hookid = 0;
5753 }
5754 skl = &so->so_rcv.sb_sel.si_note;
5755 break;
5756 case EVFILT_WRITE:
5757 kn->kn_fop = &sowrite_filtops;
5758 skl = &so->so_snd.sb_sel.si_note;
5759 break;
5760 case EVFILT_SOCK:
5761 kn->kn_fop = &sock_filtops;
5762 skl = &so->so_klist;
5763 kn->kn_hookid = 0;
5764 kn->kn_status |= KN_TOUCH;
5765 break;
5766 default:
5767 socket_unlock(so, 1);
5768 return (1);
5769 }
5770
5771 if (KNOTE_ATTACH(skl, kn)) {
5772 switch (kn->kn_filter) {
5773 case EVFILT_READ:
5774 so->so_rcv.sb_flags |= SB_KNOTE;
5775 break;
5776 case EVFILT_WRITE:
5777 so->so_snd.sb_flags |= SB_KNOTE;
5778 break;
5779 case EVFILT_SOCK:
5780 so->so_flags |= SOF_KNOTE;
5781 break;
5782 default:
5783 socket_unlock(so, 1);
5784 return (1);
5785 }
5786 }
5787 socket_unlock(so, 1);
5788 return (0);
5789 }
5790
5791 static void
5792 filt_sordetach(struct knote *kn)
5793 {
5794 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5795
5796 socket_lock(so, 1);
5797 if (so->so_rcv.sb_flags & SB_KNOTE)
5798 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
5799 so->so_rcv.sb_flags &= ~SB_KNOTE;
5800 socket_unlock(so, 1);
5801 }
5802
5803 /*ARGSUSED*/
5804 static int
5805 filt_soread(struct knote *kn, long hint)
5806 {
5807 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5808
5809 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5810 socket_lock(so, 1);
5811
5812 if (so->so_options & SO_ACCEPTCONN) {
5813 int is_not_empty;
5814
5815 /*
5816 * Radar 6615193: handle the listen case dynamically
5817 * for the kqueue read filter. This allows listen() to be
5818 * called after registering the kqueue EVFILT_READ filter.
5819 */
5820
5821 kn->kn_data = so->so_qlen;
5822 is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
5823
5824 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5825 socket_unlock(so, 1);
5826
5827 return (is_not_empty);
5828 }
5829
5830 /* socket isn't a listener */
5831 /*
5832 * NOTE_LOWAT specifies new low water mark in data, i.e.
5833 * the bytes of protocol data. We therefore exclude any
5834 * control bytes.
5835 */
5836 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5837
5838 /*
5839 * Clear out EV_OOBAND that filt_soread may have set in the
5840 * past.
5841 */
5842 kn->kn_flags &= ~EV_OOBAND;
5843 if ((so->so_oobmark) || (so->so_state & SS_RCVATMARK)) {
5844 kn->kn_flags |= EV_OOBAND;
5845 /*
5846 * If caller registered explicit interest in OOB data,
5847 * return immediately (data == amount beyond mark, for
5848 * legacy reasons - that should be changed later).
5849 */
5850 if (kn->kn_hookid == EV_OOBAND) {
5851 /*
5852 * When so_state is SS_RCVATMARK, so_oobmark
5853 * is 0.
5854 */
5855 kn->kn_data -= so->so_oobmark;
5856 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5857 socket_unlock(so, 1);
5858 return (1);
5859 }
5860 }
5861
5862 if ((so->so_state & SS_CANTRCVMORE)
5863 #if CONTENT_FILTER
5864 && cfil_sock_data_pending(&so->so_rcv) == 0
5865 #endif /* CONTENT_FILTER */
5866 ) {
5867 kn->kn_flags |= EV_EOF;
5868 kn->kn_fflags = so->so_error;
5869 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5870 socket_unlock(so, 1);
5871 return (1);
5872 }
5873
5874 if (so->so_error) { /* temporary udp error */
5875 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5876 socket_unlock(so, 1);
5877 return (1);
5878 }
5879
5880 int64_t lowwat = so->so_rcv.sb_lowat;
5881 /*
5882 * Ensure that when NOTE_LOWAT is used, the derived
5883 * low water mark is bounded by socket's rcv buf's
5884 * high and low water mark values.
5885 */
5886 if (kn->kn_sfflags & NOTE_LOWAT) {
5887 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
5888 lowwat = so->so_rcv.sb_hiwat;
5889 else if (kn->kn_sdata > lowwat)
5890 lowwat = kn->kn_sdata;
5891 }
5892
5893 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5894 socket_unlock(so, 1);
5895
5896 /*
5897 * The order below is important. Since NOTE_LOWAT
5898 * overrides sb_lowat, check for NOTE_LOWAT case
5899 * first.
5900 */
5901 if (kn->kn_sfflags & NOTE_LOWAT)
5902 return (kn->kn_data >= lowwat);
5903
5904 return (so->so_rcv.sb_cc >= lowwat);
5905 }
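
/*
 * Illustrative user-land sketch, not part of the build: filt_soread() above
 * honors NOTE_LOWAT from kn_sfflags with the threshold taken from kn_sdata
 * (clamped to the receive buffer's high water mark), so a registration like
 * the one below only fires once at least "lowat" bytes are queued.  The
 * helper name is an illustrative assumption.
 */
#if 0
#include <sys/event.h>
#include <sys/time.h>

static int
example_wait_lowat(int kq, int fd, int lowat)
{
	struct kevent kev;

	/* fflags = NOTE_LOWAT, data = low-water threshold in bytes */
	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, lowat, NULL);
	if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
		return (-1);

	/* block until filt_soread() reports kn_data >= lowat */
	return (kevent(kq, NULL, 0, &kev, 1, NULL));
}
#endif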
5906
5907 static void
5908 filt_sowdetach(struct knote *kn)
5909 {
5910 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5911 socket_lock(so, 1);
5912
5913 if (so->so_snd.sb_flags & SB_KNOTE)
5914 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
5915 so->so_snd.sb_flags &= ~SB_KNOTE;
5916 socket_unlock(so, 1);
5917 }
5918
5919 int
5920 so_wait_for_if_feedback(struct socket *so)
5921 {
5922 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
5923 (so->so_state & SS_ISCONNECTED)) {
5924 struct inpcb *inp = sotoinpcb(so);
5925 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
5926 return (1);
5927 }
5928 return (0);
5929 }
5930
5931 /*ARGSUSED*/
5932 static int
5933 filt_sowrite(struct knote *kn, long hint)
5934 {
5935 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
5936 int ret = 0;
5937
5938 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5939 socket_lock(so, 1);
5940
5941 kn->kn_data = sbspace(&so->so_snd);
5942 if (so->so_state & SS_CANTSENDMORE) {
5943 kn->kn_flags |= EV_EOF;
5944 kn->kn_fflags = so->so_error;
5945 ret = 1;
5946 goto out;
5947 }
5948 if (so->so_error) { /* temporary udp error */
5949 ret = 1;
5950 goto out;
5951 }
5952 if (!socanwrite(so)) {
5953 ret = 0;
5954 goto out;
5955 }
5956 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
5957 ret = 1;
5958 goto out;
5959 }
5960 int64_t lowwat = so->so_snd.sb_lowat;
5961 if (kn->kn_sfflags & NOTE_LOWAT) {
5962 if (kn->kn_sdata > so->so_snd.sb_hiwat)
5963 lowwat = so->so_snd.sb_hiwat;
5964 else if (kn->kn_sdata > lowwat)
5965 lowwat = kn->kn_sdata;
5966 }
5967 if (kn->kn_data >= lowwat) {
5968 if (so->so_flags & SOF_NOTSENT_LOWAT) {
5969 if ((SOCK_DOM(so) == PF_INET
5970 || SOCK_DOM(so) == PF_INET6)
5971 && so->so_type == SOCK_STREAM) {
5972 ret = tcp_notsent_lowat_check(so);
5973 }
5974 #if MPTCP
5975 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
5976 (SOCK_PROTO(so) == IPPROTO_TCP)) {
5977 ret = mptcp_notsent_lowat_check(so);
5978 }
5979 #endif
5980 else {
5981 ret = 1;
5982 goto out;
5983 }
5984 } else {
5985 ret = 1;
5986 }
5987 }
5988 if (so_wait_for_if_feedback(so))
5989 ret = 0;
5990 out:
5991 if ((hint & SO_FILT_HINT_LOCKED) == 0)
5992 socket_unlock(so, 1);
5993 return (ret);
5994 }
5995
5996 static void
5997 filt_sockdetach(struct knote *kn)
5998 {
5999 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6000 socket_lock(so, 1);
6001
6002 if ((so->so_flags & SOF_KNOTE) != 0)
6003 if (KNOTE_DETACH(&so->so_klist, kn))
6004 so->so_flags &= ~SOF_KNOTE;
6005 socket_unlock(so, 1);
6006 }
6007
6008 static int
6009 filt_sockev(struct knote *kn, long hint)
6010 {
6011 int ret = 0, locked = 0;
6012 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6013 long ev_hint = (hint & SO_FILT_HINT_EV);
6014 uint32_t level_trigger = 0;
6015
6016 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6017 socket_lock(so, 1);
6018 locked = 1;
6019 }
6020
6021 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6022 kn->kn_fflags |= NOTE_CONNRESET;
6023 }
6024 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6025 kn->kn_fflags |= NOTE_TIMEOUT;
6026 }
6027 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6028 kn->kn_fflags |= NOTE_NOSRCADDR;
6029 }
6030 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6031 kn->kn_fflags |= NOTE_IFDENIED;
6032 }
6033 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6034 kn->kn_fflags |= NOTE_KEEPALIVE;
6035 }
6036 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6037 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6038 }
6039 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6040 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6041 }
6042 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6043 (so->so_state & SS_ISCONNECTED)) {
6044 kn->kn_fflags |= NOTE_CONNECTED;
6045 level_trigger |= NOTE_CONNECTED;
6046 }
6047 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6048 (so->so_state & SS_ISDISCONNECTED)) {
6049 kn->kn_fflags |= NOTE_DISCONNECTED;
6050 level_trigger |= NOTE_DISCONNECTED;
6051 }
6052 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6053 if (so->so_proto != NULL &&
6054 (so->so_proto->pr_flags & PR_EVCONNINFO))
6055 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6056 }
6057
6058 if ((so->so_state & SS_CANTRCVMORE)
6059 #if CONTENT_FILTER
6060 && cfil_sock_data_pending(&so->so_rcv) == 0
6061 #endif /* CONTENT_FILTER */
6062 ) {
6063 kn->kn_fflags |= NOTE_READCLOSED;
6064 level_trigger |= NOTE_READCLOSED;
6065 }
6066
6067 if (so->so_state & SS_CANTSENDMORE) {
6068 kn->kn_fflags |= NOTE_WRITECLOSED;
6069 level_trigger |= NOTE_WRITECLOSED;
6070 }
6071
6072 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6073 (so->so_flags & SOF_SUSPENDED)) {
6074 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6075
6076 /* If resume event was delivered before, reset it */
6077 kn->kn_hookid &= ~NOTE_RESUME;
6078
6079 kn->kn_fflags |= NOTE_SUSPEND;
6080 level_trigger |= NOTE_SUSPEND;
6081 }
6082
6083 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6084 (so->so_flags & SOF_SUSPENDED) == 0) {
6085 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6086
6087 /* If suspend event was delivered before, reset it */
6088 kn->kn_hookid &= ~NOTE_SUSPEND;
6089
6090 kn->kn_fflags |= NOTE_RESUME;
6091 level_trigger |= NOTE_RESUME;
6092 }
6093
6094 if (so->so_error != 0) {
6095 ret = 1;
6096 kn->kn_data = so->so_error;
6097 kn->kn_flags |= EV_EOF;
6098 } else {
6099 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6100 }
6101
6102 /* Reset any events that are not requested on this knote */
6103 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6104 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6105
6106 /* Find the level triggered events that are already delivered */
6107 level_trigger &= kn->kn_hookid;
6108 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6109
6110 /* Do not deliver level triggered events more than once */
6111 if ((kn->kn_fflags & ~level_trigger) != 0)
6112 ret = 1;
6113
6114 if (locked)
6115 socket_unlock(so, 1);
6116
6117 return (ret);
6118 }
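/*
 * Illustrative sketch, not part of this file: a client of this socket
 * event filter (EVFILT_SOCK) registers for a subset of the NOTE_* events
 * handled above, for example
 *
 *	struct kevent kev;
 *	EV_SET(&kev, sock_fd, EVFILT_SOCK, EV_ADD | EV_CLEAR,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNRESET, 0, NULL);
 *	kevent(kq_fd, &kev, 1, NULL, 0, NULL);
 *
 * kn_sfflags then masks kn_fflags as shown above, and the level-triggered
 * events are reported at most once per transition thanks to the kn_hookid
 * bookkeeping in filt_socktouch() below. sock_fd and kq_fd are
 * placeholders.
 */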
6119
6120 static void
6121 filt_socktouch(struct knote *kn, struct kevent_internal_s *kev, long type)
6122 {
6123 #pragma unused(kev)
6124 switch (type) {
6125 case EVENT_REGISTER:
6126 {
6127 uint32_t changed_flags;
6128 changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6129
6130 /*
6131 * Since we keep track of events that are already
6132 * delivered, if any of those events are not requested
6133 * anymore, the state related to them can be reset
6134 */
6135 kn->kn_hookid &=
6136 ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6137 break;
6138 }
6139 case EVENT_PROCESS:
6140 /*
6141 * Store the state of the events being delivered. This
6142 * state can be used to deliver level triggered events
6143 * at least once and still avoid waking up the application
6144 * multiple times as long as the event is active.
6145 */
6146 if (kn->kn_fflags != 0)
6147 kn->kn_hookid |= (kn->kn_fflags &
6148 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6149
6150 /*
6151 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
6152 * only one of them, and remember which one was
6153 * delivered last
6154 */
6155 if (kn->kn_fflags & NOTE_SUSPEND)
6156 kn->kn_hookid &= ~NOTE_RESUME;
6157 if (kn->kn_fflags & NOTE_RESUME)
6158 kn->kn_hookid &= ~NOTE_SUSPEND;
6159 break;
6160 default:
6161 break;
6162 }
6163 }
6164
6165 void
6166 get_sockev_state(struct socket *so, u_int32_t *statep)
6167 {
6168 u_int32_t state = *(statep);
6169
6170 if (so->so_state & SS_ISCONNECTED)
6171 state |= SOCKEV_CONNECTED;
6172 else
6173 state &= ~(SOCKEV_CONNECTED);
6174 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6175 *(statep) = state;
6176 }
6177
6178 #define SO_LOCK_HISTORY_STR_LEN \
6179 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6180
6181 __private_extern__ const char *
6182 solockhistory_nr(struct socket *so)
6183 {
6184 size_t n = 0;
6185 int i;
6186 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6187
6188 bzero(lock_history_str, sizeof (lock_history_str));
6189 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6190 n += snprintf(lock_history_str + n,
6191 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6192 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6193 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6194 }
6195 return (lock_history_str);
6196 }
6197
6198 int
6199 socket_lock(struct socket *so, int refcount)
6200 {
6201 int error = 0;
6202 void *lr_saved;
6203
6204 lr_saved = __builtin_return_address(0);
6205
6206 if (so->so_proto->pr_lock) {
6207 error = (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6208 } else {
6209 #ifdef MORE_LOCKING_DEBUG
6210 lck_mtx_assert(so->so_proto->pr_domain->dom_mtx,
6211 LCK_MTX_ASSERT_NOTOWNED);
6212 #endif
6213 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6214 if (refcount)
6215 so->so_usecount++;
6216 so->lock_lr[so->next_lock_lr] = lr_saved;
6217 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6218 }
6219
6220 return (error);
6221 }
6222
6223 int
6224 socket_unlock(struct socket *so, int refcount)
6225 {
6226 int error = 0;
6227 void *lr_saved;
6228 lck_mtx_t *mutex_held;
6229
6230 lr_saved = __builtin_return_address(0);
6231
6232 if (so->so_proto == NULL) {
6233 panic("%s: null so_proto so=%p\n", __func__, so);
6234 /* NOTREACHED */
6235 }
6236
6237 if (so->so_proto->pr_unlock) {
6238 error = (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6239 } else {
6240 mutex_held = so->so_proto->pr_domain->dom_mtx;
6241 #ifdef MORE_LOCKING_DEBUG
6242 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6243 #endif
6244 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6245 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6246
6247 if (refcount) {
6248 if (so->so_usecount <= 0) {
6249 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6250 "lrh=%s", __func__, so->so_usecount, so,
6251 SOCK_DOM(so), so->so_type,
6252 SOCK_PROTO(so), solockhistory_nr(so));
6253 /* NOTREACHED */
6254 }
6255
6256 so->so_usecount--;
6257 if (so->so_usecount == 0)
6258 sofreelastref(so, 1);
6259 }
6260 lck_mtx_unlock(mutex_held);
6261 }
6262
6263 return (error);
6264 }
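/*
 * Usage sketch for the pair above, derived from the code itself rather
 * than any additional contract: callers that need the socket to survive
 * the critical section pass refcount == 1 so that so_usecount is bumped
 * on lock and dropped on unlock, with sofreelastref() run when the last
 * use count goes away:
 *
 *	socket_lock(so, 1);
 *	... operate on so ...
 *	socket_unlock(so, 1);
 *
 * Passing refcount == 0 takes or releases only the lock, as
 * soreference() and sodereference() below illustrate.
 */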
6265
6266 /* Called with socket locked, will unlock socket */
6267 void
6268 sofree(struct socket *so)
6269 {
6270 lck_mtx_t *mutex_held;
6271
6272 if (so->so_proto->pr_getlock != NULL)
6273 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6274 else
6275 mutex_held = so->so_proto->pr_domain->dom_mtx;
6276 lck_mtx_assert(mutex_held, LCK_MTX_ASSERT_OWNED);
6277
6278 sofreelastref(so, 0);
6279 }
6280
6281 void
6282 soreference(struct socket *so)
6283 {
6284 socket_lock(so, 1); /* lock socket and take one reference */
6285 socket_unlock(so, 0); /* unlock only */
6286 }
6287
6288 void
6289 sodereference(struct socket *so)
6290 {
6291 socket_lock(so, 0);
6292 socket_unlock(so, 1);
6293 }
6294
6295 /*
6296 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6297 * possibility of using jumbo clusters. The caller must hold
6298 * the socket lock.
6299 */
6300 void
6301 somultipages(struct socket *so, boolean_t set)
6302 {
6303 if (set)
6304 so->so_flags |= SOF_MULTIPAGES;
6305 else
6306 so->so_flags &= ~SOF_MULTIPAGES;
6307 }
6308
6309 void
6310 soif2kcl(struct socket *so, boolean_t set)
6311 {
6312 if (set)
6313 so->so_flags1 |= SOF1_IF_2KCL;
6314 else
6315 so->so_flags1 &= ~SOF1_IF_2KCL;
6316 }
6317
6318 int
6319 so_isdstlocal(struct socket *so)
6320 {
6321 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6322
6323 if (SOCK_DOM(so) == PF_INET)
6324 return (inaddr_local(inp->inp_faddr));
6325 else if (SOCK_DOM(so) == PF_INET6)
6326 return (in6addr_local(&inp->in6p_faddr));
6327
6328 return (0);
6329 }
6330
6331 int
6332 sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
6333 {
6334 struct sockbuf *rcv, *snd;
6335 int err = 0, defunct;
6336
6337 rcv = &so->so_rcv;
6338 snd = &so->so_snd;
6339
6340 defunct = (so->so_flags & SOF_DEFUNCT);
6341 if (defunct) {
6342 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6343 panic("%s: SB_DROP not set", __func__);
6344 /* NOTREACHED */
6345 }
6346 goto done;
6347 }
6348
6349 if (so->so_flags & SOF_NODEFUNCT) {
6350 if (noforce) {
6351 err = EOPNOTSUPP;
6352 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
6353 "so 0x%llx [%d,%d] is not eligible for defunct "
6354 "(%d)\n", __func__, proc_selfpid(), proc_pid(p),
6355 level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6356 SOCK_DOM(so), SOCK_TYPE(so), err));
6357 return (err);
6358 }
6359 so->so_flags &= ~SOF_NODEFUNCT;
6360 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
6361 "[%d,%d] defunct by force\n", __func__, proc_selfpid(),
6362 proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6363 SOCK_DOM(so), SOCK_TYPE(so)));
6364 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6365 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6366 struct ifnet *ifp = inp->inp_last_outifp;
6367
6368 if (ifp && IFNET_IS_CELLULAR(ifp)) {
6369 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6370 } else if (so->so_flags & SOF_DELEGATED) {
6371 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6372 } else if (soextbkidlestat.so_xbkidle_time == 0) {
6373 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6374 } else if (noforce) {
6375 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6376
6377 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
6378 so->so_extended_bk_start = net_uptime();
6379 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6380
6381 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6382
6383 err = EOPNOTSUPP;
6384 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) "
6385 "extend bk idle "
6386 "so 0x%llx rcv hw %d cc %d\n",
6387 __func__, proc_selfpid(), proc_pid(p),
6388 level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6389 so->so_rcv.sb_hiwat, so->so_rcv.sb_cc));
6390 return (err);
6391 } else {
6392 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6393 }
6394 }
6395
6396 so->so_flags |= SOF_DEFUNCT;
6397
6398 /* Prevent further data from being appended to the socket buffers */
6399 snd->sb_flags |= SB_DROP;
6400 rcv->sb_flags |= SB_DROP;
6401
6402 /* Flush any existing data in the socket buffers */
6403 if (rcv->sb_cc != 0) {
6404 rcv->sb_flags &= ~SB_SEL;
6405 selthreadclear(&rcv->sb_sel);
6406 sbrelease(rcv);
6407 }
6408 if (snd->sb_cc != 0) {
6409 snd->sb_flags &= ~SB_SEL;
6410 selthreadclear(&snd->sb_sel);
6411 sbrelease(snd);
6412 }
6413
6414 done:
6415 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%d,%d] %s "
6416 "defunct%s\n", __func__, proc_selfpid(), proc_pid(p), level,
6417 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), SOCK_TYPE(so),
6418 defunct ? "is already" : "marked as",
6419 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ? " extbkidle" : ""));
6420
6421 return (err);
6422 }
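/*
 * Descriptive note on the defunct flow: sosetdefunct() above only decides
 * whether the socket may be defuncted and marks it (SOF_DEFUNCT plus
 * SB_DROP on both buffers), while sodefunct() below performs the actual
 * teardown (shutdown of both directions, disconnect, buffer release and
 * SS_DEFUNCT). Callers run them back to back, as
 * so_stop_extended_bk_idle() does later in this file:
 *
 *	sosetdefunct(current_proc(), so,
 *	    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
 *	if (so->so_flags & SOF_DEFUNCT)
 *		sodefunct(current_proc(), so,
 *		    SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
 */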
6423
6424 int
6425 sodefunct(struct proc *p, struct socket *so, int level)
6426 {
6427 struct sockbuf *rcv, *snd;
6428
6429 if (!(so->so_flags & SOF_DEFUNCT)) {
6430 panic("%s improperly called", __func__);
6431 /* NOTREACHED */
6432 }
6433 if (so->so_state & SS_DEFUNCT)
6434 goto done;
6435
6436 rcv = &so->so_rcv;
6437 snd = &so->so_snd;
6438
6439 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6440 char s[MAX_IPv6_STR_LEN];
6441 char d[MAX_IPv6_STR_LEN];
6442 struct inpcb *inp = sotoinpcb(so);
6443
6444 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx [%s "
6445 "%s:%d -> %s:%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
6446 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
6447 proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6448 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6449 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
6450 (void *)&inp->inp_laddr.s_addr : (void *)&inp->in6p_laddr),
6451 s, sizeof (s)), ntohs(inp->in6p_lport),
6452 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
6453 (void *)&inp->inp_faddr.s_addr : (void *)&inp->in6p_faddr,
6454 d, sizeof (d)), ntohs(inp->in6p_fport),
6455 (uint32_t)rcv->sb_sel.si_flags,
6456 (uint32_t)snd->sb_sel.si_flags,
6457 rcv->sb_flags, snd->sb_flags));
6458 } else {
6459 SODEFUNCTLOG(("%s[%d]: (target pid %d level %d) so 0x%llx "
6460 "[%d,%d] is now defunct [rcv_si 0x%x, snd_si 0x%x, "
6461 "rcv_fl 0x%x, snd_fl 0x%x]\n", __func__, proc_selfpid(),
6462 proc_pid(p), level, (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6463 SOCK_DOM(so), SOCK_TYPE(so), (uint32_t)rcv->sb_sel.si_flags,
6464 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
6465 snd->sb_flags));
6466 }
6467
6468 /*
6469 * Unwedge threads blocked on sbwait() and sb_lock().
6470 */
6471 sbwakeup(rcv);
6472 sbwakeup(snd);
6473
6474 so->so_flags1 |= SOF1_DEFUNCTINPROG;
6475 if (rcv->sb_flags & SB_LOCK)
6476 sbunlock(rcv, TRUE); /* keep socket locked */
6477 if (snd->sb_flags & SB_LOCK)
6478 sbunlock(snd, TRUE); /* keep socket locked */
6479
6480 /*
6481 * Flush the buffers and disconnect. We explicitly call shutdown
6482 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
6483 * states are set for the socket. This would also flush out data
6484 * hanging off the receive list of this socket.
6485 */
6486 (void) soshutdownlock_final(so, SHUT_RD);
6487 (void) soshutdownlock_final(so, SHUT_WR);
6488 (void) sodisconnectlocked(so);
6489
6490 /*
6491 * Explicitly handle connectionless-protocol disconnection
6492 * and release any remaining data in the socket buffers.
6493 */
6494 if (!(so->so_state & SS_ISDISCONNECTED))
6495 (void) soisdisconnected(so);
6496
6497 if (so->so_error == 0)
6498 so->so_error = EBADF;
6499
6500 if (rcv->sb_cc != 0) {
6501 rcv->sb_flags &= ~SB_SEL;
6502 selthreadclear(&rcv->sb_sel);
6503 sbrelease(rcv);
6504 }
6505 if (snd->sb_cc != 0) {
6506 snd->sb_flags &= ~SB_SEL;
6507 selthreadclear(&snd->sb_sel);
6508 sbrelease(snd);
6509 }
6510 so->so_state |= SS_DEFUNCT;
6511
6512 done:
6513 return (0);
6514 }
6515
6516 int
6517 soresume(struct proc *p, struct socket *so, int locked)
6518 {
6519 if (locked == 0)
6520 socket_lock(so, 1);
6521
6522 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
6523 SODEFUNCTLOG(("%s[%d]: (target pid %d) so 0x%llx [%d,%d] "
6524 "resumed from bk idle\n",
6525 __func__, proc_selfpid(), proc_pid(p),
6526 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6527 SOCK_DOM(so), SOCK_TYPE(so)));
6528
6529 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6530 so->so_extended_bk_start = 0;
6531 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6532
6533 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
6534 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6535 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6536 }
6537 if (locked == 0)
6538 socket_unlock(so, 1);
6539
6540 return (0);
6541 }
6542
6543 /*
6544 * Does not attempt to account for sockets that are delegated from
6545 * the current process
6546 */
6547 int
6548 so_set_extended_bk_idle(struct socket *so, int optval)
6549 {
6550 int error = 0;
6551
6552 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
6553 SOCK_PROTO(so) != IPPROTO_TCP) {
6554 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
6555 error = EOPNOTSUPP;
6556 } else if (optval == 0) {
6557 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
6558
6559 soresume(current_proc(), so, 1);
6560 } else {
6561 struct proc *p = current_proc();
6562 int i;
6563 struct filedesc *fdp;
6564 int count = 0;
6565
6566 proc_fdlock(p);
6567
6568 fdp = p->p_fd;
6569 for (i = 0; i < fdp->fd_nfiles; i++) {
6570 struct fileproc *fp = fdp->fd_ofiles[i];
6571 struct socket *so2;
6572
6573 if (fp == NULL ||
6574 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
6575 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
6576 continue;
6577
6578 so2 = (struct socket *)fp->f_fglob->fg_data;
6579 if (so != so2 &&
6580 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
6581 count++;
6582 if (count >= soextbkidlestat.so_xbkidle_maxperproc)
6583 break;
6584 }
6585 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
6586 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
6587 error = EBUSY;
6588 } else if (so->so_flags & SOF_DELEGATED) {
6589 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6590 error = EBUSY;
6591 } else {
6592 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
6593 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
6594 }
6595 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d] "
6596 "%s marked for extended bk idle\n",
6597 __func__, proc_selfpid(),
6598 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6599 SOCK_DOM(so), SOCK_TYPE(so),
6600 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
6601 "is" : "not"));
6602
6603 proc_fdunlock(p);
6604 }
6605
6606 return (error);
6607 }
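/*
 * Illustrative sketch; the SO_EXTENDED_BK_IDLE option name is an
 * assumption and not taken from this section. An application would
 * typically reach so_set_extended_bk_idle() through a SOL_SOCKET-level
 * setsockopt such as
 *
 *	int on = 1;
 *	setsockopt(sock_fd, SOL_SOCKET, SO_EXTENDED_BK_IDLE,
 *	    &on, sizeof (on));
 *
 * Per the checks above, only TCP sockets over PF_INET/PF_INET6 qualify,
 * at most so_xbkidle_maxperproc sockets per process may be marked, and
 * delegated sockets are refused. sock_fd is a placeholder.
 */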
6608
6609 static void
6610 so_stop_extended_bk_idle(struct socket *so)
6611 {
6612 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
6613 so->so_extended_bk_start = 0;
6614
6615 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
6616 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
6617 /*
6618 * Force defunct
6619 */
6620 sosetdefunct(current_proc(), so,
6621 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
6622 if (so->so_flags & SOF_DEFUNCT) {
6623 sodefunct(current_proc(), so,
6624 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
6625 }
6626 }
6627
6628 void
6629 so_drain_extended_bk_idle(struct socket *so)
6630 {
6631 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6632 /*
6633 * Only penalize sockets that have outstanding data
6634 */
6635 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
6636 so_stop_extended_bk_idle(so);
6637
6638 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
6639 }
6640 }
6641 }
6642
6643 /*
6644 * Return value tells whether the socket is still in extended background idle
6645 */
6646 int
6647 so_check_extended_bk_idle_time(struct socket *so)
6648 {
6649 int ret = 1;
6650
6651 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
6652 SODEFUNCTLOG(("%s[%d]: so 0x%llx [%d,%d]\n",
6653 __func__, proc_selfpid(),
6654 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6655 SOCK_DOM(so), SOCK_TYPE(so)));
6656 if (net_uptime() - so->so_extended_bk_start >
6657 soextbkidlestat.so_xbkidle_time) {
6658 so_stop_extended_bk_idle(so);
6659
6660 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
6661
6662 ret = 0;
6663 } else {
6664 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6665
6666 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6667 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
6668 }
6669 }
6670
6671 return (ret);
6672 }
6673
6674 void
6675 resume_proc_sockets(proc_t p)
6676 {
6677 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
6678 struct filedesc *fdp;
6679 int i;
6680
6681 proc_fdlock(p);
6682 fdp = p->p_fd;
6683 for (i = 0; i < fdp->fd_nfiles; i++) {
6684 struct fileproc *fp;
6685 struct socket *so;
6686
6687 fp = fdp->fd_ofiles[i];
6688 if (fp == NULL ||
6689 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
6690 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
6691 continue;
6692
6693 so = (struct socket *)fp->f_fglob->fg_data;
6694 (void) soresume(p, so, 0);
6695 }
6696 proc_fdunlock(p);
6697
6698 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
6699 }
6700 }
6701
6702 __private_extern__ int
6703 so_set_recv_anyif(struct socket *so, int optval)
6704 {
6705 int ret = 0;
6706
6707 #if INET6
6708 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6709 #else
6710 if (SOCK_DOM(so) == PF_INET) {
6711 #endif /* !INET6 */
6712 if (optval)
6713 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
6714 else
6715 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
6716 }
6717
6718 return (ret);
6719 }
6720
6721 __private_extern__ int
6722 so_get_recv_anyif(struct socket *so)
6723 {
6724 int ret = 0;
6725
6726 #if INET6
6727 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6728 #else
6729 if (SOCK_DOM(so) == PF_INET) {
6730 #endif /* !INET6 */
6731 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
6732 }
6733
6734 return (ret);
6735 }
6736
6737 int
6738 so_set_restrictions(struct socket *so, uint32_t vals)
6739 {
6740 int nocell_old, nocell_new;
6741 int noexpensive_old, noexpensive_new;
6742
6743 /*
6744 * Deny-type restrictions are trapdoors; once set they cannot be
6745 * unset for the lifetime of the socket. This allows them to be
6746 * issued by a framework on behalf of the application without
6747 * having to worry that they can be undone.
6748 *
6749 * Note here that socket-level restrictions override any protocol-
6750 * level restrictions. For instance, a SO_RESTRICT_DENY_CELLULAR
6751 * restriction issued on the socket takes precedence over
6752 * INP_NO_IFT_CELLULAR. The latter is affected by the UUID
6753 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
6754 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
6755 */
6756 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6757 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6758 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
6759 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
6760 SO_RESTRICT_DENY_EXPENSIVE));
6761 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
6762 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
6763
6764 /* we can only set, not clear restrictions */
6765 if ((nocell_new - nocell_old) == 0 &&
6766 (noexpensive_new - noexpensive_old) == 0)
6767 return (0);
6768 #if INET6
6769 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6770 #else
6771 if (SOCK_DOM(so) == PF_INET) {
6772 #endif /* !INET6 */
6773 if (nocell_new - nocell_old != 0) {
6774 /*
6775 * if deny cellular is now set, do what's needed
6776 * for INPCB
6777 */
6778 inp_set_nocellular(sotoinpcb(so));
6779 }
6780 if (noexpensive_new - noexpensive_old != 0) {
6781 inp_set_noexpensive(sotoinpcb(so));
6782 }
6783 }
6784
6785 return (0);
6786 }
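/*
 * Illustrative sketch; the SO_RESTRICTIONS option name is an assumption,
 * while the restriction bits are the ones handled above. A framework
 * acting on behalf of an application might pin a socket off cellular and
 * expensive interfaces with something like
 *
 *	uint32_t restrictions =
 *	    SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE;
 *	setsockopt(sock_fd, SOL_SOCKET, SO_RESTRICTIONS,
 *	    &restrictions, sizeof (restrictions));
 *
 * Because deny-type restrictions are trapdoors, later calls can add bits
 * but never clear them for the lifetime of the socket. sock_fd is a
 * placeholder.
 */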
6787
6788 uint32_t
6789 so_get_restrictions(struct socket *so)
6790 {
6791 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
6792 SO_RESTRICT_DENY_OUT |
6793 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
6794 }
6795
6796 struct sockaddr_entry *
6797 sockaddrentry_alloc(int how)
6798 {
6799 struct sockaddr_entry *se;
6800
6801 se = (how == M_WAITOK) ? zalloc(se_zone) : zalloc_noblock(se_zone);
6802 if (se != NULL)
6803 bzero(se, se_zone_size);
6804
6805 return (se);
6806 }
6807
6808 void
6809 sockaddrentry_free(struct sockaddr_entry *se)
6810 {
6811 if (se->se_addr != NULL) {
6812 FREE(se->se_addr, M_SONAME);
6813 se->se_addr = NULL;
6814 }
6815 zfree(se_zone, se);
6816 }
6817
6818 struct sockaddr_entry *
6819 sockaddrentry_dup(const struct sockaddr_entry *src_se, int how)
6820 {
6821 struct sockaddr_entry *dst_se;
6822
6823 dst_se = sockaddrentry_alloc(how);
6824 if (dst_se != NULL) {
6825 int len = src_se->se_addr->sa_len;
6826
6827 MALLOC(dst_se->se_addr, struct sockaddr *,
6828 len, M_SONAME, how | M_ZERO);
6829 if (dst_se->se_addr != NULL) {
6830 bcopy(src_se->se_addr, dst_se->se_addr, len);
6831 } else {
6832 sockaddrentry_free(dst_se);
6833 dst_se = NULL;
6834 }
6835 }
6836
6837 return (dst_se);
6838 }
6839
6840 struct sockaddr_list *
6841 sockaddrlist_alloc(int how)
6842 {
6843 struct sockaddr_list *sl;
6844
6845 sl = (how == M_WAITOK) ? zalloc(sl_zone) : zalloc_noblock(sl_zone);
6846 if (sl != NULL) {
6847 bzero(sl, sl_zone_size);
6848 TAILQ_INIT(&sl->sl_head);
6849 }
6850 return (sl);
6851 }
6852
6853 void
6854 sockaddrlist_free(struct sockaddr_list *sl)
6855 {
6856 struct sockaddr_entry *se, *tse;
6857
6858 TAILQ_FOREACH_SAFE(se, &sl->sl_head, se_link, tse) {
6859 sockaddrlist_remove(sl, se);
6860 sockaddrentry_free(se);
6861 }
6862 VERIFY(sl->sl_cnt == 0 && TAILQ_EMPTY(&sl->sl_head));
6863 zfree(sl_zone, sl);
6864 }
6865
6866 void
6867 sockaddrlist_insert(struct sockaddr_list *sl, struct sockaddr_entry *se)
6868 {
6869 VERIFY(!(se->se_flags & SEF_ATTACHED));
6870 se->se_flags |= SEF_ATTACHED;
6871 TAILQ_INSERT_TAIL(&sl->sl_head, se, se_link);
6872 sl->sl_cnt++;
6873 VERIFY(sl->sl_cnt != 0);
6874 }
6875
6876 void
6877 sockaddrlist_remove(struct sockaddr_list *sl, struct sockaddr_entry *se)
6878 {
6879 VERIFY(se->se_flags & SEF_ATTACHED);
6880 se->se_flags &= ~SEF_ATTACHED;
6881 VERIFY(sl->sl_cnt != 0);
6882 sl->sl_cnt--;
6883 TAILQ_REMOVE(&sl->sl_head, se, se_link);
6884 }
6885
6886 struct sockaddr_list *
6887 sockaddrlist_dup(const struct sockaddr_list *src_sl, int how)
6888 {
6889 struct sockaddr_entry *src_se, *tse;
6890 struct sockaddr_list *dst_sl;
6891
6892 dst_sl = sockaddrlist_alloc(how);
6893 if (dst_sl == NULL)
6894 return (NULL);
6895
6896 TAILQ_FOREACH_SAFE(src_se, &src_sl->sl_head, se_link, tse) {
6897 struct sockaddr_entry *dst_se;
6898
6899 if (src_se->se_addr == NULL)
6900 continue;
6901
6902 dst_se = sockaddrentry_dup(src_se, how);
6903 if (dst_se == NULL) {
6904 sockaddrlist_free(dst_sl);
6905 return (NULL);
6906 }
6907
6908 sockaddrlist_insert(dst_sl, dst_se);
6909 }
6910 VERIFY(src_sl->sl_cnt == dst_sl->sl_cnt);
6911
6912 return (dst_sl);
6913 }
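/*
 * Usage sketch for the sockaddr_list helpers above, derived from the
 * functions themselves:
 *
 *	struct sockaddr_list *sl = sockaddrlist_alloc(M_WAITOK);
 *	struct sockaddr_entry *se = sockaddrentry_alloc(M_WAITOK);
 *	... MALLOC and fill se->se_addr (M_SONAME memory, as
 *	    sockaddrentry_free() and sockaddrentry_dup() expect) ...
 *	sockaddrlist_insert(sl, se);
 *	struct sockaddr_list *copy = sockaddrlist_dup(sl, M_WAITOK);
 *	sockaddrlist_free(copy);
 *	sockaddrlist_free(sl);
 *
 * sockaddrlist_free() also frees each entry and its se_addr, and entries
 * whose se_addr is NULL are skipped by sockaddrlist_dup().
 */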
6914
6915 int
6916 so_set_effective_pid(struct socket *so, int epid, struct proc *p)
6917 {
6918 struct proc *ep = PROC_NULL;
6919 int error = 0;
6920
6921 /* pid 0 is reserved for kernel */
6922 if (epid == 0) {
6923 error = EINVAL;
6924 goto done;
6925 }
6926
6927 /*
6928 * If this is an in-kernel socket, prevent its delegate
6929 * association from changing unless the socket option is
6930 * coming from within the kernel itself.
6931 */
6932 if (so->last_pid == 0 && p != kernproc) {
6933 error = EACCES;
6934 goto done;
6935 }
6936
6937 /*
6938 * If this is issued by a process that's recorded as the
6939 * real owner of the socket, or if the pid is the same as
6940 * the process's own pid, then proceed. Otherwise ensure
6941 * that the issuing process has the necessary privileges.
6942 */
6943 if (epid != so->last_pid || epid != proc_pid(p)) {
6944 if ((error = priv_check_cred(kauth_cred_get(),
6945 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
6946 error = EACCES;
6947 goto done;
6948 }
6949 }
6950
6951 /* Find the process that corresponds to the effective pid */
6952 if ((ep = proc_find(epid)) == PROC_NULL) {
6953 error = ESRCH;
6954 goto done;
6955 }
6956
6957 /*
6958 * If a process tries to delegate the socket to itself, then
6959 * there's really nothing to do; treat it as a way for the
6960 * delegate association to be cleared. Note that we check
6961 * the passed-in proc rather than calling proc_selfpid(),
6962 * as we need to check the process issuing the socket option
6963 * which could be kernproc. Given that we don't allow 0 for
6964 * effective pid, it means that a delegated in-kernel socket
6965 * stays delegated during its lifetime (which is probably OK.)
6966 */
6967 if (epid == proc_pid(p)) {
6968 so->so_flags &= ~SOF_DELEGATED;
6969 so->e_upid = 0;
6970 so->e_pid = 0;
6971 uuid_clear(so->e_uuid);
6972 } else {
6973 so->so_flags |= SOF_DELEGATED;
6974 so->e_upid = proc_uniqueid(ep);
6975 so->e_pid = proc_pid(ep);
6976 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
6977 }
6978 done:
6979 if (error == 0 && net_io_policy_log) {
6980 uuid_string_t buf;
6981
6982 uuid_unparse(so->e_uuid, buf);
6983 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6984 "euuid %s%s\n", __func__, proc_name_address(p),
6985 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6986 SOCK_DOM(so), SOCK_TYPE(so),
6987 so->e_pid, proc_name_address(ep), buf,
6988 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
6989 } else if (error != 0 && net_io_policy_log) {
6990 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
6991 "ERROR (%d)\n", __func__, proc_name_address(p),
6992 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6993 SOCK_DOM(so), SOCK_TYPE(so),
6994 epid, (ep == PROC_NULL) ? "PROC_NULL" :
6995 proc_name_address(ep), error);
6996 }
6997
6998 /* Update this socket's policy upon success */
6999 if (error == 0) {
7000 so->so_policy_gencnt *= -1;
7001 so_update_policy(so);
7002 #if NECP
7003 so_update_necp_policy(so, NULL, NULL);
7004 #endif /* NECP */
7005 }
7006
7007 if (ep != PROC_NULL)
7008 proc_rele(ep);
7009
7010 return (error);
7011 }
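/*
 * Illustrative sketch; the SO_DELEGATED option name is an assumption,
 * while the privilege and behavior are those implemented above. A broker
 * process holding PRIV_NET_PRIVILEGED_SOCKET_DELEGATE can attribute
 * traffic on a socket it owns to another process by pid:
 *
 *	pid_t epid = client_pid;	// client_pid is a placeholder
 *	setsockopt(sock_fd, SOL_SOCKET, SO_DELEGATED,
 *	    &epid, sizeof (epid));
 *
 * Passing its own pid instead clears the delegation (SOF_DELEGATED and
 * the e_pid/e_upid/e_uuid fields are reset), and the socket's policy/NECP
 * state is re-evaluated on success.
 */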
7012
7013 int
7014 so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7015 {
7016 uuid_string_t buf;
7017 uuid_t uuid;
7018 int error = 0;
7019
7020 /* UUID must not be all-zeroes (reserved for kernel) */
7021 if (uuid_is_null(euuid)) {
7022 error = EINVAL;
7023 goto done;
7024 }
7025
7026 /*
7027 * If this is an in-kernel socket, prevent its delegate
7028 * association from changing unless the socket option is
7029 * coming from within the kernel itself.
7030 */
7031 if (so->last_pid == 0 && p != kernproc) {
7032 error = EACCES;
7033 goto done;
7034 }
7035
7036 /* Get the UUID of the issuing process */
7037 proc_getexecutableuuid(p, uuid, sizeof (uuid));
7038
7039 /*
7040 * If this is issued by a process that's recorded as the
7041 * real owner of the socket, or if the uuid is the same as
7042 * the process's own uuid, then proceed. Otherwise ensure
7043 * that the issuing process has the necessary privileges.
7044 */
7045 if (uuid_compare(euuid, so->last_uuid) != 0 ||
7046 uuid_compare(euuid, uuid) != 0) {
7047 if ((error = priv_check_cred(kauth_cred_get(),
7048 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7049 error = EACCES;
7050 goto done;
7051 }
7052 }
7053
7054 /*
7055 * If a process tries to delegate the socket to itself, then
7056 * there's really nothing to do; treat it as a way for the
7057 * delegate association to be cleared. Note that we check
7058 * the uuid of the passed-in proc rather than that of the
7059 * current process, as we need to check the process issuing
7060 * the socket option which could be kernproc itself. Given
7061 * that we don't allow 0 for effective uuid, it means that
7062 * a delegated in-kernel socket stays delegated during its
7063 * lifetime (which is okay.)
7064 */
7065 if (uuid_compare(euuid, uuid) == 0) {
7066 so->so_flags &= ~SOF_DELEGATED;
7067 so->e_upid = 0;
7068 so->e_pid = 0;
7069 uuid_clear(so->e_uuid);
7070 } else {
7071 so->so_flags |= SOF_DELEGATED;
7072 /*
7073 * Unlike so_set_effective_pid(), we only have the UUID
7074 * here and the process ID is not known. Inherit the
7075 * real {pid,upid} of the socket.
7076 */
7077 so->e_upid = so->last_upid;
7078 so->e_pid = so->last_pid;
7079 uuid_copy(so->e_uuid, euuid);
7080 }
7081
7082 done:
7083 if (error == 0 && net_io_policy_log) {
7084 uuid_unparse(so->e_uuid, buf);
7085 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7086 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7087 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7088 SOCK_TYPE(so), so->e_pid, buf,
7089 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7090 } else if (error != 0 && net_io_policy_log) {
7091 uuid_unparse(euuid, buf);
7092 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7093 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7094 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7095 SOCK_TYPE(so), buf, error);
7096 }
7097
7098 /* Update this socket's policy upon success */
7099 if (error == 0) {
7100 so->so_policy_gencnt *= -1;
7101 so_update_policy(so);
7102 #if NECP
7103 so_update_necp_policy(so, NULL, NULL);
7104 #endif /* NECP */
7105 }
7106
7107 return (error);
7108 }
7109
7110 void
7111 netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7112 uint32_t ev_datalen)
7113 {
7114 struct kev_msg ev_msg;
7115
7116 /*
7117 * A netpolicy event always starts with a netpolicy_event_data
7118 * structure, but the caller can provide for a longer event
7119 * structure to post, depending on the event code.
7120 */
7121 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7122
7123 bzero(&ev_msg, sizeof (ev_msg));
7124 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7125 ev_msg.kev_class = KEV_NETWORK_CLASS;
7126 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7127 ev_msg.event_code = ev_code;
7128
7129 ev_msg.dv[0].data_ptr = ev_data;
7130 ev_msg.dv[0].data_length = ev_datalen;
7131
7132 kev_post_msg(&ev_msg);
7133 }
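/*
 * Shape sketch for callers of netpolicy_post_msg(); the event structure
 * below is hypothetical and only illustrates the layout the VERIFY()
 * above checks for:
 *
 *	struct my_netpolicy_event {			// hypothetical
 *		struct netpolicy_event_data ev_data;	// must come first
 *		uint32_t extra_field;
 *	} ev;
 *	bzero(&ev, sizeof (ev));
 *	... fill in ev ...
 *	netpolicy_post_msg(ev_code, &ev.ev_data, sizeof (ev));
 *
 * i.e. the caller passes a pointer to the leading netpolicy_event_data
 * and the length of the full enclosing structure.
 */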
7134
7135 void
7136 socket_post_kev_msg(uint32_t ev_code,
7137 struct kev_socket_event_data *ev_data,
7138 uint32_t ev_datalen)
7139 {
7140 struct kev_msg ev_msg;
7141
7142 bzero(&ev_msg, sizeof(ev_msg));
7143 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7144 ev_msg.kev_class = KEV_NETWORK_CLASS;
7145 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7146 ev_msg.event_code = ev_code;
7147
7148 ev_msg.dv[0].data_ptr = ev_data;
7149 ev_msg.dv[0].data_length = ev_datalen;
7150
7151 kev_post_msg(&ev_msg);
7152 }
7153
7154 void
7155 socket_post_kev_msg_closed(struct socket *so)
7156 {
7157 struct kev_socket_closed ev;
7158 struct sockaddr *socksa = NULL, *peersa = NULL;
7159 int err;
7160 bzero(&ev, sizeof(ev));
7161 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7162 if (err == 0) {
7163 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7164 &peersa);
7165 if (err == 0) {
7166 memcpy(&ev.ev_data.kev_sockname, socksa,
7167 min(socksa->sa_len,
7168 sizeof (ev.ev_data.kev_sockname)));
7169 memcpy(&ev.ev_data.kev_peername, peersa,
7170 min(peersa->sa_len,
7171 sizeof (ev.ev_data.kev_peername)));
7172 socket_post_kev_msg(KEV_SOCKET_CLOSED,
7173 &ev.ev_data, sizeof (ev));
7174 }
7175 }
7176 if (socksa != NULL)
7177 FREE(socksa, M_SONAME);
7178 if (peersa != NULL)
7179 FREE(peersa, M_SONAME);
7180 }